In [None]:
import pandas as pd

# --- Part 1: Load a Dataset & Check Missing Values ---

# Task 1: Titanic Dataset
# - Load the dataset using Pandas.
# Start from None so the `titanic_df is not None` guard further down can tell
# whether the load succeeded.
titanic_df = None
try:
    # Relative path: resolved against the current working directory.
    titanic_df = pd.read_csv('titanic.csv')
except FileNotFoundError:
    # Report the problem and fall through with titanic_df still None.
    # Do not call exit() here: it raises SystemExit (with a "success" status),
    # which makes the guard's else-branch unreachable when run as a script, and
    # the name is a `site` helper intended for interactive sessions only.
    print("Error: 'titanic.csv' not found. Please make sure the file is in the correct directory.")

# Everything below needs a loaded DataFrame; when loading failed the else
# branch at the bottom prints a notice instead.
if titanic_df is not None:
    # The DataFrame is only read, never modified, in this block. The missing
    # value counts and the duplicate-row selection are therefore computed once
    # (here and in Part 2) and reused by the Part 3 report.
    all_missing = titanic_df.isnull().sum()  # null count per column
    columns_with_missing = all_missing[all_missing > 0]

    # - Check for missing values in the 'Age' column.
    missing_age = titanic_df['Age'].isnull().sum()
    print("--- Part 1: Load a Dataset & Check Missing Values ---")
    print(f"Number of missing values in the 'Age' column: {missing_age}")

    # Every column that has at least one missing value, with its count.
    print("\nColumns with missing values:")
    if columns_with_missing.empty:
        print("No missing values found in any column.")
    else:
        print(columns_with_missing)

    # Display the first few rows to get a sense of the data.
    print("\nFirst 5 rows of the Titanic dataset:")
    print(titanic_df.head())

    # --- Part 2: Identify Duplicates & Inconsistencies ---

    # Task 2: Duplicate Rows in Titanic Dataset
    # - Identify any duplicate rows in the dataset.

    print("\n--- Part 2: Identify Duplicates & Inconsistencies ---")

    # Complete duplicates: rows equal to an earlier row in every column.
    # duplicated() keeps the first occurrence unmarked by default, so only the
    # repeats are selected here.
    duplicate_rows = titanic_df[titanic_df.duplicated()]

    if duplicate_rows.empty:
        print("No complete duplicate rows found in the Titanic dataset.")
    else:
        print(f"Number of complete duplicate rows: {len(duplicate_rows)}")
        print("Duplicate rows:")
        print(duplicate_rows)

    # 'PassengerId' should be unique. keep=False marks every row of a repeated
    # id (not just the later ones) so the conflicting rows are shown together.
    duplicate_passenger_id = titanic_df[titanic_df.duplicated(subset=['PassengerId'], keep=False)]
    if len(duplicate_passenger_id) > 0:
        print("\nWarning: Duplicate 'PassengerId' found (this should ideally not happen):")
        print(duplicate_passenger_id)
    else:
        print("\nNo duplicate 'PassengerId' found.")

    # Rows sharing the same Name, Age and Fare may be the same passenger
    # entered twice; sorting puts the members of each group next to each other.
    duplicate_subset = titanic_df[titanic_df.duplicated(subset=['Name', 'Age', 'Fare'], keep=False)]
    if len(duplicate_subset) > 0:
        print("\nPotential inconsistencies (duplicates based on Name, Age, Fare):")
        print(duplicate_subset.sort_values(by=['Name', 'Age', 'Fare']))
    else:
        print("\nNo potential inconsistencies found based on Name, Age, and Fare.")

    # --- Part 3: Generate a Data Quality Report ---

    # Task 3: Titanic Dataset Overview
    # - Create a simple report of missing values, duplicates, and some basic
    #   statistics for the Titanic dataset.

    print("\n--- Part 3: Generate a Data Quality Report ---")

    print("\n--- Missing Values Report ---")
    # Reuses columns_with_missing computed at the top of this block.
    if columns_with_missing.empty:
        print("No missing values found in the dataset.")
    else:
        print("Columns with missing values and their counts:")
        print(columns_with_missing)

    print("\n--- Duplicates Report ---")
    # Reuses duplicate_rows computed in Part 2.
    if duplicate_rows.empty:
        print("No complete duplicate rows found.")
    else:
        print(f"Number of complete duplicate rows: {len(duplicate_rows)}")
        print("Complete duplicate rows:")
        print(duplicate_rows)

    print("\n--- Basic Statistics for Numerical Columns ---")
    print(titanic_df.describe())

    print("\n--- Value Counts for Categorical Columns ---")
    # Columns stored with the 'object' dtype are treated as categorical here.
    categorical_cols = titanic_df.select_dtypes(include='object').columns
    for col in categorical_cols:
        print(f"\nValue counts for column '{col}':")
        # value_counts() leaves missing values out of the tally by default.
        print(titanic_df[col].value_counts())

    print("\n--- End of Data Quality Report ---")
else:
    # The dataset could not be loaded, so there is nothing to analyse.
    print("\nSkipping further analysis as the Titanic dataset was not loaded.")

Error: 'titanic.csv' not found. Please make sure the file is in the correct directory.

Skipping further analysis as the Titanic dataset was not loaded.


: 