# In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Step 2: Load the Dataset
# Source: https://www.kaggle.com/datasets/yasserh/titanic-dataset
# Expected file: Titanic.csv (the relative path below is resolved against
# the current working directory).

df = pd.read_csv("Titanic.csv")
print("✅ Dataset Loaded Successfully\n")

# Orientation: dimensions, column labels, then a preview of the first rows.
print("Dataset Shape:", df.shape)
print("\nColumns:\n", df.columns)
print("\nFirst 5 Rows:")
display(df.head(5))

# Step 3: Check Basic Information
# Three read-only views of the loaded frame: schema, per-column null counts,
# and summary statistics for every column.

print("\n--- Dataset Info ---")
df.info()  # prints its own report, so it is not wrapped in print()

print("\n--- Missing Values ---")
print(df.isna().sum())

print("\n--- Statistical Summary ---")
display(df.describe(include="all"))

# Step 4: Handle Missing Values
#
# Numeric columns (Age, Fare) are filled with their median and the categorical
# Embarked column with its most frequent value. Only these three columns are
# imputed here; the null counts printed at the end show what remains elsewhere.
#
# The filled column is assigned back to df rather than calling
# df[col].fillna(..., inplace=True): that form is chained assignment through an
# inplace method. pandas 2.x emits a FutureWarning for it, and under
# Copy-on-Write (the default from pandas 3.0) it updates a temporary object
# and leaves df unchanged.

df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# mode() returns a Series (there can be ties); [0] takes its first entry.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

print("\n✅ Missing values handled successfully!")
print(df.isnull().sum())

# Step 5: Encode Categorical Variables
# One-hot encode Sex and Embarked. drop_first=True removes the first level of
# each encoded column, so k categories become k-1 indicator columns.

df = pd.get_dummies(data=df, columns=["Sex", "Embarked"], drop_first=True)

print("\n✅ Encoding completed successfully!")
display(df.head(5))

# Step 6: Normalize / Standardize Numerical Features
# Standardize the listed numeric columns in place on df; extend num_cols to
# scale additional columns.

num_cols = ['Age', 'Fare']
scaler = StandardScaler()

# Learn the scaling parameters from the columns, then apply them to the same data.
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

print("\n✅ Scaling completed successfully!")
display(df.head(5))

# Step 7: Detect & Remove Outliers
# Visual check first: one boxplot per numeric column, drawn on its own figure
# before any rows are filtered out.

for col in ('Age', 'Fare'):
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot Before Removing Outliers: {col}')
    plt.show()

def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``. Values equal to a fence are kept.

    Args:
        data: DataFrame to filter. It is not modified; a filtered frame is
            returned instead.
        column: Label of the numeric column to screen for outliers.
        factor: Fence width as a multiple of the IQR. Defaults to 1.5, the
            value that was previously hard-coded.

    Returns:
        The rows of ``data`` within the fences, keeping their original index
        labels. Rows where ``column`` is NaN are dropped, because NaN fails
        both comparisons.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - factor * iqr
    upper_limit = q3 + factor * iqr
    within_fences = (values >= lower_limit) & (values <= upper_limit)
    return data[within_fences]

# Filter Age first, then Fare. df is reassigned on each pass, so the Fare
# quartiles are computed on the rows that survived the Age filter.
for col in ('Age', 'Fare'):
    df = remove_outliers_iqr(df, col)

print("\n✅ Outliers removed successfully!")
print("New Shape after removing outliers:", df.shape)

# Redraw the same boxplots on the filtered frame for comparison.
for col in ('Age', 'Fare'):
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot After Removing Outliers: {col}')
    plt.show()

# Step 8: Final Cleaned Data Overview
# Report the schema and a preview of the processed frame, then persist it.

print("\n🎯 Final Cleaned Dataset Info:")
df.info()

print("\nSample of Final Cleaned Data:")
display(df.head(5))

# index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf="Cleaned_Titanic.csv", index=False)
print("\n💾 Cleaned dataset saved as 'Cleaned_Titanic.csv'")
