In [None]:
# Titanic Dataset Preprocessing - Jupyter Notebook

In [None]:
# Cell 1: Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Cell 2: Load the dataset
# Location of the Titanic CSV that the next cell passes to pd.read_csv.
# NOTE(review): '/content/' looks like a Google Colab path — adjust it when
# running the notebook anywhere else.
file_path = '/content/Titanic-Dataset.csv'  # Change the path as needed

In [None]:
# Read the CSV into `df_encoded`. A missing file is reported and then re-raised:
# swallowing the error would leave `df_encoded` undefined and surface later as
# a confusing NameError in the cells that use it.
try:
    df_encoded = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    raise
else:
    # Only reached when read_csv succeeded.
    print("Dataset loaded successfully!")

In [None]:
# Cell 3: Display initial data info
# info() prints a summary of the frame's columns; head() is the last
# expression of the cell, so the notebook renders the first rows as output.
df_encoded.info()
df_encoded.head()

In [None]:
# Cell 4: Define numeric features to be scaled
# 'PassengerId' is deliberately left out: it is a row identifier, not a
# measurement, so standardizing it would destroy the key and IQR outlier
# checks on it carry no meaning.
# NOTE(review): 'Pclass' is presumably an ordinal class label — confirm that
# treating it as a continuous feature is intended.
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [None]:
# Cell 5: Standardize numeric features using StandardScaler
# fit_transform is fitted on the selected columns and its result is written
# back over those same columns, so after this cell df_encoded no longer holds
# the raw values for them.
# NOTE(review): the scaler is fitted before the outlier removal in Cell 9, so
# any outliers contribute to the fitted statistics — confirm this order is
# intended.
# NOTE(review): confirm how missing values in these columns (e.g. 'Age', if it
# has any) are expected to pass through the scaler.
scaler = StandardScaler()
df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])

In [None]:
# Cell 6: Display the transformed data
# Heading and preview are emitted by a single print call, one item per line.
print("Transformed Data:", df_encoded.head(), sep="\n")

In [None]:
# Cell 7: Visualize boxplots to identify outliers
# One 6x4 figure per feature, drawn through explicit figure/axes handles
# rather than the implicit "current figure".
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df_encoded[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Cell 8: Function to remove outliers using IQR method
def remove_outliers_iqr(df, columns, factor=1.5):
    """Return the rows of ``df`` that lie inside the IQR fences of every column.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). All fences are computed from the
    unfiltered input, so the result does not depend on the order of
    ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the numeric columns to check.
    factor : float, default 1.5
        Fence width expressed as a multiple of the IQR.

    Returns
    -------
    pandas.DataFrame
        The rows that fall within the fences of all ``columns``. ``df`` itself
        is returned when ``columns`` is empty.

    Notes
    -----
    * Rows with a missing value in any checked column are dropped, because a
      NaN never counts as being inside the fences.
    * When a column's IQR is 0 both fences collapse onto the quartile value,
      so only rows equal to that value survive.
    """
    keep = None  # combined boolean mask; stays None until a column is seen
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = factor * (q3 - q1)
        # Evaluate against the original frame, not a progressively filtered one.
        within = df[col].between(q1 - spread, q3 + spread)
        keep = within if keep is None else keep & within
    return df if keep is None else df[keep]

In [None]:
# Cell 9: Remove outliers
# Feed the frame through the IQR helper, checking every listed numeric feature.
df_cleaned = df_encoded.pipe(remove_outliers_iqr, numeric_features)

In [None]:
# Cell 10: Check the shape of cleaned dataset
# Prints the cleaned frame's .shape; comparing it with the row count reported
# by info() in Cell 3 shows how many rows the outlier filter removed.
print("Shape after outlier removal:", df_cleaned.shape)

In [None]:
# Cell 11: Display cleaned data preview
# Last expression of the cell, so the notebook renders the first rows of the
# filtered frame as the cell output.
df_cleaned.head()