# Titanic Dataset - Data Cleaning & Preprocessing
This notebook performs preprocessing on the Titanic dataset as per the internship task:
- Handle missing values
- Encode categorical variables
- Normalize numerical features
- Remove outliers

The cleaned dataset is saved to `Titanic-Dataset-CLEANED.csv`, ready to be used for machine learning.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:

# Read the raw passenger records; the path is resolved against the
# current working directory.
csv_path = "Titanic-Dataset.csv"
df = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check on the parsed columns.
df.head()

In [None]:

# Print the column dtypes and non-null counts.
df.info()

# Count the missing values in each column; as the last expression this
# Series is what the cell displays.
df.isna().sum()

In [None]:

# Fill missing values by assigning the filled column back to the frame.
# `df['col'].fillna(..., inplace=True)` is chained assignment: it acts on a
# temporary Series, raises a FutureWarning on pandas 2.x and leaves `df`
# untouched once Copy-on-Write is enabled, so the NaNs would silently remain.
df['Age'] = df['Age'].fillna(df['Age'].median())                   # numeric -> median
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])   # categorical -> most frequent value

# Cabin is removed outright instead of being imputed.
df = df.drop(columns=['Cabin'])

In [None]:

# Discard the Name and Ticket columns; the result is bound back to `df`
# rather than mutating the frame in place.
df = df.drop(columns=['Name', 'Ticket'])

In [None]:

# Replace the Sex labels with integer codes. The fitted encoder stays
# available as `le` so the codes can be mapped back to labels later.
le = LabelEncoder()
le.fit(df['Sex'])
df['Sex'] = le.transform(df['Sex'])

# One-hot encode Embarked; drop_first=True omits the first level's column.
df = pd.get_dummies(data=df, columns=['Embarked'], drop_first=True)

In [None]:
# Standardise the continuous features Age and Fare.
# NOTE(review): the scaler is fitted on every row present at this point,
# i.e. before the IQR outlier filter in a later cell, so the rows that
# survive that filter will not be exactly zero-mean / unit-variance.
scaled_cols = ['Age', 'Fare']
scaler = StandardScaler()
scaler.fit(df[scaled_cols])
df[scaled_cols] = scaler.transform(df[scaled_cols])

In [None]:
# Draw one boxplot per numeric feature, each on its own figure, to eyeball
# the spread and potential outliers.
for col in ('Age', 'Fare', 'SibSp', 'Parch'):
    fig, ax = plt.subplots()
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Tukey-fence outlier filter: drop every row that has a value outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in any of the checked columns.
outlier_cols = ['Age', 'Fare', 'SibSp', 'Parch']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

# When Q1 == Q3 for a column, the IQR is 0 and both fences collapse onto that
# single value, so every row holding any other value would be thrown away.
# Count-like columns such as SibSp / Parch are prone to this. Such columns
# carry no usable spread for the rule and are skipped.
fence_cols = IQR[IQR > 0].index.tolist()
lower_fence = Q1[fence_cols] - 1.5 * IQR[fence_cols]
upper_fence = Q3[fence_cols] + 1.5 * IQR[fence_cols]

# Comparisons align the fence Series with the frame's columns; NaN compares
# False on both sides, so rows with missing values are kept, as before.
is_outlier = (df[fence_cols] < lower_fence) | (df[fence_cols] > upper_fence)
df = df[~is_outlier.any(axis=1)]

In [None]:

# Report the final dimensions and persist the cleaned data (without the index).
print("Final shape:", df.shape)
df.to_csv("Titanic-Dataset-CLEANED.csv", index=False)

# A notebook only renders the value of a cell's last expression, so the
# preview has to come last to actually be displayed.
df.head()