In [18]:
import pandas as pd

# Load the passenger table from the CSV file in the working directory.
df = pd.read_csv("Titanic-Dataset.csv")

# Per-column count of missing entries before any cleaning.
print("\nMissing Values BEFORE Handling:")
print(df.isna().sum())

# Fill the gaps column by column: 'Age' with its median, 'Embarked' with its
# most frequent value (mode() can return several values; the first is used).
df = df.fillna(
    value={
        'Age': df['Age'].median(),
        'Embarked': df['Embarked'].mode()[0],
    }
)

# Drop the columns listed as irrelevant. 'Cabin' is removed here rather than
# imputed, so its missing values disappear together with the column.
irrelevant_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=irrelevant_cols)

# Same per-column count after imputation and column removal.
print("\nMissing Values After Handling:")
print(df.isna().sum())


Missing Values BEFORE Handling:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing Values After Handling:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [19]:
import numpy as np

# Quartiles of 'Fare' and the spread between them (interquartile range).
Q1, Q3 = (df['Fare'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Lower fence of the 1.5 * IQR rule.
lower_bound = Q1 - 1.5 * IQR

print("Q1 (25th percentile):", Q1)
print("Q3 (75th percentile):", Q3)
print("IQR:", IQR)
print("Lower Bound:", lower_bound)

# Raise every fare below the fence up to the fence itself; values at or above
# it (and missing values) are left unchanged.
df['Fare'] = df['Fare'].clip(lower=lower_bound)




Q1 (25th percentile): 7.9104
Q3 (75th percentile): 31.0
IQR: 23.0896
Lower Bound: -26.724


In [20]:
import numpy as np

# Quartiles of 'Fare' and the spread between them (interquartile range).
Q1, Q3 = (df['Fare'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Upper fence of the 1.5 * IQR rule.
upper_bound = Q3 + 1.5 * IQR

print("Q1 (25th percentile):", Q1)
print("Q3 (75th percentile):", Q3)
print("IQR:", IQR)
print("Upper Bound:", upper_bound)

# Pull every fare above the fence down to the fence itself; values at or below
# it (and missing values) are left unchanged.
df['Fare'] = df['Fare'].clip(upper=upper_bound)


Q1 (25th percentile): 7.9104
Q3 (75th percentile): 31.0
IQR: 23.0896
Upper Bound: 65.6344


In [21]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode 'Sex' and 'Embarked'; drop_first=True omits the first level of
# each column so the remaining indicator columns are not redundant.
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

# Label encoding for remaining categorical columns.
# Each column gets its own fitted encoder, stored in `label_encoders` under the
# column name. Re-fitting a single shared encoder would keep only the last
# column's mapping, so earlier columns could not be decoded again with
# inverse_transform().
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent — consider switching if
# this loop ever encodes real feature columns.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [22]:
# Render the current contents of df as a rich notebook table.
display(df)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,True,False,True
1,1,1,38.0,1,0,65.6344,False,False,False
2,1,3,26.0,0,0,7.9250,False,False,True
3,1,1,35.0,1,0,53.1000,False,False,True
4,0,3,35.0,0,0,8.0500,True,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,True,False,True
887,1,1,19.0,0,0,30.0000,False,False,True
888,0,3,28.0,1,2,23.4500,False,False,True
889,1,1,26.0,0,0,30.0000,True,False,False


In [26]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop(columns='Survived')

# NOTE(review): the Age median, Embarked mode and Fare bounds applied earlier
# in this notebook were computed on the full frame before this split, so
# validation/test rows contributed to them — consider deriving those
# statistics from the training portion only.

# First cut: 70 % for training, 30 % held back. Stratifying on y keeps the
# class proportions of 'Survived' in both parts; random_state fixes the shuffle.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Second cut: halve the held-back 30 % into validation and test (15 % each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

print("\nDataset Split Shapes")
print("---------------------")
print("Training Set   :", X_train.shape, y_train.shape)
print("Validation Set :", X_val.shape, y_val.shape)
print("Testing Set    :", X_test.shape, y_test.shape)


Dataset Split Shapes
---------------------
Training Set   : (623, 8) (623,)
Validation Set : (134, 8) (134,)
Testing Set    : (134, 8) (134,)
