In [8]:
# ðŸ“Œ Step 1: Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ðŸ“Œ Step 2: Load Titanic Dataset
df = sns.load_dataset('titanic')
print("Initial shape:", df.shape)
print(df.head())

# ðŸ“Œ Step 3: Handle Missing Values
# Drop columns with too many missing values
df.drop(columns=['deck'], inplace=True)

# Fill missing 'age' with median
df['age'].fillna(df['age'].median(), inplace=True)

# Fill missing 'embark_town' with mode
df['embark_town'].fillna(df['embark_town'].mode()[0], inplace=True)

# Drop remaining rows with missing values
df.dropna(inplace=True)
print("Shape after cleaning:", df.shape)

# ðŸ“Œ Step 4: Encode Categorical Variables
le = LabelEncoder()

# Encode 'sex' and 'embark_town'
df['sex'] = le.fit_transform(df['sex'])  # male=1, female=0
df['embark_town'] = le.fit_transform(df['embark_town'])

# One-hot encode 'class' column
df = pd.get_dummies(df, columns=['class'], drop_first=True)

# ðŸ“Œ Step 5: Feature Scaling
scaler = StandardScaler()
df[['age', 'fare']] = scaler.fit_transform(df[['age', 'fare']])

# ðŸ“Œ Step 6: Prepare Features and Target
# Dynamically select available one-hot encoded class columns
available_class_cols = [col for col in df.columns if col.startswith('class_')]
X = df[['sex', 'age', 'fare', 'embark_town'] + available_class_cols]
y = df['survived']

# ðŸ“Œ Step 7: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ðŸ“Œ Step 8: Display Final Shapes
print("âœ… Preprocessing Complete")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Initial shape: (891, 15)
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
Shape after cleaning: (889, 14)
âœ… Preprocessing Complete
Training set shape: (711, 6)
Testing set shape: (178, 6)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embark_town'].fillna(df['embark_town'].mode()[0], inplace=True)
