In [2]:
# Import all libraries first
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Data Loading & Initial Cleanup
df = pd.read_csv('Titanic-Dataset.csv')
df.drop('Cabin', axis=1, inplace=True)

# 2. Handling Missing Values
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)
most_frequent_embarked = df['Embarked'].mode()[0]
df['Embarked'].fillna(most_frequent_embarked, inplace=True)

# 3. Categorical Encoding & Dropping Unnecessary Columns
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked', drop_first=True)
df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

# 4. Standardization
features_to_scale = ['Age', 'Fare', 'SibSp', 'Parch']
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# 5. Outlier Removal (using IQR on 'Fare')
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_cleaned = df[(df['Fare'] >= lower_bound) & (df['Fare'] <= upper_bound)]

print("Preprocessing completed successfully in a single flow.")
print(df_cleaned.head())

Preprocessing completed successfully in a single flow.
   Survived  Pclass  Sex       Age     SibSp     Parch      Fare  Embarked_Q  \
0         0       3    1 -0.565736  0.432793 -0.473674 -0.502445       False   
2         1       3    0 -0.258337 -0.474545 -0.473674 -0.488854       False   
3         1       1    0  0.433312  0.432793 -0.473674  0.420730       False   
4         0       3    1  0.433312 -0.474545 -0.473674 -0.486337       False   
5         0       3    1 -0.104637 -0.474545 -0.473674 -0.478116        True   

   Embarked_S  
0        True  
2        True  
3        True  
4        True  
5       False  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(most_frequent_embarked, inplace=True)
