In [1]:
!pip install pandas numpy scikit-learn --quiet

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
import seaborn as sns
df = sns.load_dataset("titanic")   # you can replace this with your own dataset
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# check missing values
print(df.isnull().sum())

# drop duplicates if any
df.drop_duplicates(inplace=True)
print("Shape after removing duplicates:", df.shape)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
Shape after removing duplicates: (784, 15)


In [4]:
# fill numerical columns with mean
df.fillna(df.mean(numeric_only=True), inplace=True)

# fill categorical columns with mode
for col in df.select_dtypes(include=['object', 'category']):
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [5]:
label_enc = LabelEncoder()
for col in df.select_dtypes(include=['object', 'category']):
    df[col] = label_enc.fit_transform(df[col])
    df.head()

In [6]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.select_dtypes(include=[np.number]))
scaled_df = pd.DataFrame(scaled_data, columns=df.select_dtypes(include=[np.number]).columns)
scaled_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,deck,embark_town,alive
0,-0.837049,0.885158,0.77249,-0.573777,0.484009,-0.497157,-0.526825,0.58625,0.885158,-0.350481,-0.129306,0.58625,-0.837049
1,1.194673,-1.455362,-1.294515,0.592829,0.484009,-0.497157,0.701587,-1.904915,-1.455362,1.27542,-0.129306,-1.904915,1.194673
2,1.194673,0.885158,-1.294515,-0.282126,-0.530599,-0.497157,-0.513876,0.58625,0.885158,1.27542,-0.129306,0.58625,1.194673
3,1.194673,-1.455362,-1.294515,0.374091,0.484009,-0.497157,0.35276,0.58625,-1.455362,1.27542,-0.129306,0.58625,1.194673
4,-0.837049,0.885158,0.77249,0.374091,-0.530599,-0.497157,-0.511478,0.58625,0.885158,-0.350481,-0.129306,0.58625,-0.837049


In [7]:
df.to_csv("Cleaned_Titanic_Data.csv", index=False)
print("✅ Data cleaned and saved successfully!")

✅ Data cleaned and saved successfully!
