In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Step 2: Load the dataset
df = pd.read_csv("/content/Titanic-Dataset.csv")

# Step 3: Preview dataset
print(" Dataset Preview:")
print(df.iloc[:, :4].head())

# Step 4: Check missing values
print("\n Missing Values:")
print(df.isnull().sum())

# Step 5: Fill missing values
# Fill Age with median
if 'Age' in df.columns:
    df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill Embarked with mode
if 'Embarked' in df.columns:
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Fill Cabin with 'Unknown' or drop if sparse
if 'Cabin' in df.columns:
    missing_ratio = df['Cabin'].isnull().mean()
    if missing_ratio > 0.8:
        df.drop('Cabin', axis=1, inplace=True)
    else:
        df['Cabin'].fillna('Unknown', inplace=True)

# Drop rows with any remaining missing values (if needed)
df.dropna(inplace=True)

# Step 6: Remove duplicates
duplicates = df.duplicated().sum()
print(f"\n Duplicates Found: {duplicates}")
df.drop_duplicates(inplace=True)

# Step 7: Drop unnecessary columns (optional)
drop_cols = ['Name', 'Ticket']  # Add more if needed
df.drop([col for col in drop_cols if col in df.columns], axis=1, inplace=True)

# Step 8: Convert data types
if 'Survived' in df.columns:
    df['Survived'] = df['Survived'].astype('category')
if 'Pclass' in df.columns:
    df['Pclass'] = df['Pclass'].astype('category')

# Step 9: Ensure consistency in categorical values
if 'Sex' in df.columns:
    df['Sex'] = df['Sex'].str.lower().str.strip()
if 'Embarked' in df.columns:
    df['Embarked'] = df['Embarked'].str.upper().str.strip()

# Step 10: Normalize numeric columns
# Choose either Min-Max or Standardization
numeric_cols = ['Age', 'Fare']
available_numeric_cols = [col for col in numeric_cols if col in df.columns]

# Min-Max Scaling
scaler = MinMaxScaler()
df[available_numeric_cols] = scaler.fit_transform(df[available_numeric_cols])

# Final info
print("\n Cleaned Data Info:")
print(df.info())

print("\n Summary Statistics:")
print(df.describe(include='all'))

 Dataset Preview:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  
0                            Braund, Mr. Owen Harris  
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  
2                             Heikkinen, Miss. Laina  
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  
4                           Allen, Mr. William Henry  

 Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

 Duplicates Found: 0

 Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null C

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we