In [49]:
import seaborn as sns
import pandas as pd
import numpy as np

# Step 1: read the raw Titanic passenger table from its CSV export.
# The location lives in one named variable so it is easy to find and repoint.
titanic_csv_path = '/content/Titanic-Dataset.csv'
df = pd.read_csv(titanic_csv_path)

print("Original Data:")

Original Data:


In [50]:
# Preview the first five rows as plain text (five is also head()'s default).
first_rows = df.head(n=5)
print(first_rows)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [51]:
# Record the (rows, columns) size of the frame before any cleaning step runs.
shape_before = df.shape
print("\nShape before cleaning:", shape_before)


Shape before cleaning: (891, 12)


In [52]:
# Report how many values are missing in each column before imputing.
print("\nMissing values per column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# One replacement value per column, applied in a single fillna pass:
#   - Age and Fare (numeric) use the column median
#   - Embarked uses its most frequent value (the first mode)
#   - Cabin uses the literal placeholder 'Unknown'
fill_values = {
    'Age': df['Age'].median(),
    'Fare': df['Fare'].median(),
    'Embarked': df['Embarked'].mode()[0],
    'Cabin': 'Unknown',
}
df = df.fillna(value=fill_values)


Missing values per column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [53]:
# Show the dtypes as they stand before any explicit cast.
print("\nBefore fixing dtypes:")
print(df.dtypes)

# Cast all four columns in one pass: Pclass and Survived as integers,
# Sex and Embarked as pandas categoricals.
target_dtypes = {
    'Pclass': int,
    'Survived': int,
    'Sex': 'category',
    'Embarked': 'category',
}
df = df.astype(target_dtypes)

print("\nAfter fixing dtypes:")
print(df.dtypes)


Before fixing dtypes:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

After fixing dtypes:
PassengerId       int64
Survived          int64
Pclass            int64
Name             object
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked       category
dtype: object


In [54]:
# Count fully duplicated rows, drop them, then count again to confirm.
duplicates_before = df.duplicated().sum()
print("\nDuplicate rows before:", duplicates_before)

df = df.drop_duplicates()

duplicates_after = df.duplicated().sum()
print("Duplicate rows after:", duplicates_after)


Duplicate rows before: 0
Duplicate rows after: 0


In [55]:
# Normalise every column label: trim surrounding whitespace, lowercase it,
# and turn any inner spaces into underscores.
normalized_labels = [
    label.strip().lower().replace(' ', '_') for label in df.columns
]
df.columns = normalized_labels
print("\nStandardized column names:")
print(df.columns)


Standardized column names:
Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')


In [56]:
# Summarise the cleaning run. Each status flag is computed from the current
# state of `df` instead of being hard-coded, so the log cannot report success
# for a step that was skipped or that silently failed.
missing_handled = not df.isnull().values.any()
duplicates_removed = not df.duplicated().any()
# The dtype step is considered done when sex/embarked are categoricals and
# survived/pclass are integer-typed (column names are lowercase at this point).
dtypes_fixed = (
    all(df[col].dtype.name == "category" for col in ("sex", "embarked"))
    and all(pd.api.types.is_integer_dtype(df[col]) for col in ("survived", "pclass"))
)

cleaning_log = {
    "Rows after cleaning": len(df),
    "Missing values handled": missing_handled,
    "Duplicates removed": duplicates_removed,
    "Data types fixed": dtypes_fixed,
    "Columns standardized": list(df.columns)
}

# Header uses the receipt emoji (U+1F9FE).
print("\n🧾 Cleaning Log:")
for key, value in cleaning_log.items():
    print(f"{key}: {value}")


🧾 Cleaning Log:
Rows after cleaning: 891
Missing values handled: True
Duplicates removed: True
Data types fixed: True
Columns standardized: ['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']


In [57]:
# Write the cleaned frame to a path relative to the current working
# directory; index=False keeps the pandas row index out of the file.
output_path = "titanic_cleaned.csv"
df.to_csv(output_path, index=False)
print("\n Cleaned dataset saved as 'titanic_cleaned.csv'")


 Cleaned dataset saved as 'titanic_cleaned.csv'
