# Titanic Data Cleaning Notebook
This notebook performs cleaning and feature engineering on the Titanic dataset.

In [11]:
import pandas as pd
import numpy as np

In [12]:
# Load the Titanic CSV at the path below into the working DataFrame `df`.
source_csv = '/content/cleaned_titanic.csv'
df = pd.read_csv(filepath_or_buffer=source_csv)

In [13]:
# Quick structural overview of the loaded frame: dimensions, column dtypes,
# per-column null counts, and the first few rows.
print('Shape:', df.shape)
print('\nInfo:')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print('\nMissing values per column:')
print(df.isnull().sum())
print('\nSample rows:')
print(df.head())

Shape: (891, 10)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
 8   Title     891 non-null    object 
 9   AgeGroup  891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB
None

Missing values per column:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
AgeGroup    0
dtype: int64

Sample rows:
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked Title  \
0         0       3    male  22.0      1      0   7.2500        S

In [14]:
# Print a frequency table (missing values included) for each column below.
for column in ('Embarked', 'Sex', 'Pclass'):
    counts = df[column].value_counts(dropna=False)
    print(f'\nValue counts - {column}:\n', counts)


Value counts - Embarked:
 Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

Value counts - Sex:
 Sex
male      577
female    314
Name: count, dtype: int64

Value counts - Pclass:
 Pclass
3    491
1    216
2    184
Name: count, dtype: int64


In [15]:
# Impute missing values: the column median for Age, and the most frequent
# value for Embarked.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Derive a Title feature from Name when that column is present; otherwise
# leave the frame untouched and report the missing column.
if 'Name' in df.columns:
    # Raw string: `\.` must reach the regex engine as an escaped literal dot.
    # In a normal string literal `\.` is an invalid escape sequence, which
    # Python flags with a warning at compile time.
    df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Collapse the listed infrequent honorifics into a single 'Rare' bucket.
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    # Map variant spellings onto the Miss / Mrs titles.
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
else:
    print("⚠️ Column 'Name' not found!")



⚠️ Column 'Name' not found!


In [16]:
# Write the frame to a CSV in the current working directory, leaving out the
# index column, then confirm on stdout.
# NOTE(review): the load cell reads '/content/cleaned_titanic.csv'; if the
# kernel's working directory is /content, this write replaces that input
# file — confirm this is intended.
output_csv = 'cleaned_titanic.csv'
df.to_csv(path_or_buf=output_csv, index=False)
print('\n✅ Cleaned dataset saved as cleaned_titanic.csv')


✅ Cleaned dataset saved as cleaned_titanic.csv
