In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as plt

In [2]:
titanic = pd.read_csv("Titanic-Dataset.csv")

In [3]:
titanic_df = pd.DataFrame(titanic)

In [4]:
# Shape of dataset (rows × columns).
titanic_df.shape

(891, 12)

In [5]:
# First 5 rows (head).
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Column data types.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# Count missing values per column.
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Summary statistics for numeric columns (mean, median, std, min, max).
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Preprocessing Steps
### A. Handle Missing Values

In [9]:
# Age → fill with median or group median (by Pclass & Sex).
titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace = True)


In [10]:
titanic_df['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [11]:
# Cabin → many missing; consider new column Has_Cabin or drop.
titanic_df.drop('Cabin',axis = 1, inplace = True)

In [12]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [13]:
# Embarked → fill with most common port (mode).
titanic_df["Embarked"].fillna(titanic_df["Embarked"].mode, inplace = True)

In [14]:
# Fare → fill with median if missing.
titanic_df["Fare"].fillna(titanic_df["Fare"].median(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df["Fare"].fillna(titanic_df["Fare"].median(), inplace = True)


### B. Handle Duplicates

In [15]:
# Rare in Titanic dataset, but check PassengerId or Name duplicates.
titanic_df.duplicated(subset =["PassengerId","Name"]).sum()

np.int64(0)

### C. Feature Engineering

In [16]:
# Title from Name (Mr, Mrs, Miss, Master, etc.).
titanic_df["Title"] = titanic_df["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)

In [17]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [18]:
# FamilySize = SibSp + Parch + 1.
titanic_df["FamilySize"] = titanic_df["SibSp"] + titanic_df["Parch"] + 1

In [19]:
# IsAlone = 1 if FamilySize = 1 else 0.
titanic_df['IsAlone'] = titanic_df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)


In [20]:
# AgeGroup: Child, Teen, Adult, Senior.
conditions = [
    (titanic_df['Age'] < 12),
    (titanic_df['Age'] < 18),
    (titanic_df['Age'] < 60),
    (titanic_df['Age'] >= 60)
]
choices = ['Child', 'Teen', 'Adult', 'Senior']
titanic_df["AgeGroup"] = np.select(conditions, choices, default = "Unknown")

In [21]:
# FareBin: Low, Medium, High fare categories.
fare_conditions = [
    (titanic_df["Fare"] < 10),
    (titanic_df["Fare"] < 30),
    (titanic_df["Fare"] <= 520)
]
fare_choices = ["Low", "Medium","High"]
titanic_df["FareBin"] = np.select(fare_conditions,fare_choices, default="Unknown")