In [64]:
# initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

<h5><strong>Loading the data and EDA</strong></h5>

In [65]:
# loading the data from .csv into a pandas DataFrame
data_path = "titanic\\train.csv"
titanic_data = pd.read_csv(data_path)

In [66]:
print(f"The dataset contains {titanic_data.shape[0]} rows and {titanic_data.shape[1]} columns.")
print(f"The dataset contains the following columns: {titanic_data.columns.tolist()}")

The dataset contains 891 rows and 12 columns.
The dataset contains the following columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [67]:
titanic_data.head() # display the first five rows of the dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [68]:
titanic_data.info() # a concise summary of the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [69]:
deceased_count = titanic_data['Survived'].value_counts()[0] # count of deceased passengers
percentage_deceased = (deceased_count / titanic_data.shape[0]) * 100 # percentage of deceased passengers
survived_count = titanic_data['Survived'].value_counts()[1] # count of survived passengers
percentage_survived = (survived_count / titanic_data.shape[0]) * 100 # percentage of survived passengers
print(f"Total deceased passengers: {deceased_count}\nThis is {percentage_deceased:.2f}% of the total passengers.")
print(f"Total survived passengers: {survived_count}\nThis is {percentage_survived:.2f}% of the total passengers.")

Total deceased passengers: 549
This is 61.62% of the total passengers.
Total survived passengers: 342
This is 38.38% of the total passengers.


In [70]:
pd.crosstab(titanic_data['Sex'], titanic_data['Survived'], margins=True) # cross-tabulation of 'Sex' and 'Survived' 

Survived,0,1,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,81,233,314
male,468,109,577
All,549,342,891


In [71]:
pd.crosstab(titanic_data['Pclass'], titanic_data['Survived'], margins=True) # cross-tabulation of 'Pclass' and 'Survived' 

Survived,0,1,All
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,136,216
2,97,87,184
3,372,119,491
All,549,342,891


In [72]:
pd.crosstab(titanic_data['Embarked'], titanic_data['Survived'], margins=True) # cross-tabulation of 'Embarked' and 'Survived' 

Survived,0,1,All
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,75,93,168
Q,47,30,77
S,427,217,644
All,549,340,889


In [73]:
age_bins = np.arange(0, 90, 10) # 0–80 in steps of 10
survival_by_age = titanic_data.groupby(
    pd.cut(titanic_data["Age"], bins=age_bins, precision=0),
    observed=True
)["Survived"].mean()

print("\nSurvival Rate by Age:")
for age_range, rate in (survival_by_age * 100).round().items():
    print(f"{age_range}: {rate:.0f}%")


Survival Rate by Age:
(0, 10]: 59%
(10, 20]: 38%
(20, 30]: 37%
(30, 40]: 45%
(40, 50]: 38%
(50, 60]: 40%
(60, 70]: 24%
(70, 80]: 20%


In [74]:
fare_bins = np.arange(0, 600, 50)  # 0–550 in steps of 50

survival_by_fare = titanic_data.groupby(
    pd.cut(titanic_data["Fare"], bins=fare_bins, precision=0),
    observed=True
)["Survived"].mean()

print("\nSurvival Rate by Fare:")
for fare_range, rate in (survival_by_fare * 100).round().items():
    print(f"{fare_range}: {rate:.0f}%")


Survival Rate by Fare:
(0, 50]: 32%
(50, 100]: 65%
(100, 150]: 79%
(150, 200]: 67%
(200, 250]: 64%
(250, 300]: 67%
(500, 550]: 100%


Initial Exploratory Data Analysis (EDA) suggests - 

1. More passengers died than survived
2. Women stood a better chance of survival than men.
3. Higher class passengers stood a better chance of survival than lower class passengers
4. Children (10 and <) stood a better chance of survival than any other age group
5. Those who paid the lowest fare had less chance of surviving.
6. Those who embarked at Sounthampton had less chance of surviving.

<h5><strong>Missing Data<strong></h5>


In [75]:
missing_count = titanic_data.isnull().sum() # count missing values in each column
print(f"Missing values in each column:\n{missing_count}") 

Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [76]:
percentage_age_missing = (missing_count['Age'] / titanic_data.shape[0]) * 100 # calculate percentage of missing values in 'Age' column
percentage_cabin_missing = (missing_count['Cabin'] / titanic_data.shape[0]) * 100 # calculate percentage of missing values in 'Cabin' column
percentage_embarked_missing = (missing_count['Embarked'] / titanic_data.shape[0]) * 100 # calculate percentage of missing values in 'Embarked' column    
print(f"Percentage of missing values in 'Age': {percentage_age_missing:.2f}%")
print(f"Percentage of missing values in 'Cabin': {percentage_cabin_missing:.2f}%")
print(f"Percentage of missing values in 'Embarked': {percentage_embarked_missing:.2f}%")

Percentage of missing values in 'Age': 19.87%
Percentage of missing values in 'Cabin': 77.10%
Percentage of missing values in 'Embarked': 0.22%


Drop <strong>Cabin</strong> due to such a high volume of missing values.
<strong>Age</strong> missing values imputed using a median value grouped by <strong>Sex</strong> and <strong>Pclass</strong>.
<strong>Embarked</strong> missing values impute using mode.

In [77]:
titanic_data = titanic_data.drop(columns=['Cabin'], axis=1)  # drop 'Cabin' column due to high missing values

In [78]:
titanic_data['Age'] = titanic_data.groupby(['Sex', 'Pclass'])['Age'].transform(
    lambda x: x.fillna(x.median())
) # fill missing 'Age' values with median age based on 'Sex' and 'Pclass'    

In [79]:
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0]) # fill missing 'Embarked' values with mode

In [80]:
missing_count = titanic_data.isnull().sum() # count missing values in each column
print(f"Missing values in each column:\n{missing_count}") 

Missing values in each column:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


<h5><strong>Feature Engineering</strong></h5>

1. Consider if <strong>SibSp</strong> and <strong>Parch</strong> can infer a family size group.
2. Consider therefore if there is a way of identifying solo passengers.
3. There are some honorific title (Sir, Countess etc) in passenger name data, can these be stripped and used as an attribute (stripped and categorised as honorific "Yes / No")


In [81]:
titanic_data["FamilySize"] = titanic_data["SibSp"] + titanic_data["Parch"] + 1  # create 'FamilySize' feature

In [83]:
titanic_data["SoloTraveler"] = (titanic_data["FamilySize"] == 1).astype(int)  # create 'SoloTraveler' feature

In [None]:
titanic_data["Title"] = titanic_data["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False) # extract 'Title' from 'Name' column

In [None]:
titanic_data["Title"].value_counts() # display counts of each unique title

Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: count, dtype: int64

In [89]:
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
titanic_data['Title'] = titanic_data['Title'].replace(rare_titles, 'Rare') # group rare titles under 'Rare' category

In [91]:
titanic_data["Title"].value_counts() # display counts of each unique title

Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       23
Mlle        2
Mme         1
Ms          1
Name: count, dtype: int64

In [92]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,SoloTraveler,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,1,Mr
