In [14]:
# initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Load the training split from CSV into a pandas DataFrame.
# The path uses a forward slash so it resolves on Windows, macOS and Linux alike;
# a backslash separator ("titanic\\train.csv") is only understood on Windows.
data_path = "titanic/train.csv"
titanic_data = pd.read_csv(data_path)

In [16]:
# Report the dataset's dimensions and the names of its columns.
row_total, column_total = titanic_data.shape
column_names = titanic_data.columns.tolist()
print(f"The dataset contains {row_total} rows and {column_total} columns.")
print(f"The dataset contains the following columns: {column_names}")

The dataset contains 891 rows and 12 columns.
The dataset contains the following columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [17]:
# Preview the first five rows of the dataset.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# Print a concise summary of the DataFrame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
# Tally the null entries in every column.
# NOTE: the variable name's spelling is kept as-is because the following cell reads it.
null_flags = titanic_data.isna()
missng_count = null_flags.sum()
print(f"Missing values in each column:\n{missng_count}")

Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [20]:
# Express the missing-value counts of the three incomplete columns
# (Age, Cabin, Embarked) as a percentage of all rows.
incomplete_columns = ['Age', 'Cabin', 'Embarked']
missing_share = missng_count[incomplete_columns] / titanic_data.shape[0] * 100

percentage_age_missing = missing_share['Age']
percentage_cabin_missing = missing_share['Cabin']
percentage_embarked_missing = missing_share['Embarked']

print(f"Percentage of missing values in 'Age': {percentage_age_missing:.2f}%")
print(f"Percentage of missing values in 'Cabin': {percentage_cabin_missing:.2f}%")
print(f"Percentage of missing values in 'Embarked': {percentage_embarked_missing:.2f}%")

Percentage of missing values in 'Age': 19.87%
Percentage of missing values in 'Cabin': 77.10%
Percentage of missing values in 'Embarked': 0.22%


<h5><strong>Summary of initial view on the Titanic data set</strong></h5>

1. 12 columns, seven of which are numeric and five of which are categorical.
2. Columns <strong>Age</strong>, <strong>Cabin</strong> and <strong>Embarked</strong> contain missing values as noted above.
3. <strong>PassengerId</strong> seems to simply be a sequential identifier from 1 through 891 - it likely has little meaning.
4. <strong>Survived</strong> is the target attribute (0 for deceased, 1 for survived)
5. <strong>Pclass</strong> proxies socio-economic status (1 = upper class, 2 = middle class, 3 = lower class).
6. <strong>SibSp</strong> (siblings / spouses) and <strong>Parch</strong> (parents / children) are unclear about exactly who counts as a sibling, spouse, parent or child; however, together they would indicate a familial group metric.
7. <strong>Ticket</strong> is a mix of categorical and numeric values and is likely a ticket identifier that is not hugely helpful to the data set.
8. <strong>Cabin</strong> has a high proportion of missing data (about 77%) and is likely not helpful without a map of the ship to determine where in the vessel the cabins were!

Initial considerations are that some of the attributes noted above (<strong>PassengerId</strong>, <strong>Ticket</strong> and <strong>Cabin</strong>) can likely be dropped; however, some wider exploration will be done first.

The test data is already separate, so no splitting is needed; however, I will pipeline the preprocessing.

In [25]:
# Overall survival outcome totals and their share of all passengers.
# 'Survived' is coded 0 for deceased and 1 for survived.
survival_counts = titanic_data['Survived'].value_counts()  # computed once; indexed by outcome label
total_passengers = titanic_data.shape[0]

# .loc makes the lookup explicitly label-based (the labels 0 / 1 are outcome
# codes, not positions in the sorted counts), avoiding the ambiguity of bare [].
deceased_count = survival_counts.loc[0]  # count of deceased passengers
survived_count = survival_counts.loc[1]  # count of survived passengers

percentage_deceased = (deceased_count / total_passengers) * 100  # percentage of deceased passengers
percentage_survived = (survived_count / total_passengers) * 100  # percentage of survived passengers

print(f"Total deceased passengers: {deceased_count}\nThis is {percentage_deceased:.2f}% of the total passengers.")
print(f"Total survived passengers: {survived_count}\nThis is {percentage_survived:.2f}% of the total passengers.")

Total deceased passengers: 549
This is 61.62% of the total passengers.
Total survived passengers: 342
This is 38.38% of the total passengers.


In [30]:
# Break down the deceased passengers (Survived == 0) by sex and by passenger class.
is_deceased = titanic_data['Survived'] == 0
deceased_by_sex = titanic_data.loc[is_deceased, 'Sex'].value_counts()
deceased_by_pclass = titanic_data.loc[is_deceased, 'Pclass'].value_counts().sort_index()  # ordered 1, 2, 3

print("Deceased passengers by Sex:")
print(deceased_by_sex)
print("\n****************\n")
print("Deceased passengers by Pclass:")
print(deceased_by_pclass)

Deceased passengers by Sex:
Sex
male      468
female     81
Name: count, dtype: int64

****************

Deceased passengers by Pclass:
Pclass
1     80
2     97
3    372
Name: count, dtype: int64
