In [1]:
import pandas as pd
# import plotly.graph_objects as go

In [2]:
# Load the Titanic training set; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("train.csv")

In [3]:
# Preview the first five rows to confirm the load and see which columns are available.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Restrict the frame to the four fields this analysis uses; the remaining columns are not needed.
df_trimmed = df.loc[:, ['Pclass', 'Sex', 'Age', 'Survived']]

# Pclass, Sex and Survived are already categorical, which suits a sankey visualisation.
# Age is continuous, so it is binned into age groups in a later cell.
df_trimmed

Unnamed: 0,Pclass,Sex,Age,Survived
0,3,male,22.0,0
1,1,female,38.0,1
2,3,female,26.0,1
3,1,female,35.0,1
4,3,male,35.0,0
...,...,...,...,...
886,2,male,27.0,0
887,1,female,19.0,1
888,3,female,,0
889,1,male,26.0,1


In [5]:
# Flag, per column, whether any value is missing.
df_trimmed.isna().any()

Pclass      False
Sex         False
Age          True
Survived    False
dtype: bool

In [6]:
# Frequency of every distinct Age value, keeping missing ages as their own row (dropna=False).
# The NaN row shows that 177 passengers have no recorded age. They are left untouched here;
# note that pd.cut returns NaN for them rather than placing them in a labelled age group.

df_trimmed.groupby(by='Age', dropna=False).size()

Age
0.42       1
0.67       1
0.75       2
0.83       2
0.92       1
        ... 
70.50      1
71.00      2
74.00      1
80.00      1
NaN      177
Length: 89, dtype: int64

In [7]:
# Bin Age into age groups, using the NIH style guide as a guideline:
# https://www.nih.gov/nih-style-guide/age
# 'Newborns' is merged into 'Infants' and 'Older Adults' is renamed 'Senior'.
#
#   Infants      younger than 1 year      [0, 1)
#   Children     1 year through 12 years  [1, 13)
#   Adolescents  13 through 17 years      [13, 18)
#   Adults       18 through 64 years      [18, 65)
#   Senior       65 years and older       [65, 999)
#
# Intervals are closed on the LEFT (right=False) so that each boundary age lands in the
# group the table above assigns it to: an 18-year-old is an Adult and a 65-year-old is a
# Senior. The previous right-closed edges [0, 1, 12, 18, 65, 999] put age 18 in
# 'Adolescents', age 65 in 'Adults', and fractional ages such as 12.5 in 'Adolescents'.
# The upper edge 999 is just a sentinel above any plausible age.
#
# Missing ages are not binned: pd.cut returns NaN for them, so AgeGroup is NaN for
# those passengers.
age_bins = [0, 1, 13, 18, 65, 999]
age_labels = ['Infants', 'Children', 'Adolescents', 'Adults', 'Senior']

df_trimmed = df_trimmed.assign(
    AgeGroup=pd.cut(
        x=df_trimmed['Age'],
        bins=age_bins,
        right=False,
        labels=age_labels,
    )
)

In [79]:
# First sankey dataframe: one row per (Pclass, Sex) pair, with Pclass as the link source,
# Sex as the link target, and Value holding the count of non-null Survived entries in that pair.
df_trimmed_sankey_1 = (
    df_trimmed
    .groupby(['Pclass', 'Sex'])['Survived']
    .count()
    .reset_index(name='Value')
    .rename(columns={'Pclass': 'Source', 'Sex': 'Target'})
)

df_trimmed_sankey_1

Unnamed: 0,Source,Target,Value
0,1,female,94
1,1,male,122
2,2,female,76
3,2,male,108
4,3,female,144
5,3,male,347
