In [1]:
import pandas as pd
# import plotly.graph_objects as go

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Not all columns are relevant so we will keep those we intend to analyse
df_trimmed = df[['Pclass', 'Sex', 'Age', 'Survived', 'Name']]
df_trimmed['Sex'] = df_trimmed['Sex'].str.capitalize()

# Most of the columns are categorical which is ideal for a sankey visualisation except for Age. Hence, we will consider binning them into age groups which will be defined below
df_trimmed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed['Sex'] = df_trimmed['Sex'].str.capitalize()


Unnamed: 0,Pclass,Sex,Age,Survived,Name
0,3,Male,22.0,0,"Braund, Mr. Owen Harris"
1,1,Female,38.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,3,Female,26.0,1,"Heikkinen, Miss. Laina"
3,1,Female,35.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,3,Male,35.0,0,"Allen, Mr. William Henry"
...,...,...,...,...,...
886,2,Male,27.0,0,"Montvila, Rev. Juozas"
887,1,Female,19.0,1,"Graham, Miss. Margaret Edith"
888,3,Female,,0,"Johnston, Miss. Catherine Helen ""Carrie"""
889,1,Male,26.0,1,"Behr, Mr. Karl Howell"


In [5]:
# Check for NA 
df_trimmed.isnull().any()

Pclass      False
Sex         False
Age          True
Survived    False
Name        False
dtype: bool

In [6]:
# From here, we can see that there are 177 passengers with NULL as their age. We will clean the NULL at the binning level.

df_trimmed.groupby(['Age'], dropna = False).size()

Age
0.42       1
0.67       1
0.75       2
0.83       2
0.92       1
        ... 
70.50      1
71.00      2
74.00      1
80.00      1
NaN      177
Length: 89, dtype: int64

In [7]:
# Set up binning of different age group here using the categorisation here as a guideline: https://www.nih.gov/nih-style-guide/age
# But we will merge 'Newborns' and 'Infants' and rename 'Older Adults' as 'Senior'

# Infants (1 month to 1 year)
# Children (1 year through 12 years)
# Adolescents (13 years through 17 years. They may also be referred to as teenagers depending on the context.)
# Adults (18 years or older)
# Senior (65 and older)

age_bins = [0,1,12,18,65,999]
df_trimmed = df_trimmed.assign(AgeGroup = pd.cut(x = df_trimmed['Age'], bins = age_bins, right = True, include_lowest = False, labels = ['Infants', 'Children', 'Adolescents', 'Adults', 'Senior']))

In [8]:
# Add a new categori - 'Unknown' to group all passengers without NULL as their AgeGroup
df_trimmed['AgeGroup'] = df_trimmed['AgeGroup'].cat.add_categories('Unknown')
df_trimmed['AgeGroup'] = df_trimmed['AgeGroup'].fillna('Unknown')

# Check if there is anymore NULL in the AgeGrou column
df_trimmed[df_trimmed.AgeGroup.isna()]

Unnamed: 0,Pclass,Sex,Age,Survived,Name,AgeGroup


In [9]:
df_trimmed_sankey_1 = df_trimmed.groupby(by = ['Pclass', 'AgeGroup'])['Name'].count().reset_index().rename(columns={'Pclass': 'Source', 'AgeGroup': 'Target', 'Name': 'Value'}, inplace = False)

# This forms the first sankey dataframe
df_trimmed_sankey_1

Unnamed: 0,Source,Target,Value
0,1,Infants,1
1,1,Children,3
2,1,Adolescents,12
3,1,Adults,166
4,1,Senior,4
5,1,Unknown,30
6,2,Infants,5
7,2,Children,12
8,2,Adolescents,12
9,2,Adults,142


In [10]:
# Map the values within the Source column to their respective Pclasses
df_trimmed_sankey_1['Source'] = df_trimmed_sankey_1.Source.map({1: 'Pclass1', 2: 'Pclass2', 3: 'Pclass3'})

df_trimmed_sankey_1

Unnamed: 0,Source,Target,Value
0,Pclass1,Infants,1
1,Pclass1,Children,3
2,Pclass1,Adolescents,12
3,Pclass1,Adults,166
4,Pclass1,Senior,4
5,Pclass1,Unknown,30
6,Pclass2,Infants,5
7,Pclass2,Children,12
8,Pclass2,Adolescents,12
9,Pclass2,Adults,142


In [11]:
df_trimmed_sankey_2 = df_trimmed.groupby(by = ['AgeGroup', 'Sex'])['Name'].count().reset_index().rename(columns={'AgeGroup': 'Source', 'Sex': 'Target', 'Name': 'Value'}, inplace = False)

# This forms the second sankey dataframe
df_trimmed_sankey_2

Unnamed: 0,Source,Target,Value
0,Infants,Female,4
1,Infants,Male,10
2,Children,Female,28
3,Children,Male,27
4,Adolescents,Female,36
5,Adolescents,Male,34
6,Adults,Female,193
7,Adults,Male,374
8,Senior,Female,0
9,Senior,Male,8


In [12]:
df_trimmed_sankey_3 = df_trimmed.groupby(by = ['Sex', 'Survived'])['Name'].count().reset_index().rename(columns={'Sex': 'Source', 'Survived': 'Target', 'Name': 'Value'}, inplace = False)

# This forms the third sankey dataframe
df_trimmed_sankey_3

Unnamed: 0,Source,Target,Value
0,Female,0,81
1,Female,1,233
2,Male,0,468
3,Male,1,109


In [13]:
# Map the values within the Target column to their respective Survival Status
df_trimmed_sankey_3['Target'] = df_trimmed_sankey_3.Target.map({0: 'Died', 1: 'Survived'})

df_trimmed_sankey_3

Unnamed: 0,Source,Target,Value
0,Female,Died,81
1,Female,Survived,233
2,Male,Died,468
3,Male,Survived,109


In [14]:
# Combine all sankey dataframe together
sankey_main = pd.concat([df_trimmed_sankey_1, df_trimmed_sankey_2, df_trimmed_sankey_3], axis = 0)

# We now need to manipulate the dataset in a way that plotly is able to understand in order to generate the Alluvial diagram
sankey_main

Unnamed: 0,Source,Target,Value
0,Pclass1,Infants,1
1,Pclass1,Children,3
2,Pclass1,Adolescents,12
3,Pclass1,Adults,166
4,Pclass1,Senior,4
5,Pclass1,Unknown,30
6,Pclass2,Infants,5
7,Pclass2,Children,12
8,Pclass2,Adolescents,12
9,Pclass2,Adults,142


In [15]:
# This allows us to retrieve the unique categorical values under the Source and Target columns
unique_source_target = list(pd.unique(list(sankey_main[['Source', 'Target']].values.ravel('K'))))

# We will use this for the mapping dictionary later
unique_source_target

['Pclass1',
 'Pclass2',
 'Pclass3',
 'Infants',
 'Children',
 'Adolescents',
 'Adults',
 'Senior',
 'Unknown',
 'Female',
 'Male',
 'Died',
 'Survived']

In [16]:
# This allows us to assign a unique value to each categorical value
mapping_dict = {k:v for v, k in enumerate(unique_source_target)}

mapping_dict

{'Pclass1': 0,
 'Pclass2': 1,
 'Pclass3': 2,
 'Infants': 3,
 'Children': 4,
 'Adolescents': 5,
 'Adults': 6,
 'Senior': 7,
 'Unknown': 8,
 'Female': 9,
 'Male': 10,
 'Died': 11,
 'Survived': 12}

In [17]:
# We will now apply the mapping dictionary back to the Source and Target column
sankey_main['Source'] = sankey_main['Source'].map(mapping_dict)
sankey_main['Target'] = sankey_main['Target'].map(mapping_dict)

sankey_main

Unnamed: 0,Source,Target,Value
0,0,3,1
1,0,4,3
2,0,5,12
3,0,6,166
4,0,7,4
5,0,8,30
6,1,3,5
7,1,4,12
8,1,5,12
9,1,6,142


In [18]:
sankey_main = sankey_main.to_dict(orient = 'list')

sankey_main

{'Source': [0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  3,
  3,
  4,
  4,
  5,
  5,
  6,
  6,
  7,
  7,
  8,
  8,
  9,
  9,
  10,
  10],
 'Target': [3,
  4,
  5,
  6,
  7,
  8,
  3,
  4,
  5,
  6,
  7,
  8,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  9,
  10,
  9,
  10,
  9,
  10,
  9,
  10,
  9,
  10,
  11,
  12,
  11,
  12],
 'Value': [1,
  3,
  12,
  166,
  4,
  30,
  5,
  12,
  12,
  142,
  2,
  11,
  8,
  40,
  46,
  259,
  2,
  136,
  4,
  10,
  28,
  27,
  36,
  34,
  193,
  374,
  0,
  8,
  53,
  124,
  81,
  233,
  468,
  109]}

As I am currently using Jupyterlite which doesn't have Plotly installed as a module. I am unable to build the Alluvial diagram here. All the data exploration and manipulation has been completed here. From this point onwards, I will be transferring the minimum required code to Sublime Text 3 in order to develop the chart there using Plotly.