## test sankey_flow visualization with Titanic dataset

In [1]:
import pandas as pd
import numpy as np

## read data

In [2]:
# load the Titanic training set; the path is relative to the working directory
raw = pd.read_csv(filepath_or_buffer='data/titanic_train.csv')

In [3]:
# preview the first five rows to check that the file was parsed as expected
raw.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# list every column name of the raw frame (displayed as a NumPy object array)
raw.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [5]:
# use these columns as layers of nodes
use_cols = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# show the distinct values of each selected column (the possible state values
# of that layer); this is also where missing values such as NaN become visible
for col in use_cols:
    # print() is called with one pre-formatted string so that this cell runs,
    # and prints exactly the same text, under both Python 2 and Python 3
    print('%s: %s' % (col, str(raw[col].unique())))

Survived: [0 1]
Pclass: [3 1 2]
Sex: ['male' 'female']
SibSp: [1 0 3 4 2 5 8]
Parch: [0 1 2 5 3 4 6]
Embarked: ['S' 'C' 'Q' nan]


In [6]:
# 'Embarked' contains NaN; replace it with an explicit 'unknown' label so those
# rows carry a regular state value instead of a missing one
raw['Embarked'] = raw['Embarked'].fillna(value='unknown')

## input for generate_sankey_flow
The function expects a pandas DataFrame in which each column represents a state.  
Take Titanic for example: the columns **['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']** are 6 different states. The unique values in each column are all the possible values of that state, and each row represents one unit and the path it flows along.  
To draw a proper sankey path, you need to calculate how many units flow from (state1, value1) to (state2, value2), how many units flow from (state2, value2) to (state3, value3), and so on, which is tedious to do by hand. Fortunately, this is handled automatically by **generate_sankey_flow**.

In [7]:
# preview only the columns that are passed on as the input frame
raw.loc[:, use_cols].head(n=5)

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked
0,0,3,male,1,0,S
1,1,1,female,1,0,C
2,1,3,female,0,0,S
3,1,1,female,1,0,S
4,0,3,male,0,0,S


## color strategy
**node_color_type**, default="col_val"  
Can be ['col', 'val', 'col_val', 'cus']
- 'col': each column has different color
- 'val': each unique value has different color (unique values through all columns)
- 'col_val': each unique value in each column has different color
- 'cus': custom; the user provides the node color mapping through **node_color_mapping**

**node_color_mapping**, default=None  
Customized color mapping.
- node_color_mapping = '#fff'  
All nodes have the same color
- node_color_mapping = dict()  

```python
node_color_mapping = {
    'type': 'col',
    'mapping': {
        column1: color1, column2: color2, ...
    }
}

node_color_mapping = {
    'type': 'val',
    'mapping': {
        value1: color1, value2: color2, ...
    }
}

node_color_mapping = {
    'type': 'col_val',
    'mapping': {
        column1: {value1: color1, value2: color2, ...},
        column2: {value1: color3, value2: color4, ...}
    }
}
```

**link_color_type**, default='source'  
Can be ['source', 'target', 'both', 'same']
- 'source': same color as the source node
- 'target': same color as the target node
- 'both': color from both target and source
- 'same': all links have same color

**link_color**, default=None  
Only required when `link_color_type="same"`

## start generating sankey flow!

In [8]:
import sys
# put the local 'src/' directory first on the module search path; the path is
# relative, so it depends on the directory the notebook is run from
sys.path.insert(0, 'src/')
# presumably resolved from 'src/' (project-local module) -- TODO confirm
import generate_sankey_flow

### use node_color_type

In [9]:
# built-in coloring: one color per column ('col'), links take the color of their
# source node; no custom mapping, color map or link color is supplied
generate_sankey_flow.draw_sankey_flow(
    df=raw[use_cols],
    graph_name='Titanic',
    width=1600,
    height=900,
    node_color_type='col',
    node_color_mapping=None,
    color_map=None,
    link_color_type='source',
    link_color=None
)

In [18]:
# change color_map: nodes are still colored per column, now from 'tab20', and
# every link gets the single color given in link_color
generate_sankey_flow.draw_sankey_flow(
    df=raw[use_cols],
    graph_name='Titanic_same',
    width=1600,
    height=900,
    node_color_type='col',
    node_color_mapping=None,
    color_map='tab20',
    link_color_type='same',
    link_color='#ccc'
)

### use customized node_color_type and provide node_color_mapping

In [15]:
# custom 'col' mapping: one fixed color for every column
column_colors = {
    'Survived': '#9e0142',
    'Pclass': '#d53e4f',
    'Sex': '#f46d43',
    'SibSp': '#fdae61',
    'Parch': '#fee08b',
    'Embarked': '#ffffbf',
}
node_color_mapping = {'type': 'col', 'mapping': column_colors}

# node_color_type='cus' selects the user-provided mapping built above
generate_sankey_flow.draw_sankey_flow(
    df=raw[use_cols],
    graph_name='Titanic_col',
    width=1600,
    height=900,
    node_color_type='cus',
    node_color_mapping=node_color_mapping,
    color_map=None,
    link_color_type='source',
    link_color=None
)

In [16]:
# custom 'val' mapping: one color per distinct value, regardless of column

# numeric values (they occur in Survived, Pclass, SibSp and Parch)
numeric_colors = {
    0: '#9e0142',
    1: '#d53e4f',
    2: '#f46d43',
    3: '#fdae61',
    4: '#fee08b',
    5: '#ffffbf',
    6: '#e6f598',
    8: '#abdda4',
}
# string labels from Sex and Embarked ('unknown' is the filled-in missing value)
label_colors = {
    'male': '#66c2a5',
    'female': '#3288bd',
    'S': '#5e4fa2',
    'C': '#9e0142',
    'Q': '#d53e4f',
    'unknown': '#f46d43',
}

# merge both groups into the single value -> color lookup
value_colors = dict(numeric_colors)
value_colors.update(label_colors)
node_color_mapping = {'type': 'val', 'mapping': value_colors}

# node_color_type='cus' selects the user-provided mapping built above
generate_sankey_flow.draw_sankey_flow(
    df=raw[use_cols],
    graph_name='Titanic_val',
    width=1600,
    height=900,
    node_color_type='cus',
    node_color_mapping=node_color_mapping,
    color_map=None,
    link_color_type='source',
    link_color=None
)

In [17]:
# custom 'col_val' mapping: every column gets its own value -> color lookup
per_column_colors = {}
per_column_colors['Survived'] = {0: '#9e0142', 1: '#5e4fa2'}
per_column_colors['Pclass'] = {1: '#d53e4f', 2: '#3288bd', 3: '#f46d43'}
per_column_colors['Sex'] = {'male': '#fdae61', 'female': '#66c2a5'}
per_column_colors['SibSp'] = {
    0: '#fee08b', 1: '#ffffbf', 2: '#e6f598', 3: '#abdda4',
    4: '#9e0142', 5: '#5e4fa2', 8: '#d53e4f',
}
per_column_colors['Parch'] = {
    0: '#f46d43', 1: '#fdae61', 2: '#fee08b', 3: '#ffffbf',
    4: '#e6f598', 5: '#abdda4', 6: '#66c2a5',
}
per_column_colors['Embarked'] = {
    'S': '#3288bd', 'C': '#9e0142', 'Q': '#d53e4f', 'unknown': '#f46d43',
}
node_color_mapping = {'type': 'col_val', 'mapping': per_column_colors}

# node_color_type='cus' selects the user-provided mapping built above
generate_sankey_flow.draw_sankey_flow(
    df=raw[use_cols],
    graph_name='Titanic_col_val',
    width=1600,
    height=900,
    node_color_type='cus',
    node_color_mapping=node_color_mapping,
    color_map=None,
    link_color_type='source',
    link_color=None
)