In [1]:
import pandas as pd
import numpy as np

### data

In [2]:
raw = pd.read_csv('data/titanic_train.csv')

In [3]:
raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
raw.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [7]:
use_cols = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col in use_cols:
    print '%s: %s' %(col, str(raw[col].unique()))

Survived: [0 1]
Pclass: [3 1 2]
Sex: ['male' 'female']
SibSp: [1 0 3 4 2 5 8]
Parch: [0 1 2 5 3 4 6]
Embarked: ['S' 'C' 'Q' nan]


In [8]:
raw['Embarked'] = raw['Embarked'].fillna('unknown')

In [9]:
raw[use_cols].head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked
0,0,3,male,1,0,S
1,1,1,female,1,0,C
2,1,3,female,0,0,S
3,1,1,female,1,0,S
4,0,3,male,0,0,S


### generate color mapping

In [10]:
node_names = []
use_cols = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col in use_cols:
    col_values = raw[col].unique()
    for col_value in col_values:
        node_name = '%s_%s' %(col, str(col_value))
        node_names.append(node_name)

In [12]:
np.array(node_names)

array(['Survived_0', 'Survived_1', 'Pclass_3', 'Pclass_1', 'Pclass_2',
       'Sex_male', 'Sex_female', 'SibSp_1', 'SibSp_0', 'SibSp_3',
       'SibSp_4', 'SibSp_2', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1',
       'Parch_2', 'Parch_5', 'Parch_3', 'Parch_4', 'Parch_6',
       'Embarked_S', 'Embarked_C', 'Embarked_Q', 'Embarked_unknown'],
      dtype='|S16')

In [33]:
import matplotlib.pyplot as plt
import matplotlib

In [14]:
node_colors = {}
cm = plt.cm.get_cmap('Vega20')

for i in range(len(node_names)):
    node_colors[node_names[i]] = matplotlib.colors.rgb2hex(cm(i))



In [19]:
links = []
nodes = []
for i in range(len(use_cols)-1):
    source_col, target_col = use_cols[i], use_cols[i+1]
    temp_df = raw[[source_col, target_col]]
    temp_df['count'] = 1
    temp_df = temp_df.rename(columns={source_col: 'source', target_col: 'target'})
    temp_df_gp = temp_df.groupby(['source', 'target'], as_index=False).count()
    
    temp_df_gp['source'] = temp_df_gp['source'].apply(lambda x : '%s_%s' %(source_col, str(x)))
    temp_df_gp['target'] = temp_df_gp['target'].apply(lambda x : '%s_%s' %(target_col, str(x)))
    
    temp_df_gp['color_source'] = temp_df_gp['source'].apply(lambda x : node_colors[x])
    temp_df_gp['color_target'] = temp_df_gp['target'].apply(lambda x : node_colors[x])
    temp_df_gp['value'] = temp_df_gp['count'].map(str)
    
    links+= temp_df_gp[['source', 'target', 'value']].to_dict('records')

nodes = [{'name': n, 'color': c} for (n, c) in node_colors.items()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## render output

In [22]:
data = {
    'links': links,
    'nodes': nodes
}

In [31]:
import json
import jinja2

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

In [32]:
temp = open('sankey_path_template.html').read()
template = jinja2.Template(temp)

# generate output html
with open('sankey_path_test.html', 'wb') as fh:
    fh.write(template.render({'data': json.dumps(data)}))