In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
import csv
import sys
import os
import re
%matplotlib inline

<a id='0'></a>
# DASHBOARD FOR SINGLE STATION


### Part 0 Related Functions

### [Part 1 Visualization Process](#1)
- 1.0 Navigate to the folder created by K_means_package in MRT_K-means_Analysis.ipynb
- 1.1 Create the station dashboards in a newly created folder, 'single_plot'
- 1.2 Create sub folders grouped by entrance-exit cluster type


In [2]:

def station_eng_name():
    """Build a lookup from the first to the fourth column of the station CSV.

    Reads ``../station_name.csv`` (resolved against the current working
    directory) and maps column 0 of every row to column 3 of the same row.
    Judging by the function name, column 0 is presumably the station code and
    column 3 the English station name -- confirm against the CSV.

    The header row is not treated specially, so it ends up in the mapping
    like any other row. Rows with fewer than four columns (for example blank
    lines) are skipped instead of raising ``IndexError``.

    Returns:
        dict: ``{row[0]: row[3]}`` for every sufficiently long row.
    """
    file_ID = '../station_name.csv'
    # 'utf-8-sig' drops a leading byte-order mark if the file has one, so the
    # first key is not polluted with '\ufeff'; files without a BOM read the same.
    # newline='' is what the csv module asks for when given a file object.
    with open(file_ID, 'r', encoding='utf-8-sig', newline='') as fid:
        return {row[0]: row[3] for row in csv.reader(fid) if len(row) > 3}

def df_reformat_for_line(df):
    """Return a copy of ``df`` padded to 24 columns per day for line plots.

    For each of the 7 days, three zero-filled columns are inserted at column
    position ``24 * day + 2`` (the slots for hours 2, 3 and 4). Afterwards the
    columns are relabelled ``0 .. 167`` followed by ``'cluster'``, so the
    padded frame must end up with exactly 169 columns (i.e. ``df`` needs 148);
    otherwise pandas rejects the new column labels.

    The input frame is left untouched.
    """
    padded = df.copy()
    zero_column = [0] * df.shape[0]

    for day in range(7):            # Monday .. Sunday
        insert_at = 24 * day + 2    # hours 0 and 1 stay in front of the gap
        for slot in range(3):       # one column each for hours 2, 3, 4
            padded.insert(loc=insert_at,
                          column='{}_{}'.format(day, slot),
                          value=zero_column)

    padded.columns = list(range(168)) + ['cluster']
    return padded

def line_graph_xlabel():
    """Return the x positions ``0 .. 167`` (one per hour of a 7-day week)."""
    return list(range(168))

def cluster_mean_dict(df):
    """Map every value of ``df['cluster']`` to the column means of its rows.

    The last column of ``df`` is excluded from the mean (it is expected to be
    the cluster label itself).

    Returns:
        dict: ``{cluster_label: Series of per-column means}``.
    """
    labels = df['cluster']
    return {
        label: df[labels == label].iloc[:, :-1].mean()  # drop trailing label column
        for label in labels.unique()
    }
            
def cluster_group_dict_list_format(df):
    """Group the rows of ``df`` by the value in their last column.

    Args:
        df: DataFrame whose index identifies a station and whose LAST column
            holds the cluster label; all preceding columns are the data.

    Returns:
        dict: ``{cluster_label: [{station: Series of data columns}, ...]}``,
        with stations listed in the row order of ``df``.
    """
    group_dict = {}
    for station, row in df.iterrows():
        # Use .iloc for positional access: `row[-1]` is a *label* lookup when
        # the column labels are integers (as produced by df_reformat_for_line)
        # and positional fallback in Series.__getitem__ is deprecated in pandas.
        station_group = row.iloc[-1]
        group_dict.setdefault(station_group, []).append({station: row.iloc[:-1]})
    return group_dict

def single_station_data(station_code,df_in,df_out):
    """Fetch one station's row from the IN frame and from the OUT frame.

    The rows are looked up by the index labels ``'<station_code> IN'`` in
    ``df_in`` and ``'<station_code> OUT'`` in ``df_out``; a missing label
    raises ``KeyError``.

    Returns:
        dict: ``{'IN': row from df_in, 'OUT': row from df_out}``.
    """
    return {
        'IN': df_in.loc['{} IN'.format(station_code)],
        'OUT': df_out.loc['{} OUT'.format(station_code)],
    }

def single_station_heatmap_data(single_station_data):
    """Reshape each station series into a 7-day x 21-hour DataFrame.

    Args:
        single_station_data: mapping of flow name (e.g. ``'IN'``/``'OUT'``) to
            a sequence whose first 147 entries are 7 consecutive blocks of 21
            values. Anything after those 147 entries (in this notebook: the
            trailing cluster label) is ignored.

    Returns:
        dict: ``{flow: DataFrame}`` with rows ``'Mon' .. 'Sun'`` and columns
        ``0, 1, 5, 6, ..., 23`` (hours 2-4 are absent from the data).

    Raises:
        ValueError: if a sequence holds fewer than 147 values.
    """
    day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    hour_labels = [0, 1] + list(range(5, 24))
    per_day = len(hour_labels)                 # 21 recorded hours per day
    expected = per_day * len(day_labels)       # 147 values per week

    result_dict = {}
    for flow, series in single_station_data.items():
        values = list(series)
        if len(values) < expected:
            raise ValueError(
                '{}: expected at least {} values, got {}'.format(flow, expected, len(values)))
        # Slice explicitly instead of relying on a trailing element to flush
        # the last day's chunk; this also accepts input without a cluster label.
        rows = [values[start:start + per_day] for start in range(0, expected, per_day)]
        result_dict[flow] = pd.DataFrame(rows, columns=hour_labels, index=day_labels)
    return result_dict


def heatmap_single_station(single_station_heatmap_data, station_code, ax_in=None, ax_out=None):
    """Draw the IN and OUT weekly heatmaps of one station.

    Args:
        single_station_heatmap_data: mapping with keys ``'IN'`` and ``'OUT'``,
            each a DataFrame (as built by ``single_station_heatmap_data``).
            Values are multiplied by 100 and drawn on a fixed 0-100 colour
            scale -- presumably they are min-max normalised fractions, as the
            plot title suggests.
        station_code: station identifier used in the titles and to look up
            the station name via ``station_eng_name()``.
        ax_in, ax_out: axes to draw on. When omitted, the notebook-level
            globals ``ax1`` and ``ax2`` are used, which keeps existing
            two-argument calls working.

    An unknown ``station_code`` no longer fails with an unbound local
    variable; the name part of the title is simply left empty.
    """
    station_name = station_eng_name().get(station_code, '')

    # Fall back to the axes created by the calling notebook cell.
    if ax_in is None:
        ax_in = ax1
    if ax_out is None:
        ax_out = ax2

    panels = (('IN', ax_in, 'Blues'), ('OUT', ax_out, 'Reds'))
    for flow, ax, cmap in panels:
        sns.heatmap(single_station_heatmap_data[flow] * 100,
                    ax=ax,
                    center=None,
                    cmap=cmap,
                    vmin=0,
                    vmax=100,
                    annot=True,
                    fmt='g')
        ax.set_title('{} {} {} (by min max %)'.format(station_code, flow, station_name))
        ax.xaxis.tick_top()   # hour labels above the heatmap
        ax.set_xlabel('Time of Day')

def lineGraph_single_station(result_of_single_staion_data, df_in, df_out, station_code,
                             ax_in=None, ax_out=None):
    """Plot one station's weekly IN and OUT curves against its cluster.

    For each flow the station's curve is drawn in colour on top of the curves
    of all other stations in the same cluster (light grey) and the cluster
    mean (black, dash-dotted). Dashed vertical lines mark day boundaries and
    noon of every day.

    Args:
        result_of_single_staion_data: mapping ``{'IN': Series, 'OUT': Series}``
            for the station; the LAST element of each Series is its cluster.
        df_in, df_out: full IN / OUT frames (all stations) in the layout
            accepted by ``df_reformat_for_line``.
        station_code: station identifier; rows are matched by the labels
            ``'<station_code> IN'`` / ``'<station_code> OUT'``.
        ax_in, ax_out: axes to draw on. When omitted, the notebook-level
            globals ``ax3`` and ``ax4`` are used, which keeps existing
            four-argument calls working.

    Raises:
        KeyError: if the station's cluster or the station itself cannot be
            found in the reformatted frame.
    """
    # Fall back to the axes created by the calling notebook cell.
    if ax_in is None:
        ax_in = ax3
    if ax_out is None:
        ax_out = ax4

    # An unknown code yields an empty name instead of an unbound variable.
    station_name = station_eng_name().get(station_code, '')
    x_labels_single = line_graph_xlabel()
    weekday = ['Mon Noon','Tue Noon','Wed Noon','Thu Noon','Fri Noon','Sat Noon','Sun Noon']

    panels = (('IN', df_in, ax_in, 'Blue'), ('OUT', df_out, ax_out, 'Red'))
    for flow, df, ax, color in panels:
        station_key = '{} {}'.format(station_code, flow)

        # 1. basic information
        # 1.1 cluster of the station; .iloc because positional access through
        #     Series.__getitem__ is deprecated in pandas
        station_cluster = result_of_single_staion_data[flow].iloc[-1]
        # 1.2 add the missing hours (2, 3, 4) to the original data
        re_format = df_reformat_for_line(df)
        # 1.3 {cluster: [{station: data}, {station: data}, ...]}
        line_group_dict = cluster_group_dict_list_format(re_format)
        # cluster average data
        average_data = cluster_mean_dict(re_format)[station_cluster]

        # 2. other stations of the cluster in grey; remember this station's curve
        main_data = None
        for dict_ in line_group_dict[station_cluster]:
            for name, series in dict_.items():
                if name == station_key:
                    main_data = series
                else:
                    ax.plot(x_labels_single, series, color='#d9d9d9', label='_nolegend_')
        if main_data is None:
            raise KeyError('{} not found in cluster {}'.format(station_key, station_cluster))
        ax.set_title('{} {}, CLuster Group: {}'.format(station_key, station_name, station_cluster),
                     y=0.95)

        # 3. cluster average and the station itself on top
        ax.plot(x_labels_single, average_data, color='Black', label='cluster avg.', linestyle='-.')
        ax.plot(x_labels_single, main_data, color=color)

        # 4. vertical lines at the start of every day (168 = 24 * 7) with a day label
        for day_start, label in zip(range(0, 168, 24), weekday):
            ax.axvline(x=day_start, color='k', linestyle='--', linewidth=1.2)
            ax.text(day_start + 8, 0, label, color=(0.3,0.3,0.3))
        # last vertical line on the right side
        ax.axvline(x=167, color='k', linestyle='--', linewidth=1)

        # noon vertical lines
        for noon in range(12, 168, 24):
            ax.axvline(x=noon, color=(0.45,0.45,0.45), linestyle='--', linewidth=1)
        ax.legend()

        # turn off x ticks
        ax.tick_params(
                axis='x',          # changes apply to the x-axis
                which='both',      # both major and minor ticks are affected
                bottom=False,      # ticks along the bottom edge are off
                top=False,         # ticks along the top edge are off
                labelbottom=False)

        
def single_plot_packup(df_cluster,saved_folder):
    """Sort the station plots in ``saved_folder`` into one subfolder per cluster type.

    Args:
        df_cluster: DataFrame indexed by station; its LAST column holds the
            cluster type used as the subfolder name.
        saved_folder: folder (relative to the current working directory, or
            absolute) that contains the ``'<station> Plot.png'`` files.

    For every distinct cluster type a subfolder is created if missing, and
    each existing ``'<station> Plot.png'`` is moved into the subfolder of its
    station's cluster type. Stations without a plot file are skipped.

    The working directory is never changed, so a failure part-way through
    cannot leave the notebook in a different directory, and nested
    ``saved_folder`` paths work as well.
    """
    base_dir = os.path.join('.', saved_folder)

    # {cluster type: [station, ...]}; .iloc avoids deprecated positional row[-1]
    final_group_dict = {}
    for station, cluster_type in df_cluster.iloc[:, -1].items():
        final_group_dict.setdefault(cluster_type, []).append(station)

    for key, stations in final_group_dict.items():
        target_dir = os.path.join(base_dir, str(key))
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
            print("Directory " , key ,  " folder created ")
        else:
            print("Directory " , key ,  " already exists")

        for station in stations:
            file_name = '{} Plot.png'.format(station)
            source = os.path.join(base_dir, file_name)
            # Direct existence check instead of rescanning the folder per station.
            if os.path.exists(source):
                os.rename(source, os.path.join(target_dir, file_name))

<a id='1'></a>
### [Part 1 Visualization Process](#0)

- 1.0 Navigate to the folder created by K_means_package in MRT_K-means_Analysis.ipynb
- 1.1 Create the station dashboards in a newly created folder, 'single_plot'
- 1.2 Create sub folders grouped by entrance-exit cluster type


In [3]:
folder_by_analysis = 'notebook_illustration'

# NOTE: this changes the kernel's working directory. Everything below (and
# station_eng_name(), which reads '../station_name.csv') resolves paths
# relative to it, so re-running this cell without restarting the kernel
# will fail on the chdir.
os.chdir(folder_by_analysis)
single_plot_folder = 'single_plot'
k_pack_in = pd.read_csv('k_pack_IN.csv',index_col=0)
k_pack_out = pd.read_csv('k_pack_OUT.csv',index_col=0)


code_name_dict = station_eng_name()
#create folder for single station plots
if not os.path.exists(single_plot_folder):
    os.mkdir(single_plot_folder)
    print("{} Directory created ".format(single_plot_folder))
else:
    print("Directory " , single_plot_folder ,  " already exists")


for key in code_name_dict.keys():
    sys.stdout.write('\rprocess {}'.format(key))

    station_code = key
    f = None
    try:
        # Raises KeyError when the station has no row in the IN/OUT frames.
        dict_ = single_station_data(station_code,k_pack_in,k_pack_out)
        heatmap_ = single_station_heatmap_data(dict_)

        #https://stackoverflow.com/questions/31671999/python-different-size-subplots-in-matplotlib?rq=1
        f = plt.figure(figsize = (25,15))

        # Two heatmaps on the top row, one full-width line chart on each row below.
        # ax1..ax4 must stay module-level names: the plotting helpers read them.
        gs = GridSpec(3,2)
        ax1 = f.add_subplot(gs[0,0])
        ax2 = f.add_subplot(gs[0,1])
        ax3 = f.add_subplot(gs[1,:])
        ax4 = f.add_subplot(gs[2,:])

        heatmap_single_station(heatmap_,station_code)
        lineGraph_single_station(dict_,k_pack_in,k_pack_out,station_code)

        # Save through the figure object rather than the implicit current figure.
        f.savefig('{}/{} Plot.png'.format(single_plot_folder,station_code))

    except KeyError:
        sys.stdout.write('\nskip the station without data: {}\n'.format(station_code))

    finally:
        # Always release the figure, including when a KeyError is raised after
        # it was created; otherwise open figures pile up across stations.
        if f is not None:
            plt.close(f)

single_plot Directory created 
process ﻿ID
skip the station without data: ﻿ID
process BR24
skip the station without data: BR24
process BR11
skip the station without data: BR11
process BR10
skip the station without data: BR10
process BR09
skip the station without data: BR09
process R13A
skip the station without data: R13
process R11
skip the station without data: R11
process R10
skip the station without data: R10
process R08
skip the station without data: R08
process R07
skip the station without data: R07
process R01
skip the station without data: R01
process G15
skip the station without data: G15
process G12
skip the station without data: G12
process G09
skip the station without data: G09
process O07A
skip the station without data: O07
process BL01

In [4]:
# 'df_cluster.csv' is resolved against the current working directory, which the
# previous cell changed to `folder_by_analysis`.
df_final = pd.read_csv('df_cluster.csv',index_col=0)
# Move every '<station> Plot.png' in `single_plot_folder` into a subfolder named
# after the value in the last column of df_final for that station.
single_plot_packup(df_final,single_plot_folder)

Directory  0_3  folder created 
Directory  0_1  folder created 
Directory  3_3  folder created 
Directory  1_2  folder created 
Directory  2_0  folder created 
Directory  1_0  folder created 
Directory  3_0  folder created 
Directory  3_2  folder created 
Directory  1_3  folder created 
