In [1]:
# Base dependancies:
import pandas as pd
import numpy as np
from collections import defaultdict

################################################
# Make an empty DefaultDict
# fill dictionary {year: dataframe}
# build hierarchal dataframe using dictionary
################################################

# Example of data location, uses year 2010
# https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/2010/NiceRide_station_2010.csv

station_dict = defaultdict()
ridership_dict = defaultdict()

for year in [2010 + x for x in range(8)] :
    station_dict[year] = pd.read_csv("https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/" \
                             +str(year)+"/NiceRide_station_"+str(year)+".csv")

    ridership_dict[year] = pd.read_csv("https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/" \
                             +str(year)+"/NiceRide_trip_history_"+str(year)+".csv")

NR_station = pd.concat(station_dict)
NR_ridership = pd.concat(ridership_dict)

In [2]:
# For KMeans model building
from sklearn.cluster import KMeans

# For Feature Optimizations
from sklearn.decomposition import PCA

#########################################################################
# This sections builds the clusters for each (2010-2017) years data
# The code uses KMeans clustering with a K value of 6
# Appends to the DF `NR_station` the cluster each station belongs to
# into a new column `Cluster`
########################################################################

NR_station['Cluster'] = int(0)

for year in [2010 + x for x in range(8)] :
    
    distance_matrix = pd.DataFrame() # Reinstantiate the  Distance Matrix DataFrame for clean run
    
    # Read in the distance matrix for the particular year
    distance_matrix = pd.read_csv("https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/" \
                               +str(year)+"/distance_matrix_"+str(year)+".csv",index_col=0)
    
    # Optimize distance matrix to two primary x-y components
    pca = PCA(n_components=2).fit_transform(distance_matrix)
    
    #Append to the DF `NR_station` the cluster each station belongs to in column `Cluster`
    NR_station.loc[year, 'Cluster'] =  KMeans(n_clusters=6, n_init=200).fit(pca).labels_

In [14]:
# Perform necessary imports for Bokeh plotting
from bokeh.io import output_file, show, curdoc
from bokeh.plotting import figure

# Import the models modules
from bokeh.models import ColumnDataSource, HoverTool, Slider, ColorMapper

# Import the layout modules
from bokeh.layouts import widgetbox, column

# To build graph as an application
from bokeh.io import curdoc

# For cluster coloring
from bokeh.palettes import Spectral6

In [29]:
####################################
# TESTING AREA
# HARD HAT REQUIRE
####################################

import networkx
from math import sqrt
network = networkx.read_gml('ep2016.gml')

layout = networkx.spring_layout(network,
                                k=1.1/sqrt(network.number_of_nodes()),
                                iterations=100)
# https://en.wikipedia.org/wiki/Force-directed_graph_drawing

FileNotFoundError: [Errno 2] No such file or directory: 'ep2016.gml'