## Currently used to find individual station/terminals using hover tool for descriptive stats

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

###############################
# Load data
#
# Make an empty DefaultDict
# fill dictionary {year: dataframe}
# build hierarchal dataframe using dictionary
################################################

# Example of data location, uses year 2010
# https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/2010/NiceRide_station_2010.csv

ridership_dict = defaultdict()
station_dict = defaultdict()

for year in [2010 + x for x in range(8)] :

    ridership_dict[year] = pd.read_csv("https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/" \
                                 +str(year)+"/NiceRide_trip_history_"+str(year)+".csv")
    station_dict[year] = pd.read_csv("https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/" \
                                 +str(year)+"/NiceRide_station_"+str(year)+".csv")
    
NR_ridership = pd.concat(ridership_dict)
NR_station = pd.concat(station_dict)

In [2]:
def kmeans_builder(NR_station) :
    
    # For KMeans model building
    from sklearn.cluster import KMeans

    # For Feature Optimizations
    from sklearn.decomposition import PCA

    #########################################################################
    # This sections builds the clusters for each (2010-2017) years data
    # The code uses KMeans clustering with a K value of 6
    # Appends to the DF `NR_station` the cluster each station belongs to
    # into a new column `Cluster`
    ########################################################################

    NR_station['Cluster'] = int(0)

    for year in [2010 + x for x in range(8)] :
    
        distance_matrix = pd.DataFrame() # Reinstantiate the  Distance Matrix DataFrame for clean run
    
        # Read in the distance matrix for the particular year
        distance_matrix = pd.read_csv("https://raw.githubusercontent.com/SethDKelly/NiceRideMN/master/Nice_Ride_data/" \
                               +str(year)+"/distance_matrix_"+str(year)+".csv",index_col=0)
        
        # Optimize distance matrix to two primary x-y components
        pca = PCA(n_components=2).fit_transform(distance_matrix)
        
        # assert len(NR_station.loc[year, 'Cluster'].values) == 
        
        #Append to the DF `NR_station` the cluster each station belongs to in column `Cluster`
        NR_station.loc[year, 'Cluster'] =  KMeans(n_clusters=6, n_init=200).fit(pca).labels_
        
    return NR_station

In [3]:
def graph_builder(ride_df) :
    
    import networkx as nx

    ################################################################
    # Takes single arguements, ride_df
    # ride_df must have columns: 'Start_id', 'End_id', 'counts'
    # builds a networkX graph of type Graph()
    # returns the graph
    ################################################################

    assert 'counts' in ride_df.columns, "Column named `counts` must be in arg ride_df "
    assert 'End_id' in ride_df.columns, "Column named `End_id` must be in arg ride_df"
    assert 'Start_id' in ride_df.columns, "Column named `Start_id` must be in arg ride_df"


    graph = nx.from_pandas_edgelist(ride_df,\
                                    source = 'Start_id', \
                                    target = 'End_id', \
                                    edge_attr = 'counts', \
                                    create_using = nx.Graph())
    return graph

### Below should be calculated and saved, calculation on the fly is time intensive

In [4]:
def get_edge_coor(graph, _station) :
    
    import networkx as nx
    
    ################################################################
    # Takes two arguements, graph and _station
    # graph must be of type nx.Graph()
    # _station must have columns: 'Termina', 'Latitude', 'Longitude'
    #
    # function takes the edges in graph
    # finds the nodes locations:
    # node1(x, y) / node1(long/lat), node2(x, y) / node2(long/lat)
    # creates 3 different arrays: xs, ys, alpha
    # xs are the x values for node1/2, ys are the y values for node1/2
    # alpha is the `weight` of the node, to be used as edge alpha value
    # returns xs, ys, alpha
    ################################################################

    assert (type(graph) == nx.Graph),"arg graph must be of type Graph not of type DiGraph, MultiGraph, or MultiDiGraph"
    assert 'Terminal' in _station.columns, "Column named Terminal must be in arg DataFrame"
    assert 'Longitude' in _station.columns, "Column named Longitude must be in arg DataFrame"
    assert 'Latitude' in _station.columns, "Column named Latitude must be in arg DataFrame"

    xs = []
    ys = []
    alpha = []
    edge = dict(xs=[], ys=[], alpha=[])
    # example: { ..., ('30001.0', '300005.0', {'counts': 243}), ... }
    # u is origin_node, v is terminus_node, d is data ('counts')
    
    for u, v, d in graph.edges.data('counts', default = 0):
        
        edge['xs'].append([_station[_station.Terminal == u].Longitude.values[0], \
                  _station[_station.Terminal == v].Longitude.values[0]])

        edge['ys'].append([_station[_station.Terminal == u].Latitude.values[0], \
                  _station[_station.Terminal == v].Latitude.values[0]])
        edge['alpha'].append(d) # rescale for later alpha normalization
        
    # create values of alpha that are between 0 and 1
    edge['alpha'] = [a / max(edge['alpha']) for a in edge['alpha']]
    
    return edge

In [19]:
ride_counts = pd.DataFrame.from_csv("/home/grimoire/Projects/NiceRide/Nice_Ride_data/2017/NiceRide_ride_count_2017.csv")
ride_counts = ride_counts[ride_counts.counts != 1]
graph = graph_builder(ride_counts)
edge = get_edge_coor(graph, NR_station)

  """Entry point for launching an IPython kernel.


In [20]:
edge

{'alpha': [0.19967030702658151,
  0.0016484648670925201,
  0.0010302905419328251,
  0.014630125695446116,
  0.010096847310941686,
  0.013805893261899856,
  0.016484648670925202,
  0.0014424067587059551,
  0.027611786523799711,
  0.0030908716257984753,
  0.039563156810220483,
  0.0063878013599835155,
  0.0049453946012775604,
  0.0010302905419328251,
  0.019163404079950545,
  0.0041211621677313005,
  0.0012363486503193901,
  0.0014424067587059551,
  0.0037090459509581701,
  0.026169379765093755,
  0.0010302905419328251,
  0.00061817432515969505,
  0.01359983515351329,
  0.0061817432515969507,
  0.0028848135174119102,
  0.0082423243354626009,
  0.018545229754790851,
  0.018751287863177417,
  0.033999587883783225,
  0.0028848135174119102,
  0.011333195961261075,
  0.0051514527096641251,
  0.014218009478672985,
  0.0028848135174119102,
  0.00041211621677313001,
  0.012157428394807336,
  0.019575520296723676,
  0.0051514527096641251,
  0.023078508139295281,
  0.00061817432515969505,
  0.0020

In [29]:
def main() :

    # Perform necessary imports for Bokeh plotting

    from bokeh.io import output_file, show, curdoc
    from bokeh.plotting import figure

    # Import the models modules
    from bokeh.models import ColumnDataSource, HoverTool, Slider, ColorMapper
    from bokeh.models.widgets import CheckboxGroup

    # Import the layout modules
    from bokeh.layouts import widgetbox, column, row

    # To build graph as an application
    from bokeh.io import curdoc

    # For cluster coloring
    from bokeh.palettes import Spectral6
    
    #########################################
    # Section to instantiate graph properties
    #########################################
    year = 2017
    # File Name
    output_file(str(year))

    # Creating hovertool tip
    hover = HoverTool(tooltips=[('Terminal', '@terminal'), 
                           ('Name', '@station'),
                           ('Location', '($x, $y)')])
    
    # Setting boundaries for x-y axis ranges
    xmin, xmax = (NR_station.loc[:,:].Longitude.values.min() - .01) , (NR_station.loc[:,:].Longitude.values.max() + .01)
    ymin, ymax = (NR_station.loc[:,:].Latitude.values.min() - .01) , (NR_station.loc[:,:].Latitude.values.max() + .01)
    
    # Create the figure: plot
    plot = figure(plot_height=750, plot_width=1000,
              x_range = (xmin, xmax), y_range = (ymin, ymax),
              tools=[hover, 'box_zoom', 'reset', 'wheel_zoom', 'pan', 'lasso_select'])

    # Set the x/y-axis label
    plot.xaxis.axis_label = 'Longitude'
    plot.yaxis.axis_label = 'Latitude'

    ###############################
    # Add Node Data
    ###############################
    
    # Make the ColumnDataSource: source
    node_source = ColumnDataSource(data={
        'x'        : NR_station.loc[year,:].Longitude,
        'y'        : NR_station.loc[year,:].Latitude,
        'terminal' : NR_station.loc[year,:].Terminal,
        'station'  : NR_station.loc[year,:].Station,
        'cluster'  : NR_station.loc[year,:].Cluster
        })

    # Add the nodes to the circle glyph
    r_circles = plot.circle(x='x', y='y', source=node_source,
                fill_alpha=0.8, legend='cluster', size = 10) # Add color mapper by cluster, add node size by

    # Set the legend.location attribute of the plot to 'top_right'
    plot.legend.location = 'top_right'

    ###############################
    # build NetworkX graph model
    # and add NetworkX edges to graph
    ###############################

    ride_counts = pd.DataFrame.from_csv("/home/grimoire/Projects/NiceRide/Nice_Ride_data/"+str(year)+
                                        "/NiceRide_ride_count_"+str(year)+".csv")
    ride_counts = ride_counts[ride_counts.counts != 1]
    
    graph = graph_builder(ride_counts)
    
    edge_source = ColumnDataSource(data = get_edge_coor(graph, NR_station.loc[year,:]))
    r_lines = plot.multi_line(xs = 'xs',ys = 'ys', source=edge_source,
                          line_width=1.5, alpha=.06, color='navy')

    ########################################
    # Graph Node and Edge interaction policy TEMPLATE
    ########################################

    '''
    graph_renderer.node_renderer.glyph = Circle(size=15, fill_color=Spectral4[0])
    graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
    graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

    graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)
    graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
    graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)

    graph_renderer.selection_policy = NodesAndLinkedEdges()
    graph_renderer.inspection_policy = EdgesAndLinkedNodes()
    '''
    ######################################################
    # Section is to set up cluster checkboxes
    # Allow Users to choose which clusters to show
    ######################################################
    '''
    checkbox_group = CheckboxGroup(
            labels=["Cluster 1", "Cluster 2", "Cluster 3"
                    "Cluster 4", "Cluster 5", "Cluster 6"], active=[0,1,2,3,4,5])
    Need to figure out how to change active cluster based on                 
                
    '''
    ######################################################
    # Section is to setup how the plot and widgets are displayed
    ######################################################
    '''
    # Make a row layout of widgetbox(slider) and plot and add it to the current document
    layout = column(plot, widgetbox(slider))
    curdoc().add_root(layout)


    # Add the plot to the current document and add a title
    curdoc().add_root(plot)
    curdoc().title = 'Nice Ride stations'
    '''
    show(plot)

In [30]:
NR_station = kmeans_builder(NR_station)
main()

