In [None]:
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import requests
import plotly.express as px
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter
np.set_printoptions(suppress=True)
from itertools import combinations
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', None)
from scipy.spatial.distance import squareform, pdist
pd.options.mode.chained_assignment = None

In [None]:
# Load the trip records, then keep one row per distinct
# (station name, latitude, longitude) combination for geographic clustering.
df = pd.read_csv('../8Jul2020-14Jul2020.csv')
station_columns = ['StartStation Name', 'StartStation lat', 'StartStation long']
cluster = df[station_columns].drop_duplicates(keep='first')
cluster.head()

In [None]:
# Elbow-method inputs: fit k-means for k = 1..10 on the station coordinates
# and record each fitted model's inertia (within-cluster sum of squares).
X = cluster[['StartStation lat', 'StartStation long']].values  # columns used for G-clustering
wcss = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(X).inertia_
    for k in range(1, 11)
]

In [None]:
# Geographic clustering (G-clustering) of the stations by latitude/longitude.
n_clusters_ = 10

# A fixed seed keeps the cluster ids stable across kernel restarts; without it
# every run could number (and split) the clusters differently.
kmeans = KMeans(n_clusters=n_clusters_, init='k-means++', random_state=42)

# fit_predict() fits the model and returns the labels in one pass, so a
# separate fit() call beforehand only repeated the same work.
labels = kmeans.fit_predict(X)

# Coordinates of cluster centers.
centers = kmeans.cluster_centers_

In [None]:
# Attach each station's geographic cluster id to the station table, then
# propagate it to every trip through the trip's start station name.
cluster['clusters'] = labels
cluster = cluster.set_index('StartStation Name', drop=False)
cluster_map = cluster['clusters'].to_dict()

start_station_names = df['StartStation Name']
df['Cluster'] = start_station_names.map(cluster_map)
df.head()

In [None]:
# Number of trips (non-null 'Duration' rows) per station:
# temp1 is keyed by start station, temp2 by end station.
temp1, temp2 = (
    df.groupby([station_column])[['Duration']].count()
    for station_column in ('StartStation Name', 'EndStation Name')
)

In [None]:
# Station name -> number of trips starting there / ending there.
temp1_map = temp1['Duration'].to_dict()
temp2_map = temp2['Duration'].to_dict()

# For both ends of every trip, look up how many trips leave and arrive at
# that station.
for endpoint in ('StartStation', 'EndStation'):
    endpoint_names = df['{} Name'.format(endpoint)]
    df['{} Outgoing Bike'.format(endpoint)] = endpoint_names.map(temp1_map)
    df['{} Incoming Bike'.format(endpoint)] = endpoint_names.map(temp2_map)

df.head()

In [None]:
#Inspired by: https://github.com/noernimat/k-means_clustering_model.git

# Scatter the stations coloured by geographic cluster and mark every cluster
# centre with a large triangle annotated with the cluster id.
plt.rcParams['figure.figsize'] = [16,13]

unique_labels = set(labels)
colors = [plt.cm.Spectral(fraction) for fraction in np.linspace(0, 1, len(unique_labels))]

fig, ax = plt.subplots()
for cluster_id, rgba in zip(unique_labels, colors):
    # A label of -1 is drawn in opaque black instead of its colormap colour.
    face_colour = (0, 0, 0, 1) if cluster_id == -1 else tuple(rgba)

    members = X[labels == cluster_id]
    centre_x, centre_y = centers[cluster_id, 0], centers[cluster_id, 1]

    ax.plot(members[:, 0], members[:, 1], 'o', markerfacecolor=face_colour,
            markeredgecolor='k', markersize=5)
    ax.plot(centre_x, centre_y, '^', markerfacecolor=face_colour, markersize=18)
    ax.text(centre_x, centre_y, cluster_id, fontsize=12)
plt.show()

In [None]:
# Traffic clustering (T-clustering): inside every geographic cluster, split the
# trips into n_clusters_ groups by bike flow and give each group a Region id
# that is unique across the whole data set.
n_clusters_ = 3
grouped = df.groupby(['Cluster'])

li = []

for i, (name, group) in enumerate(grouped):
    # Work on an explicit copy so adding 'Region' never writes into a slice of df.
    temp = group.copy()
    X = temp[['StartStation Outgoing Bike', 'EndStation Incoming Bike']].values #select columns for T-clustering

    # Fixed seed -> reproducible Region ids; fit_predict() already fits the model.
    kmeans = KMeans(n_clusters = n_clusters_, init ='k-means++', random_state = 42)
    labels = kmeans.fit_predict(X)

    # Offset by n_clusters_ rather than a literal 3: with a hard-coded 3 the ids
    # of neighbouring clusters overlap as soon as n_clusters_ is raised above 3.
    temp['Region'] = labels + i * n_clusters_
    li.append(temp)

In [None]:
# Stitch the per-cluster frames back together and restore index order.
final_df = pd.concat(li).sort_index()
final_df.head()

In [None]:
# Start-station coordinates of every trip, coloured by the trip's Region id.
region_scatter_axes = dict(x="StartStation lat", y="StartStation long", color="Region")
fig = px.scatter(final_df, **region_scatter_axes)
fig.show()

In [None]:
# Per-station lookup tables keyed by start station name:
# var1 -> geographic cluster, var2 -> latitude, var3 -> longitude.
lookup_columns = ['StartStation Name', 'Cluster', 'StartStation lat', 'StartStation long']
temp_df = final_df[lookup_columns].set_index(['StartStation Name'])

var1, var2, var3 = (
    temp_df[column].to_dict()
    for column in ('Cluster', 'StartStation lat', 'StartStation long')
)

In [None]:
# Count trips per (Region, Cluster, start station, end station) route and keep
# only the routes that were ridden at least 5 times.
route_keys = ['Region', 'Cluster', 'StartStation Name', 'EndStation Name']
temp = (
    final_df.groupby(route_keys)[['Duration']]
    .count()
    .rename(columns={'Duration': 'Count'})
    .reset_index(drop=False)
)
temp = temp[temp['Count'] >= 5] #Bike flow threshold (w)
temp.head()

In [None]:
# Look up the END station's cluster and coordinates. The coordinate columns
# are named after the end station because that is the station being looked up;
# they were previously stored under 'StartStation lat' / 'StartStation long',
# which mislabelled them.
end_station_names = temp['EndStation Name']
temp['End Station Cluster'] = end_station_names.map(var1)
temp['EndStation lat'] = end_station_names.map(var2)
temp['EndStation long'] = end_station_names.map(var3)

# Keep only routes whose two ends lie in different geographic clusters.
temp = temp[temp['Cluster'] != temp['End Station Cluster']].reset_index(drop=True)
temp.head()

In [None]:
# Drawing positions for the route graph: start station name -> (lat, long).
# When a name occurs on several rows, the last row's coordinates are kept.
temp3 = df['StartStation Name'].values
temp4 = df[['StartStation lat', 'StartStation long']].apply(tuple, axis=1).values

pos = dict(zip(temp3, temp4))

In [None]:
# Edge list (start station, end station) and trip count of every kept route.
route_endpoints = temp[['StartStation Name', 'EndStation Name']]
X = route_endpoints.to_numpy()
y = temp['Count'].to_numpy()

# (start, end) tuple -> trip count, used to annotate the drawn edges.
temp1 = route_endpoints.apply(tuple, axis=1).values
temp2 = temp['Count'].values

edge_labels = dict(zip(temp1, temp2))

In [None]:
# Undirected graph with one edge per kept route, drawn at the station
# coordinates and annotated with the trip counts.
G = nx.Graph() #graphical model of London bike network
G.add_edges_from(X)

fig, ax = plt.subplots(figsize=(35, 40))
nx.draw_networkx(G, pos=pos, node_color='green', ax=ax)
nx.draw_networkx_edge_labels(
    G,
    pos=pos,
    edge_labels=edge_labels,
    font_color='red',
    ax=ax,
)

ax.set_title("Bike Routes")
plt.show()

In [None]:
# For every station, count how many maximal cliques of G it belongs to.
clique_dict = Counter()
for clique in nx.find_cliques(G):
    clique_dict.update(clique)

# One row per station: its name and its clique count.
temp2 = (
    pd.DataFrame.from_dict(clique_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'StartStation Name', 0: 'Clique'})
)
temp2.head()

In [None]:
# Split the trips by whether their start station appears in any clique.
clique_station_list = temp2['StartStation Name'].values
starts_at_clique_station = final_df['StartStation Name'].isin(clique_station_list)

clique_df = final_df[starts_at_clique_station]
non_clique_df = final_df[~starts_at_clique_station]

In [None]:
# Boolean flag: True when the trip's start station appears in any clique.
final_df['clique_stations'] = final_df['StartStation Name'].isin(clique_station_list).to_numpy()
final_df.head()

In [None]:
# Trips whose start station appears in any clique.
final_df.loc[final_df['StartStation Name'].isin(clique_station_list)]

In [None]:
# Minimum-spanning-tree (MST) length of the non-clique stations, per Region.
regions_dict = {}

number_of_nonclique_region = sorted(non_clique_df['Region'].unique())

for i in number_of_nonclique_region:
    temp_df = non_clique_df[non_clique_df['Region'] == i]
    # One coordinate row per station (first occurrence within the Region).
    temp = temp_df.groupby(['StartStation Name']).first()[['StartStation lat', 'StartStation long']]
    stations = temp.index

    # Pairwise Euclidean distances on the raw (lat, long) values.
    distances = squareform(pdist(temp.values))
    eucd_distance = pd.DataFrame(distances, columns=stations, index=stations)

    # Complete graph whose edges CARRY the distances. The edges used to be added
    # without a weight, so networkx treated every edge as weight 1 and
    # mst.size(weight="weight") returned the edge count instead of a distance.
    G = nx.Graph()
    G.add_nodes_from(stations)  # keeps single-station Regions in the graph
    G.add_weighted_edges_from(
        (stations[a], stations[b], distances[a, b])
        for a, b in combinations(range(len(stations)), 2)
    )

    mst = nx.minimum_spanning_tree(G)
    regions_dict['Region{}'.format(str(i+1))] = mst.size(weight="weight")

In [None]:
# Tabulate the per-Region MST totals: one row per Region label.
mst_non_clique_regions_df = pd.DataFrame({
    'Region': list(regions_dict.keys()),
    'MST': list(regions_dict.values()),
})

In [None]:
#Calculate the MST weight for the entire bike network

# One coordinate row per station (first occurrence).
temp = final_df.groupby(['StartStation Name']).first()[['StartStation lat', 'StartStation long']]
stations = temp.index

# Pairwise Euclidean distances on the raw (lat, long) values.
distances = squareform(pdist(temp.values))
eucd_distance = pd.DataFrame(distances, columns=stations, index=stations)

# Complete graph whose edges CARRY the distances. The edges used to be added
# without a weight, so networkx treated every edge as weight 1 and the "MST
# weight" was just the number of edges in the tree.
G = nx.Graph()
G.add_nodes_from(stations)
G.add_weighted_edges_from(
    (stations[a], stations[b], distances[a, b])
    for a, b in combinations(range(len(stations)), 2)
)

mst = nx.minimum_spanning_tree(G)

Entire_Bike_Network = mst.size(weight="weight")
print(Entire_Bike_Network)

In [None]:
#Calculate the sum of MST weights for the non-self-sustainable bike stations

# One coordinate row per non-clique station (first occurrence).
temp = non_clique_df.groupby(['StartStation Name']).first()[['StartStation lat', 'StartStation long']]
stations = temp.index

# Pairwise Euclidean distances on the raw (lat, long) values.
distances = squareform(pdist(temp.values))
eucd_distance = pd.DataFrame(distances, columns=stations, index=stations)

# Complete graph whose edges CARRY the distances. The edges used to be added
# without a weight, so networkx treated every edge as weight 1 and the "MST
# weight" was just the number of edges in the tree.
G = nx.Graph()
G.add_nodes_from(stations)
G.add_weighted_edges_from(
    (stations[a], stations[b], distances[a, b])
    for a, b in combinations(range(len(stations)), 2)
)

mst = nx.minimum_spanning_tree(G)

Non_Clique_Stations = mst.size(weight="weight")
# Remainder of the whole-network MST length once the non-clique part is removed.
Clique_Stations = Entire_Bike_Network - Non_Clique_Stations
print(Non_Clique_Stations)

In [None]:
#The evaluation for w=5. Other bar charts in the report were made by changing the bike flow threshold (w) and running again.
data = {'Entire bike network': Entire_Bike_Network, 'Non-self-sustainable': Non_Clique_Stations}

# Bar labels and heights come straight from `data`. The earlier hand-written
# label/value lists were overwritten before use and rebound `mst` (the MST
# graph) to a list, so they are gone.
bar_labels = list(data.keys())
bar_values = list(data.values())

fig, ax = plt.subplots(figsize=(16, 9))
ax.bar(bar_labels, bar_values, color='red', width=0.4)

ax.set_xlabel("Bike stations", fontsize=30)
ax.set_ylabel("Total distance for bike repositioning", fontsize=30)
ax.set_title("Bike repositioning distances with w = 5", fontsize=34)
ax.tick_params(axis='both', labelsize=29)
plt.show()

In [None]:
#The values in this dataset were gathered from subsequent runs of the code while changing the number of Regions formed.
#The data was gathered to speed up the processing time of my work - instead of having to calculate many Regions in every run.

# Loaded under its own name so the trip data held in `df` is not overwritten;
# cells that read `df` can then still be re-run after this one.
evaluation_df = pd.read_csv('evaluation_dataset.csv')
ax = evaluation_df.plot.line(linewidth=5)
ax.tick_params(axis='both', labelsize=20)
ax.legend(fontsize=25)
plt.show()

In [None]:
#The values in this dataset were gathered from subsequent runs of the code while changing the bike flow threshold w.
#The data was gathered to speed up the processing time of my work - instead of having to calculate many Cliques in every run.

# Loaded under its own name so the trip data held in `df` is not overwritten;
# cells that read `df` can then still be re-run after this one.
connection_df = pd.read_csv('connection_strength.csv')
# legend=False: the single line needs no legend (it used to be created and then removed).
ax = connection_df.plot.line(x='Bike flow', y='Number of cliques', linewidth=5, legend=False)
ax.set_xlabel('Bike flow', fontsize=30)
ax.set_ylabel('Number of cliques', fontsize=30)
ax.tick_params(axis='both', labelsize=20)
plt.show()

In [None]:
# Map of trip start stations on OpenStreetMap tiles:
# red = not a clique station, green = clique station.
color_discrete_map = {False: 'rgb(255,0,0)', True: 'rgb(0,255,0)'}

fig = px.scatter_mapbox(
    final_df,
    lat="StartStation lat",
    lon="StartStation long",
    hover_name="StartStation Name",
    hover_data=["StartStation Name", "StartStation nbBikes"],
    color="clique_stations",
    color_discrete_map=color_discrete_map,
    zoom=3,
    height=300,
)

# Tile source and zero margins applied in a single layout update.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()

In [None]:
#The data in x and y were gathered from subsequent runs of the code by changing w in each run.
# Bike flow threshold (w) -> number of self-sustaining bike stations.
stations_by_threshold = {5: 738, 10: 391, 15: 76, 20: 27}
x = list(stations_by_threshold.keys())
y = list(stations_by_threshold.values())

fig, ax = plt.subplots()
ax.plot(x, y, linewidth=7.5)
ax.set_title('w effects on bike station self-sustainability', fontsize=30)
ax.set_xlabel('Bike flow threshold (w)', fontsize=29)
ax.set_ylabel('Number of self-sustaining bike stations', fontsize=29)
ax.tick_params(axis='both', which='major', labelsize=27)
plt.show()

In [None]:
# Even out the bikes inside every Region: each station gets the same whole
# number of bikes, and the leftover bikes go one each to the first stations.
number_of_region = sorted(final_df['Region'].unique())

for i in number_of_region:
    temp = final_df[final_df['Region'] == i]
    individual_region = temp.groupby(['StartStation Name'])[['StartStation nbBikes']].first()
    number_of_station = len(individual_region)

    # Pure integer arithmetic: divmod guarantees base * stations + extra == total
    # and 0 <= extra < stations. The previous float mean/sum arithmetic produced
    # a float `extra_bikes` (invalid as a slice bound) whenever the nbBikes
    # column was stored as float.
    sum_region = int(individual_region['StartStation nbBikes'].sum())
    mean_region, extra_bikes = divmod(sum_region, number_of_station)

    # Initially every station is assigned the per-station base amount ...
    y = np.full(number_of_station, mean_region, dtype=np.int64)

    # ... and the n leftover bikes are allocated to the first n stations.
    y[:extra_bikes] += 1
    individual_region['StartStation nbBikes Relocated'] = y
    print('Region: {}'.format(i))
    print(individual_region)