In [None]:
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import requests
import plotly.express as px
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import networkx as nx
from collections import Counter
from scipy.stats import pearsonr
# Notebook-wide pandas display settings: show every column, never truncate
# cell contents, and render floats with exactly two decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda value: '%.2f' % value)

In [None]:
# Load the raw trip export, parse the day-first 'Start Date' strings into a
# 'Date' column and order the rows by that timestamp.
df = (
    pd.read_csv('../8Jul2020-14Jul2020.csv')
    .assign(Date=lambda trips: pd.to_datetime(trips['Start Date'], dayfirst=True))
    .sort_values(by='Date')
)
# Once sorted, keep only the calendar day (the time of day is dropped).
df['Date'] = df['Date'].dt.date
df.head()

In [None]:
# Mean 'Duration' for every (day, start station, end station) combination.
# The station coordinates are part of the grouping key so that they are still
# available as columns after the aggregation.
route_keys = [
    'Date',
    'StartStation Name', 'EndStation Name',
    'StartStation lat', 'StartStation long',
    'EndStation lat', 'EndStation long',
]
temp = df.groupby(route_keys)['Duration'].agg(['mean']).reset_index()
temp.head()

In [None]:
# Sanity check: do exactly the same station names occur as trip origins and
# as trip destinations?
start_names = sorted(temp['StartStation Name'].unique())
end_names = sorted(temp['EndStation Name'].unique())
end_names == start_names

In [None]:
N = 20000 # I use the first 20000 entries of the TFL bike data as the computations take too long otherwise.
# Chapter 4.3.2 of the report discusses how I use node and edge filtering to see how my approach mitigates this drawback.

# Slice to the first N rows *before* the row-wise tuple conversion, so that the
# slow apply only touches the rows that are actually kept.
first_routes = temp.iloc[:N]

# Station names and their (lat, long) tuples for the trip origins ...
temp3 = first_routes['StartStation Name'].values
temp4 = first_routes[['StartStation lat', 'StartStation long']].apply(tuple, axis=1).values

# ... and for the trip destinations.
temp5 = first_routes['EndStation Name'].values
temp6 = first_routes[['EndStation lat', 'EndStation long']].apply(tuple, axis=1).values

# Station name -> (lat, long). When a station occurs in both roles, the
# end-station coordinates win because pos2 is unpacked last.
pos1 = dict(zip(temp3, temp4))
pos2 = dict(zip(temp5, temp6))
pos = {**pos1, **pos2}

In [None]:
# Station name -> (lat, long) lookup built from the whole aggregated table.
# NOTE(review): positions are stored as (lat, long), so latitude ends up on the
# x-axis when networkx draws them - confirm this orientation is intended.
temp3 = temp['StartStation Name'].values
temp4 = temp[['StartStation lat', 'StartStation long']].apply(tuple, axis=1).values

# A station that only ever appears as a trip destination would otherwise have
# no coordinates, and networkx cannot draw a node without a position.
# One row per end station is enough; keep='last' mirrors the last-value-wins
# behaviour of building a dict from every row.
end_rows = temp.drop_duplicates(subset='EndStation Name', keep='last')
end_pos = dict(zip(end_rows['EndStation Name'],
                   zip(end_rows['EndStation lat'], end_rows['EndStation long'])))

# Start-station coordinates take precedence when a station occurs in both roles.
pos = {**end_pos, **dict(zip(temp3, temp4))}

In [None]:
# Work on the first N route rows only. Slicing before the row-wise tuple
# conversion avoids running the slow apply over rows that are thrown away.
routes = temp.iloc[:N]

# X: (start station, end station) pairs used as graph edges; y: mean duration per row.
X = np.array(routes[['StartStation Name', 'EndStation Name']])
y = np.array(routes['mean'])

temp1 = routes[['StartStation Name', 'EndStation Name']].apply(tuple, axis=1).values
temp2 = routes['mean'].values

# (start, end) -> mean duration. If the same pair occurs in several rows,
# the last row's value is the one that is kept.
edge_labels = dict(zip(temp1, temp2))

In [None]:
# Undirected route graph: one node per station, one edge per (start, end) pair in X.
G = nx.Graph()
G.add_edges_from(X)

# Draw nodes, edges and node labels at the stored station positions, then
# overlay the per-edge labels in red on the same axes.
fig, ax = plt.subplots(figsize=(35, 40))
nx.draw_networkx(G, pos=pos, node_color='green', ax=ax)
nx.draw_networkx_edge_labels(
    G,
    pos=pos,
    edge_labels=edge_labels,
    font_color='red',
    ax=ax,
)
ax.set_title("Bike Routes")
plt.show()

In [None]:
# Number of rows with a non-null 'Duration' per start station, exposed as a
# 'Count' column next to the station name.
temp1 = (
    df.groupby(['StartStation Name'])[['Duration']]
    .count()
    .rename(columns={'Duration': 'Count'})
    .reset_index()
)
temp1.head()

In [None]:
# Flatten every maximal clique of G into one list of station names, so that a
# station appears once for each maximal clique it belongs to.
cliques = nx.find_cliques(G)
test = [station for members in cliques for station in members]

# Station -> number of maximal cliques containing it.
clique_dict = Counter(test)

temp2 = (
    pd.DataFrame.from_dict(clique_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'StartStation Name', 0: 'Clique'})
)
temp2.head()

In [None]:
# Attach the clique counts to the per-station trip counts. Every station from
# temp1 is kept; stations without a clique entry get NaN in 'Clique'.
final_df = temp1.merge(temp2, how="left", on='StartStation Name')
final_df.head()

In [None]:
# NOTE: despite its name, first_20_df holds the first 90 rows of final_df.
first_20_df = final_df.head(90)

# final_df comes from a left merge, so 'Clique' is NaN for any station that has
# no clique entry. pearsonr yields NaN if either input contains NaN, so only
# the rows where both values are present take part in the correlation.
paired = first_20_df[["Count", "Clique"]].dropna()
count = paired["Count"]
clique = paired["Clique"]

# Pearson correlation between trips started at a station and the number of
# cliques the station belongs to (p is the associated p-value).
corr, p = pearsonr(count, clique)
print("Correlation coefficient: ", corr)

In [None]:
# Scatter of the 'Count' and 'Clique' columns per station.
# NOTE(review): the 'Counts' label key does not match the 'Count' column, and
# height=500 is superseded by update_layout(height=400) below - confirm intent.
fig = px.scatter(
    first_20_df,
    x="StartStation Name",
    y=["Count", "Clique"],
    labels={'Counts':'Counts', 'Clique':'Clique'},
    height=500,
)

# Flag stations whose 'Count' lies more than three standard deviations from the mean.
mean = first_20_df['Count'].mean()
std = first_20_df['Count'].std()
is_outlier = (first_20_df['Count'] - mean).abs() > 3*std
outliers = first_20_df[is_outlier]

# Draw the flagged stations again as red markers on top of the scatter.
outlier_trace = go.Scatter(
    x=outliers['StartStation Name'],
    y=outliers['Count'],
    mode='markers',
    name='Outliers',
    marker=dict(color="red"),
)
fig.add_trace(outlier_trace)

# Hide the x tick labels (station names) and set the final figure height.
fig.update_layout(xaxis=dict(tickvals=[]), height=400)
fig.show()