In [None]:
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import requests
import plotly.express as px
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import networkx as nx
from collections import Counter
# Notebook-wide pandas display settings: show every column, never truncate
# cell text, and render floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.float_format = lambda value: '%.2f' % value

In [None]:
# Load the raw trip records and derive a calendar-date column.
MAX_DURATION = 10000  # trips whose 'Duration' is at or above this value are discarded

df = pd.read_csv('../8Jul2020-14Jul2020.csv')

# 'Start Date' is parsed day-first. Sort on the full timestamp first, then
# reduce it to a plain date so later group-bys bucket trips per day.
df['Date'] = pd.to_datetime(df['Start Date'], dayfirst=True)
df = df.sort_values(by='Date')
df['Date'] = df['Date'].dt.date

# Drop over-long trips with a boolean mask instead of blanking whole rows with
# NaN: the `np.NaN` alias no longer exists in NumPy 2.0, and overwriting rows
# with NaN silently upcasts integer columns to float. Rows that contain a
# missing value in any column are removed as well.
df = df[df['Duration'] < MAX_DURATION].dropna()
df.head()

In [None]:
# Mean trip duration for every (date, start station, end station) combination.
# The station coordinates are part of the grouping key so that they are still
# available as columns once the index is flattened.
trip_keys = [
    'Date',
    'StartStation Name',
    'EndStation Name',
    'StartStation lat',
    'StartStation long',
    'EndStation lat',
    'EndStation long',
]
temp = df.groupby(trip_keys)['Duration'].agg(['mean']).reset_index()
temp.head()

In [None]:
# Sanity check: True when exactly the same station names occur as trip origins
# and as trip destinations.
start_names = sorted(temp['StartStation Name'].unique())
end_names = sorted(temp['EndStation Name'].unique())
end_names == start_names

In [None]:
N = 20000  # I use the first 20000 entries of the TFL bike data as the clique computations take too long otherwise.
# Chapter 4.3.2 of the report discusses how I use node and edge filtering to show how my approach mitigates this drawback.

# Truncate to the first N rows *before* building the coordinate tuples, so the
# row-wise `apply` only touches the rows that are actually kept.
first_trips = temp.iloc[:N]

temp3 = first_trips['StartStation Name'].values
temp4 = first_trips[['StartStation lat', 'StartStation long']].apply(tuple, axis=1).values

temp5 = first_trips['EndStation Name'].values
temp6 = first_trips[['EndStation lat', 'EndStation long']].apply(tuple, axis=1).values

# Station name -> (lat, long). When a station occurs both as a start and as an
# end station, the end-station entry wins because it is merged last.
pos1 = dict(zip(temp3, temp4))
pos2 = dict(zip(temp5, temp6))
pos = {**pos1, **pos2}

In [None]:
# Station name -> (lat, long), built from every row of `temp` and from start
# stations only. Note that this rebinds `pos`, `temp3` and `temp4`.
temp3 = temp['StartStation Name'].to_numpy()
temp4 = temp[['StartStation lat', 'StartStation long']].apply(tuple, axis=1).to_numpy()

pos = dict(zip(temp3, temp4))

In [None]:
# Work on the first N rows only. Slicing up front avoids converting the whole
# frame (and running a row-wise `apply` over it) just to throw most of it away.
edge_rows = temp.iloc[:N]

# X holds one (start station, end station) pair per row; y holds the matching
# mean duration.
X = np.array(edge_rows[['StartStation Name', 'EndStation Name']])
y = np.array(edge_rows['mean'])

temp1 = edge_rows[['StartStation Name', 'EndStation Name']].apply(tuple, axis=1).values
temp2 = edge_rows['mean'].values

# (start, end) -> mean duration. If the same pair appears in several rows, the
# last occurrence is the one that remains in the dict.
edge_labels = dict(zip(temp1, temp2))

In [None]:
# Undirected station graph: each (start, end) row of X becomes one edge and the
# stations are added as nodes implicitly.
G = nx.Graph()
G.add_edges_from((start, end) for start, end in X)

In [None]:
# Trips per (station, date): counted once by departure station and once by
# arrival station. `count` tallies the non-null 'Duration' values in each group.
temp1 = (
    df.groupby(['StartStation Name', 'Date'])[['Duration']]
    .count()
    .rename(columns={'Duration': 'StartStation Count'})
)

temp2 = (
    df.groupby(['EndStation Name', 'Date'])[['Duration']]
    .count()
    .rename(columns={'Duration': 'EndStation Count'})
)

In [None]:
# Put the departure and arrival counts side by side. The inner join keeps only
# the (station, date) pairs present in both frames. After flattening the index,
# the station level is expected to surface as 'level_0' and is renamed to
# 'Station'. Note that this rebinds `temp`.
temp = (
    pd.concat([temp1, temp2], axis=1, join='inner')
    .reset_index()
    .rename(columns={'level_0': 'Station'})
)
temp.head()

In [None]:
# Distribution of the 'Duration' column across all remaining trips.
fig = px.histogram(data_frame=df, x="Duration")
fig.show()

In [None]:
# Enumerate the maximal cliques of G and count, for every station, how many of
# those cliques it belongs to.
cliques = list(nx.find_cliques(G))
station_counts = Counter(station for clique in cliques for station in clique)

# `zip(*...)` over an empty Counter yields nothing to unpack, so fall back to
# empty sequences: a graph without cliques then produces an empty chart
# instead of raising ValueError.
if station_counts:
    stations, counts = zip(*station_counts.items())
else:
    stations, counts = (), ()

# Bars only: the stations are unordered categories, so a line connecting them
# would carry no meaning. Everything is drawn through `ax` rather than the
# implicit pyplot "current axes".
fig, ax = plt.subplots()
ax.bar(stations, counts)
ax.set_xlabel("Bike stations")
ax.set_ylabel("Clique count")
# There are far too many station names to label legibly; keep the ticks but
# hide their text.
ax.tick_params(axis="x", labelbottom=False)
plt.show()