In [None]:
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import requests
import plotly.express as px
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import networkx as nx
from statistics import mean
import warnings

# Notebook-wide display settings: show every column, never truncate cell
# text, and render floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.float_format = lambda value: '%.2f' % value

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Load the trip export and add a calendar-date column, ordered by start time.
df = pd.read_csv('../8Jul2020-14Jul2020.csv')
# 'Start Date' is parsed day-first; rows are sorted on the full timestamp
# before it is truncated to a plain date.
df['Date'] = pd.to_datetime(df['Start Date'], dayfirst=True)
df = df.sort_values(by='Date')
df['Date'] = df['Date'].dt.date

# One row per distinct (station name, latitude, longitude) combination.
# Built as an explicit, independent copy instead of mutating a column
# selection of `df` in place, so later column assignments on `cluster`
# cannot trigger SettingWithCopy behaviour.
cluster = (
    df[['StartStation Name', 'StartStation lat', 'StartStation long']]
    .drop_duplicates(keep='first')
    .copy()
)
cluster.head()

In [None]:
# Feature matrix: one (latitude, longitude) row per row of `cluster`.
X = cluster[['StartStation lat', 'StartStation long']].values
n_clusters_ = 3
# random_state pins the k-means++ initialisation so a Restart & Run All
# reproduces the same labels.
kmeans = KMeans(n_clusters=n_clusters_, init='k-means++', random_state=42)
# fit_predict() fits the model and returns the labels in one pass; the
# separate fit() that used to precede it only repeated the work.
labels = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
cluster['clusters'] = labels
# Index by station name so the label column converts to a name -> label dict.
cluster.index = cluster['StartStation Name']
cluster_map = cluster['clusters'].to_dict()
# Tag every trip with the cluster of its start station.
df['cluster'] = df['StartStation Name'].map(cluster_map)
df.head()

In [None]:
#Inspired by: https://github.com/noernimat/k-means_clustering_model.git
# Sweeps the number of clusters k = 3, 6, ..., 747 (one k-means fit per
# value, up to 249 fits), hence this cell takes some time to execute.
# NOTE: every iteration overwrites cluster['clusters'] and df['cluster'];
# after the loop they hold the labelling for the largest k tried, not the
# 3-cluster labelling computed earlier.

sum_duration = {}   # k -> sum of the per-cluster mean 'Duration'
mean_duration = {}  # k -> mean of the per-cluster mean 'Duration'

# KMeans cannot form more clusters than there are rows in X, so cap the sweep.
max_k = min(750, len(X) + 1)

# Loop-invariant: index by station name once so each iteration can turn the
# label column straight into a name -> label dict.
cluster.index = cluster['StartStation Name']

for n in range(3, max_k, 3):
    print(n)
    n_clusters_ = n
    # Fixed seed keeps the sweep reproducible across re-runs.
    kmeans = KMeans(n_clusters=n_clusters_, init='k-means++', random_state=42)
    # Single fit per k: fit_predict() both fits and labels.
    labels = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
    cluster['clusters'] = labels
    cluster_map = cluster['clusters'].to_dict()
    df['cluster'] = df['StartStation Name'].map(cluster_map)
    # Mean trip 'Duration' within each cluster, as a NumPy array.
    mean_cluster = df.groupby(['cluster'])['Duration'].mean().values
    sum_duration[n] = mean_cluster.sum()
    mean_duration[n] = mean_cluster.mean()

In [None]:
# Tabulate the sweep: one row per tested cluster count, paired with the
# value recorded for it in `mean_duration`.
mean_cluster_df = pd.DataFrame({
    'cluster': list(mean_duration.keys()),
    'mean': list(mean_duration.values()),
})
mean_cluster_df.head()

In [None]:
# Line chart of the recorded mean against the number of clusters tried.
fig = px.line(mean_cluster_df, x='cluster', y='mean')
fig.update_xaxes(title_text='Number of clusters')
fig.update_yaxes(title_text='Average duration for bike repositioning')
fig.show()