In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap, HeatMapWithTime
import plotly.express as px

### Data preparation

In [None]:
def date_to_time(date):
    """Convert a time label into an hour-of-week offset.

    The first three characters of ``date`` select the day (French
    abbreviation, 'Lun' through 'Dim') and the last two characters are
    parsed as the hour. The result is ``24 * day_index + hour`` with
    'Lun' as day 0.

    Raises ValueError if the last two characters are not an integer and
    KeyError if the three-character prefix is not a known abbreviation.
    """
    week = ('Lun', 'Mar', 'Mer', 'Jeu', 'Ven', 'Sam', 'Dim')
    day_start = {abbr: 24 * position for position, abbr in enumerate(week)}
    # Parse the hour before the day lookup so a malformed hour is reported first.
    hour_of_day = int(date[-2:])
    return day_start[date[:3]] + hour_of_day

def time_to_day(time):
    """Return the first three characters of ``time`` (the day prefix of a label)."""
    day_prefix = slice(3)
    return time[day_prefix]

In [None]:
# Read the two space-separated tables that make up the data set.
velib = pd.read_csv('velibLoading.csv', sep=' ')
velibAdds = pd.read_csv('velibAdds.csv', sep=' ')
# Join them side by side: the velibAdds columns come first, then the loading
# columns. NOTE(review): this assumes both files list rows in the same order.
velib = pd.concat([velibAdds, velib], axis='columns')
#velib = velib.rename(columns={'longitude':'lon', 'latitude':'lat', 'bonus':'hill'})
velib.head()

In [None]:
days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
# Wide -> long: every non-identifier column becomes one row per station,
# with the column label in 'time' and its value in 'load'.
velib_long = velib.melt(
    id_vars=['longitude', 'latitude', 'bonus', 'names'],
    var_name='time',
    value_name='load',
)
# Split the label into its parts: the first three characters are the day,
# the last two are the hour (presumably labels look like 'Lun-00' - verify).
velib_long['day'] = velib_long['time'].str[:3]
velib_long['hour'] = velib_long['time'].str[-2:].map(int)
# The 'time' column is kept on purpose; the heat-map cell groups on it.
velib_long.head(10)

# Exploratory statistics

In [None]:
# Turn the 'time' labels into integer hour-of-week offsets so that the groups
# below come out in chronological order (groupby sorts its keys).
# The dtype guard keeps this cell re-runnable: date_to_time slices strings and
# would fail if applied a second time to the already-converted integers.
if not pd.api.types.is_numeric_dtype(velib_long['time']):
    velib_long['time'] = velib_long['time'].apply(date_to_time)

# One entry per time step, each a list of [lat, lon, weight] triples.
# Selecting the three columns at once avoids a Python-level loop over every row.
data = [
    frame[['latitude', 'longitude', 'load']].values.tolist()
    for _, frame in velib_long.groupby('time')
]

# Centre the map on the median station position.
init_lat = velib_long['latitude'].median()
init_long = velib_long['longitude'].median()

m = folium.Map(location=[init_lat, init_long], zoom_start=12)
hm = HeatMapWithTime(data, radius=15)
hm.add_to(m)
m

In [None]:
# Mean load over all stations for every (day, hour) pair.
load_by_hour = velib_long[['day', 'hour', 'load']].groupby(['day', 'hour']).mean()
hour = np.arange(24)
# Greens for the first five days, blues for Saturday and Sunday.
day_colors = ['#D9F0A3', '#ADDD8E', '#78C679', '#31A354', '#006837', '#6BAED6', '#08519C']
days = ['Lun', 'Mar', 'Mer', 'Jeu', 'Ven', 'Sam', 'Dim']
daysEng = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(10,6))
plt.title('Average daily loading')
# Iterate the three parallel lists together so each curve gets the label and
# colour of its own day (indexing the labels with k-1 shifted every legend
# entry by one day).
for day, day_name, color in zip(days, daysEng, day_colors):
    plt.plot(hour, load_by_hour.loc[day].values, color=color, label=day_name)
plt.xlabel('Hour of day')
plt.ylabel('Mean load')
plt.legend()
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a 5-component whitened PCA on every column from the fifth onward
# (the first four columns are the ones used as identifiers in the melt step).
pca = PCA(n_components=5, whiten=True).fit(velib.iloc[:, 4:])
PC = pca.components_
# Scores of each row of velib on the five components.
velib_pc = pca.transform(velib.iloc[:, 4:])

# One panel per component, all on the same [-1, 1] vertical scale.
plt.figure(figsize=(10, 10))
t = np.arange(168)
for rank, component in enumerate(PC, start=1):
    plt.subplot(5, 1, rank)
    plt.title('PC%d loading profile' % rank)
    plt.plot(t, component)
    plt.ylim([-1, 1])
plt.tight_layout()
plt.show()

In [None]:
def fig(w, h):
    """Open a new matplotlib figure ``w`` by ``h`` inches and return it."""
    return plt.figure(figsize=(w, h))


# Rows of velib in the plane of the first two principal components,
# coloured by the 'bonus' column.
fig(6, 6)
plt.scatter(velib_pc[:, 0], velib_pc[:, 1], c=velib['bonus'])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

# K-means clustering

In [None]:
from sklearn.cluster import KMeans

# Elbow analysis: fit K-means for k = 1..10 and record the inertia of each fit.
# random_state is fixed so the curve is identical on every run of the notebook.
kmeans_features = velib.iloc[:, 4:]  # loop-invariant, sliced once
k_values = list(range(1, 11))
inertia = np.zeros(len(k_values))
for idx, k in enumerate(k_values):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(kmeans_features)
    inertia[idx] = kmeans.inertia_

plt.title('Within-cluster variance')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.bar(k_values, inertia)

plt.show()

In [None]:
# Final K-means model. random_state is fixed so the centroids, their numbering
# and the labels used by the scatter plot below are reproducible.
n_centroids = 4
kmeans = KMeans(n_clusters=n_centroids, random_state=0).fit(velib.iloc[:, 4:])

fig(10, 10)
# One x position per feature of a centroid.
t = np.arange(kmeans.cluster_centers_.shape[1])
for i in range(n_centroids):
    # The grid has exactly as many rows as centroids, so no panel slot stays empty.
    plt.subplot(n_centroids, 1, i + 1)
    plt.title('Centroid %d loading profile' % (i + 1))
    plt.plot(t, kmeans.cluster_centers_[i])
    plt.ylim([0, 1])
plt.tight_layout()
plt.show()

In [None]:
# K-means labels shown in the plane of the first two principal components.
fig(6, 6)
plt.scatter(*velib_pc[:, :2].T, c=kmeans.labels_, cmap='tab10')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering
from time import time
# Ward-linkage agglomerative clustering of the PCA scores, with wall-clock timing.
print("Compute structured hierarchical clustering...")
st = time()
n_clusters = 5  # NOTE(review): not passed to the estimator below
# n_clusters=None together with distance_threshold=0 asks for the full merge
# tree rather than a fixed number of clusters.
ward = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0,
    linkage='ward',
).fit(velib_pc)
print("Elapsed time: ", time() - st)

In [None]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted agglomerative clustering ``model``.

    A linkage matrix is assembled from ``model.children_``,
    ``model.distances_`` and the number of samples under each merge, then
    passed to ``dendrogram``. Extra keyword arguments are forwarded to
    ``dendrogram`` unchanged. The x-axis ticks are removed afterwards.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # sizes[i] is the number of original samples under the i-th merge.
    # A child index below n_leaves is a single sample; any other index refers
    # to an earlier merge, whose size has already been filled in.
    sizes = np.zeros(merges.shape[0])
    for node, pair in enumerate(merges):
        sizes[node] = sum(
            1 if child < n_leaves else sizes[child - n_leaves]
            for child in pair
        )

    # Columns: the two merged children, the merge distance, the sample count.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)
    plt.xticks([])

    
# Title the current axes, then draw the full Ward merge tree on them.
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(model=ward)
plt.show()

# Gaussian mixture models

In [None]:
from sklearn.mixture import GaussianMixture

# 7-component Gaussian mixture with full covariance matrices, fitted on the PCA
# scores. random_state is fixed so the component labels are the same on every run.
gmm = GaussianMixture(n_components=7, covariance_type='full', random_state=0)
gmm_labels = gmm.fit_predict(velib_pc)

In [None]:
# Gaussian-mixture labels shown in the plane of the first two principal components.
fig(6, 6)
plt.scatter(*velib_pc[:, :2].T, c=gmm_labels, cmap='tab10')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()