In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import umap.umap_ as umap
import seaborn as sns
# Path of the statistics CSV; the cells below load it with pd.read_csv.
stat_path = r'RIO_statistics.csv'
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.decomposition import PCA

In [None]:
# Load the statistics table; the first CSV column becomes the index.
stat_path_full = os.path.join(stat_path)
df = pd.read_csv(stat_path_full, index_col=0)

# Remove the columns that are excluded from the feature set.
df = df.drop(columns=['Num_of_loc', 'Num_of_days'])

# Rescale every 'RoG k*' column by the 'RoG' column (stored as a ratio).
for rog_col in ('RoG k2', 'RoG k4', 'RoG k8'):
    df[rog_col] = df[rog_col].div(df['RoG'])

# Standardise all columns; the result is a plain array, not a DataFrame.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)

In [None]:
# Enumerate the candidate column subsets that the search loop will drop.
import itertools

# range(1, 3) produces subset sizes 1 and 2 (the upper bound is exclusive).
combinations = [
    subset
    for size in range(1, 3)
    for subset in itertools.combinations(df.columns, size)
]

# Wrap the scaled values in a DataFrame again so columns can be dropped by name.
df_scaled = pd.DataFrame(df_scaled, index=df.index, columns=df.columns)


In [None]:
# Run tests: for every candidate set of columns to drop, cluster the remaining
# scaled features and keep the best value of each validity index.
# combs maps the dropped-column tuple to
# (best k by CH, CH score, best k by DB, DB score, best k by silhouette,
#  silhouette score, elbow k).
combs = {}
for w in combinations:
    scores_ch = {}
    scores_db = {}
    scores_s = {}
    df_filter = df_scaled.drop(columns=list(w))

    # Component-count heuristic (labelled "broken-stick" in this notebook):
    # count how many of the first five explained-variance ratios exceed the
    # thresholds 1/10, 1/9, 1/8, 1/7, 1/6.
    # NOTE(review): the classical broken-stick expectation for component k of p
    # is (1/p) * sum_{i=k..p} 1/i -- confirm that this form and the hard-coded
    # 11 are what is intended.
    pca = PCA(n_components=5, random_state=42)
    pca.fit(df_filter)
    thresholds = np.array([1 / (11 - x) for x in range(1, 6)])
    n_components = int((pca.explained_variance_ratio_ > thresholds).sum())

    # Refit with the selected number of components and report retained variance.
    pca = PCA(n_components=n_components, random_state=42)
    principal_components = pca.fit_transform(df_filter)
    print(np.sum(pca.explained_variance_ratio_))

    # Build the Ward linkage on the filtered features of THIS combination.
    # Using the full df_scaled here would make the linkage loop-invariant and
    # yield the same k_elbow for every entry of combs.
    Z = linkage(df_filter, method='ward')

    # Elbow method on the dendrogram: merge distances of the last 20 merges.
    last = Z[-20:, 2]
    last_rev = last[::-1]
    idxs = np.arange(1, len(last) + 1)
    plt.plot(idxs, last_rev)

    acceleration = np.diff(last, 2)  # 2nd derivative of the distances
    acceleration_rev = acceleration[::-1]
    plt.plot(idxs[:-2] + 1, acceleration_rev)

    k_elbow = acceleration_rev.argmax() + 2  # Add 2 to get the right number of clusters
    print(f"Optimal number of clusters based on the elbow method: {k_elbow}")

    # Score Ward clusterings with 2..5 clusters on the filtered features.
    for n in range(2, 6):
        agglomerative_clustering = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='ward')
        clusters = agglomerative_clustering.fit_predict(df_filter)

        scores_db[n] = davies_bouldin_score(df_filter, clusters)
        scores_ch[n] = calinski_harabasz_score(df_filter, clusters)
        scores_s[n] = silhouette_score(df_filter, clusters)

    # Higher is better for CH and silhouette, lower is better for DB.
    best_ch = max(scores_ch, key=scores_ch.get)
    best_db = min(scores_db, key=scores_db.get)
    best_s = max(scores_s, key=scores_s.get)

    print("Drop:", w)
    print(best_ch, scores_ch[best_ch])
    print(best_db, scores_db[best_db])
    print(best_s, scores_s[best_s])
    combs[w] = (best_ch, scores_ch[best_ch],
                best_db, scores_db[best_db],
                best_s, scores_s[best_s],
                k_elbow)

In [None]:
# One row per dropped-column combination, one column per recorded value.
score_columns = ['CCH','CH','CDB','DB','CS','S','EL']
cluster_df = pd.DataFrame(combs).transpose()
cluster_df.columns = score_columns
cluster_df

In [None]:
#Selected parameters - process
# Reload the statistics table so this cell does not depend on earlier edits to df.
stat_path_full = os.path.join(stat_path)
df = pd.read_csv(stat_path_full, index_col=0)

# Drop the columns excluded from the selected feature set.
df = df.drop(columns=['Num_of_loc', 'Num_of_days', 'Median from Home'])

# Rescale every 'RoG k*' column by the 'RoG' column (stored as a ratio).
for rog_col in ('RoG k2', 'RoG k4', 'RoG k8'):
    df[rog_col] = df[rog_col] / df['RoG']

# Standardise all remaining columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Project onto three principal components. random_state is fixed to match the
# PCA calls in the combination search, so the projection is reproducible
# whichever SVD solver scikit-learn selects.
pca = PCA(n_components=3, random_state=42)
principal_components = pca.fit_transform(df_scaled)

In [None]:
#Dendrogram PLOT
import matplotlib.pyplot as plt
from cycler import cycler

# Colour assigned to each cluster id.
cluster_colors = {
    2: '#f0f922',  # Local Stayers
    3: '#ee7850',  # Urban Wanderers
    0: '#9c179d',  # Distant Commuters
    1: '#0c0987',  # Neighborhood Explorers
}

# Colours and names in the order in which they appear in the legend.
custom_palette = ['#9c179d', '#0c0987', '#ee7850', '#f0f922']
cluster_labels = ['Distant Commuters', 'Neighborhood Explorers', 'Urban Wanderers', 'Local Stayers']

# Global colour cycle: grey first, then the four cluster colours.
# NOTE(review): presumably grey is meant for the links above the colour
# threshold -- verify against how scipy's dendrogram picks its colours.
plt.rcParams['axes.prop_cycle'] = cycler(color=['grey', *custom_palette])

# Ward linkage computed on the principal components.
Z = linkage(principal_components, method='ward')

# Draw the truncated dendrogram.
plt.figure(figsize=(10, 7))
dendrogram(Z, truncate_mode='lastp')
plt.xlabel('Samples', size=20)
plt.ylabel("Distance", size=20)

# Hand-built legend: one coloured line per named cluster.
legend_patches = []
for color, label in zip(custom_palette, cluster_labels):
    legend_patches.append(plt.Line2D([0], [0], color=color, lw=2, label=label))

plt.legend(handles=legend_patches, title="Clusters", loc='upper right', fontsize=12)

In [None]:
# Ward agglomerative clustering on the principal components
num_clusters = 4  # Adjust based on dendrogram analysis
agglomerative_clustering = AgglomerativeClustering(
    linkage='ward',
    metric='euclidean',
    n_clusters=num_clusters,
)
agglomerative_clustering.fit(principal_components)
clusters = agglomerative_clustering.labels_

# Attach the cluster labels to the original data
df['Cluster'] = clusters

In [None]:
# Reload the complete table (no columns dropped) and label it with the clusters.
stat_path_full = os.path.join(stat_path)
df = pd.read_csv(stat_path_full, index_col=0)

# Same 'RoG k*' / 'RoG' rescaling as in the preprocessing cells.
for rog_col in ('RoG k2', 'RoG k4', 'RoG k8'):
    df[rog_col] = df[rog_col].div(df['RoG'])

df['Cluster'] = clusters


In [None]:
# Mean and standard deviation per cluster, leaving out every column whose name
# contains 'Return' or 'Median'.
summary_columns = [col for col in df.columns if not ('Return' in col or 'Median' in col)]
df[summary_columns].groupby('Cluster').agg(['mean','std'])


In [None]:
# Population: non-null count of the first column within each cluster.
per_cluster_counts = df.groupby('Cluster').count()
per_cluster_counts.iloc[:, 0]

In [None]:
# Population ratio: each cluster's count divided by the total over all clusters.
cluster_sizes = df.groupby('Cluster').count().iloc[:, 0]
cluster_sizes / cluster_sizes.sum()

In [None]:
#PCA PLOT
unique_clusters = np.unique(df['Cluster'])

# Colour for each cluster id; `names` lists the cluster names in the same
# order as the entries of this dict, which the legend below relies on.
cluster_colors = {
    2: '#f0f922',  # Local Stayers
    3: '#ee7850',  # Urban Wanderers
    0: '#9c179d',  # Distant Commuters
    1: '#0c0987',  # Neighborhood Explorers
}
names = ['Local Stayers', 'Urban Wanderers', 'Distant Commuters', 'Neighborhood Explorers']

# One colour per sample, looked up from its cluster id.
point_colors = df['Cluster'].map(cluster_colors)

# Scatter of the first two principal components.
pc_x = principal_components[:, 0]
pc_y = principal_components[:, 1]
scatter = plt.scatter(pc_x, pc_y, c=point_colors)
plt.xlabel('PC 1')
plt.ylabel("PC 2")

# Custom legend: one round marker per cluster colour.
legend_handles = []
for color, cluster in zip(cluster_colors.values(), names):
    legend_handles.append(
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=f'Cluster {cluster}')
    )
plt.legend(handles=legend_handles, title="Clusters", frameon=True, loc='upper right')
