In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, pairwise_distances
from scipy.spatial.distance import euclidean, cityblock
from scipy.cluster.hierarchy import linkage, dendrogram
from collections import defaultdict
import itertools

In [2]:
# Load the two recordings, bring each onto a 1-second grid, and build the
# three datasets ("casi") that the clustering grid search iterates over.

# Lower-case 's' is the supported spelling of the seconds offset alias; the
# upper-case 'S' form is deprecated in pandas >= 2.2.
RESAMPLE_FREQ = '1s'


def _read_recording(csv_path):
    """Read one recording CSV.

    The file's second column becomes the (date-parsed) index and the
    'Unnamed: 0' column is dropped. Returns a new DataFrame; nothing is
    mutated in place, so re-running the cell is idempotent.
    """
    return pd.read_csv(csv_path, index_col=1, parse_dates=True).drop(columns='Unnamed: 0')


df_up_down = _read_recording("UP/1.csv")
# resample(...).mean() already returns a new frame, so no defensive copy is needed.
a = df_up_down.resample(RESAMPLE_FREQ).mean()

df_right_left = _read_recording("UP/2.csv")
b = df_right_left.resample(RESAMPLE_FREQ).mean()

# "box" is simply the two resampled recordings stacked row-wise.
box = pd.concat([a, b])
c = box.copy()

# Order matters: the grid-search cell zips this list with
# ["up_down", "right_left", "box"].
casi = [a, b, c]

  df_up_down = pd.read_csv("UP/1.csv", index_col=1, parse_dates=True)
  df_right_left = pd.read_csv("UP/2.csv", index_col=1, parse_dates=True)


In [4]:
# Hyper-parameter grid explored by the clustering cell.
k_values = range(2, 6)                       # candidate numbers of clusters: 2, 3, 4, 5
n_inits = [10, 20]                           # KMeans restarts per fit
init_methods = ['k-means++', 'random']       # KMeans centroid initialisation strategies
metrics = ['euclidean', 'manhattan']         # distance used by the silhouette / Dunn scores

# Feature scalers to compare, keyed by the label stored in the results tables.
scalers = dict([
    ("RobustScaler", RobustScaler()),
    ("StandardScaler", StandardScaler()),
    ("MinMaxScaler", MinMaxScaler()),
    ("PowerTransformer", PowerTransformer(method='yeo-johnson')),
])

# Filled by the grid-search cell: dataset name -> list of per-configuration records.
results = dict()

In [5]:
def dunn_index(X, labels, centroids, metric='euclidean'):
    """Centroid-based Dunn-style separation/compactness ratio.

    Computes

        min over cluster pairs of distance(centroid_i, centroid_j)
        ----------------------------------------------------------
        max over clusters of mean distance(member, own centroid)

    Higher values mean better separated, more compact clusters.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The clustered points.
    labels : array-like of shape (n_samples,)
        Cluster index of every row of ``X``; label ``i`` refers to
        ``centroids[i]``.
    centroids : array-like of shape (n_clusters, n_features)
        One centre per cluster.
    metric : str, default 'euclidean'
        ``'euclidean'`` selects the L2 distance. Any other value selects the
        Manhattan (L1) distance, matching the historical behaviour of this
        function.

    Returns
    -------
    float
        The ratio described above. It is NaN if some centroid has no member
        and ``inf`` if every point coincides with its centroid.

    Raises
    ------
    ValueError
        If fewer than two centroids are given (no inter-cluster distance
        exists).
    """
    # Coerce inputs so plain lists / DataFrames work; with a list,
    # ``labels == i`` would be a scalar False and silently select nothing.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    centroids = np.asarray(centroids, dtype=float)

    n_clusters = len(centroids)
    if n_clusters < 2:
        raise ValueError("dunn_index needs at least two centroids")

    norm_order = 2 if metric == 'euclidean' else 1

    # Only the distance of each point to its OWN centroid is needed, so there
    # is no reason to build the full (n_samples, n_clusters) distance matrix.
    intra_cluster_dists = [
        np.mean(np.linalg.norm(X[labels == i] - centroids[i], ord=norm_order, axis=1))
        for i in range(n_clusters)
    ]

    # All centroid-to-centroid distances at once; the strict upper triangle
    # holds each unordered pair exactly once.
    centroid_gaps = np.linalg.norm(
        centroids[:, None, :] - centroids[None, :, :], ord=norm_order, axis=2
    )
    d_min = centroid_gaps[np.triu_indices(n_clusters, k=1)].min()

    return d_min / np.max(intra_cluster_dists)

In [6]:
# Grid search: for every dataset and scaler, fit KMeans for each
# (k, n_init, init) combination and score the clustering under each metric.
# The result for a dataset is a list of dicts, one per
# (scaler, k, n_init, init, metric), in the same order as a single
# itertools.product over (k_values, n_inits, init_methods, metrics).
for name, data in zip(["up_down", "right_left", "box"], casi):
    res = []
    for scaler_name, scaler in scalers.items():
        X_scaled = scaler.fit_transform(data)
        for k, n_init, init in itertools.product(k_values, n_inits, init_methods):

            # The scoring metric does not influence the fit, so fit ONCE per
            # configuration. A fixed seed makes the run reproducible and
            # guarantees that the euclidean and manhattan rows of one
            # configuration describe the same clustering.
            kmeans = KMeans(n_clusters=k, n_init=n_init, init=init, random_state=42)
            kmeans.fit(X_scaled)
            labels = kmeans.labels_
            centroids = kmeans.cluster_centers_

            # These two scores take no metric argument; compute them once.
            db_score = davies_bouldin_score(X_scaled, labels)
            ch_score = calinski_harabasz_score(X_scaled, labels)

            for metric in metrics:
                silhouette = silhouette_score(X_scaled, labels, metric=metric)
                dunn = dunn_index(X_scaled, labels, centroids, metric=metric)

                res.append({
                    "Scaler": scaler_name,
                    "Clusters": k,
                    "n_init": n_init,
                    "Init": init,
                    "Metric": metric,
                    "Silhouette Score": silhouette,
                    "Davies-Bouldin Score": db_score,
                    "Calinski-Harabasz Score": ch_score,
                    "Dunn Index": dunn
                })

    # Store once per dataset, after all scalers have been evaluated.
    results[name] = res

In [7]:
# Turn each dataset's record list into a DataFrame, min-max normalise the four
# quality scores to a common "higher is better" scale, average them into a
# 'Combined Score', and keep the table sorted best-first.
# (source column, normalised column, True when LOWER raw values are better)
norm_specs = (
    ('Silhouette Score', 'Silhouette Score Norm', False),
    ('Dunn Index', 'Dunn Index Norm', False),
    ('Davies-Bouldin Score', 'Davies-Bouldin Score Norm', True),
    ('Calinski-Harabasz Score', 'Calinski-Harabasz Score Norm', False),
)

for dataset_name, records in results.items():
    table = pd.DataFrame(records)

    for raw_col, norm_col, lower_is_better in norm_specs:
        scores = table[raw_col]
        low, high = scores.min(), scores.max()
        # Flip the numerator for "lower is better" scores so that 1.0 is
        # always the best value in the column.
        offset = (high - scores) if lower_is_better else (scores - low)
        table[norm_col] = offset / (high - low)

    # Unweighted mean of the four normalised scores.
    combined = table['Silhouette Score Norm']
    for norm_col in ('Dunn Index Norm',
                     'Davies-Bouldin Score Norm',
                     'Calinski-Harabasz Score Norm'):
        combined = combined + table[norm_col]
    table['Combined Score'] = combined / 4

    results[dataset_name] = table.sort_values(by='Combined Score', ascending=False)


In [21]:
# Show the first rows of the up/down results table; the table was stored
# sorted by 'Combined Score' (descending), so these are the top-ranked
# configurations.
results['up_down'].head()

Unnamed: 0,Scaler,Clusters,n_init,Init,Metric,Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Dunn Index,Silhouette Score Norm,Dunn Index Norm,Davies-Bouldin Score Norm,Calinski-Harabasz Score Norm,Combined Score
64,MinMaxScaler,2,10,k-means++,euclidean,0.346313,1.099914,75.450242,1.736571,0.999579,1.0,1.0,1.0,0.999895
68,MinMaxScaler,2,20,k-means++,euclidean,0.346313,1.099914,75.450242,1.736571,0.999579,1.0,1.0,1.0,0.999895
66,MinMaxScaler,2,10,random,euclidean,0.346313,1.099914,75.450242,1.736571,0.999579,1.0,1.0,1.0,0.999895
70,MinMaxScaler,2,20,random,euclidean,0.346313,1.099914,75.450242,1.736571,0.999579,1.0,1.0,1.0,0.999895
65,MinMaxScaler,2,10,k-means++,manhattan,0.346373,1.099914,75.450242,1.589433,1.0,0.815377,1.0,1.0,0.953844


In [None]:
# Right/left dataset, restricted to configurations with at most 3 clusters.
right_left_table = results["right_left"]
right_left_small_k = right_left_table[right_left_table["Clusters"] <= 3]

# First rows in the table's stored order (plain-text dump) ...
print(right_left_small_k.head())
# ... followed by the same subset ranked by silhouette score (cell output).
right_left_small_k.sort_values(by='Silhouette Score', ascending=False).head()

Unnamed: 0,Scaler,Clusters,n_init,Init,Metric,Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Dunn Index,Silhouette Score Norm,Dunn Index Norm,Davies-Bouldin Score Norm,Calinski-Harabasz Score Norm,Combined Score
66,MinMaxScaler,2,10,random,euclidean,0.321941,1.211267,68.042049,1.638126,0.799801,0.887391,0.494637,1.0,0.795457
68,MinMaxScaler,2,20,k-means++,euclidean,0.321941,1.211267,68.042049,1.638126,0.799801,0.887391,0.494637,1.0,0.795457
70,MinMaxScaler,2,20,random,euclidean,0.321941,1.211267,68.042049,1.638126,0.799801,0.887391,0.494637,1.0,0.795457
64,MinMaxScaler,2,10,k-means++,euclidean,0.321941,1.211267,68.042049,1.638126,0.799801,0.887391,0.494637,1.0,0.795457
73,MinMaxScaler,3,10,k-means++,manhattan,0.335274,1.155995,63.175214,1.579332,0.917918,0.792506,0.621054,0.798046,0.782381


In [24]:
# Combined ("box") dataset, restricted to configurations with 4 or more clusters.
box_table = results["box"]
box_large_k = box_table[box_table['Clusters'] >= 4]

# First rows in the table's stored order (plain-text dump) ...
print(box_large_k.head())
# ... followed by the same subset ranked by silhouette score (cell output).
box_large_k.sort_values(by='Silhouette Score', ascending=False).head()

          Scaler  Clusters  n_init       Init     Metric  Silhouette Score  \
22  RobustScaler         4      20     random  euclidean          0.353363   
24  RobustScaler         5      10  k-means++  euclidean          0.362479   
28  RobustScaler         5      20  k-means++  euclidean          0.362479   
30  RobustScaler         5      20     random  euclidean          0.352960   
18  RobustScaler         4      10     random  euclidean          0.343851   

    Davies-Bouldin Score  Calinski-Harabasz Score  Dunn Index  \
22              1.025355                94.371754    1.587187   
24              1.024614                91.328951    1.527280   
28              1.024614                91.328951    1.527280   
30              1.007296                91.340188    1.474518   
18              1.051954                94.385987    1.508914   

    Silhouette Score Norm  Dunn Index Norm  Davies-Bouldin Score Norm  \
22               0.660967         0.581145                   0.8486

Unnamed: 0,Scaler,Clusters,n_init,Init,Metric,Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Dunn Index,Silhouette Score Norm,Dunn Index Norm,Davies-Bouldin Score Norm,Calinski-Harabasz Score Norm,Combined Score
28,RobustScaler,5,20,k-means++,euclidean,0.362479,1.024614,91.328951,1.52728,0.707829,0.529393,0.849716,0.700595,0.696883
24,RobustScaler,5,10,k-means++,euclidean,0.362479,1.024614,91.328951,1.52728,0.707829,0.529393,0.849716,0.700595,0.696883
26,RobustScaler,5,10,random,euclidean,0.356517,1.033858,91.430142,1.460799,0.677181,0.471962,0.836086,0.702883,0.672028
22,RobustScaler,4,20,random,euclidean,0.353363,1.025355,94.371754,1.587187,0.660967,0.581145,0.848624,0.769392,0.715032
30,RobustScaler,5,20,random,euclidean,0.35296,1.007296,91.340188,1.474518,0.658897,0.483813,0.87525,0.700849,0.679702


In [None]:
# Write every results table to its own CSV in the working directory,
# named after the dataset key.
for k in results:
    results[k].to_csv(f"{k}_Kmeans_1S.csv")