In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, pairwise_distances
from scipy.spatial.distance import euclidean, cityblock
from scipy.cluster.hierarchy import linkage, dendrogram
from collections import defaultdict
import itertools

In [4]:
df_up_down = pd.read_csv("UP/1.csv", index_col=1)
df_up_down.drop('Unnamed: 0', axis=1, inplace=True)

df_right_left = pd.read_csv("UP/2.csv", index_col=1)
df_right_left.drop('Unnamed: 0', axis=1, inplace=True)

box =pd.concat([df_up_down, df_right_left])

X = box.copy()
X

Unnamed: 0_level_0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12:47:27.22,-7.98680,-6.52642,0.28367,0.03333,0.00000,-0.00000
12:47:27.26,-7.98680,-6.52642,0.28367,0.03333,0.00000,-0.00000
12:47:27.28,-8.09339,-6.70965,0.28997,0.03333,0.00000,-0.00000
12:47:27.32,-8.09339,-6.70965,0.28997,0.03333,0.00000,-0.00000
12:47:27.34,-9.26388,-6.97198,0.30954,0.03333,0.00000,-0.00000
...,...,...,...,...,...,...
12:56:38.14,0.34078,-0.51538,0.36650,-0.13333,0.00000,-0.00000
12:56:38.17,0.39978,-0.52438,0.33781,-0.06667,0.06667,-0.03333
12:56:38.20,0.35078,-0.46638,0.29524,0.00000,0.16667,-0.06667
12:56:38.23,0.41978,-0.48538,0.42731,-0.13333,0.00000,-0.10000


In [5]:
scalers = {
    "RobustScaler": RobustScaler(),
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "PowerTransformer": PowerTransformer(method='yeo-johnson')
}

k_values = [4, 5]
n_inits = [10, 20]
init_methods = ['k-means++', 'random']
metrics = ['euclidean', 'manhattan']

results = []

In [6]:
def dunn_index(X, labels, centroids, metric='euclidean'):
    if metric == 'euclidean':
        pair_distances = pairwise_distances(X, centroids, metric='euclidean')
    else:
        pair_distances = pairwise_distances(X, centroids, metric='manhattan')
    
    intra_cluster_dists = [np.mean(pair_distances[np.where(labels == i)][:, i]) for i in range(len(centroids))]
    d_min = np.min([np.min([euclidean(centroids[i], centroids[j]) if metric == 'euclidean' 
                            else cityblock(centroids[i], centroids[j])
                            for j in range(len(centroids)) if j != i]) for i in range(len(centroids))])
    return d_min / np.max(intra_cluster_dists)

In [7]:
for scaler_name, scaler in scalers.items():
    X_scaled = scaler.fit_transform(X)
    for k, n_init, init, metric in itertools.product(k_values, n_inits, init_methods, metrics):
        # Definisci il modello KMeans
        kmeans = KMeans(n_clusters=k, n_init=n_init, init=init)
        kmeans.fit(X_scaled)
        labels = kmeans.labels_
        
        # Calcolo delle metriche
        silhouette = silhouette_score(X_scaled, labels, metric=metric)
        db_score = davies_bouldin_score(X_scaled, labels)
        ch_score = calinski_harabasz_score(X_scaled, labels)
        
        # Calcola i centroidi in base alla distanza specificata
        centroids = kmeans.cluster_centers_
        dunn = dunn_index(X_scaled, labels, centroids, metric=metric)
        
        # Aggiungi i risultati al DataFrame
        results.append({
            "Scaler": scaler_name,
            "Clusters": k,
            "n_init": n_init,
            "Init": init,
            "Metric": metric,
            "Silhouette Score": silhouette,
            "Davies-Bouldin Score": db_score,
            "Calinski-Harabasz Score": ch_score,
            "Dunn Index": dunn
        })

In [8]:
results_df = pd.DataFrame(results)
print(results_df)

              Scaler  Clusters  n_init       Init     Metric  \
0       RobustScaler         4      10  k-means++  euclidean   
1       RobustScaler         4      10  k-means++  manhattan   
2       RobustScaler         4      10     random  euclidean   
3       RobustScaler         4      10     random  manhattan   
4       RobustScaler         4      20  k-means++  euclidean   
..               ...       ...     ...        ...        ...   
59  PowerTransformer         5      10     random  manhattan   
60  PowerTransformer         5      20  k-means++  euclidean   
61  PowerTransformer         5      20  k-means++  manhattan   
62  PowerTransformer         5      20     random  euclidean   
63  PowerTransformer         5      20     random  manhattan   

    Silhouette Score  Davies-Bouldin Score  Calinski-Harabasz Score  \
0           0.431224              0.825487              4293.438105   
1           0.348461              0.826331              4293.433906   
2           0.4312

In [9]:
results_df['Silhouette Score Norm'] = (results_df['Silhouette Score'] - results_df['Silhouette Score'].min()) / (results_df['Silhouette Score'].max() - results_df['Silhouette Score'].min())
results_df['Dunn Index Norm'] = (results_df['Dunn Index'] - results_df['Dunn Index'].min()) / (results_df['Dunn Index'].max() - results_df['Dunn Index'].min())
results_df['Davies-Bouldin Score Norm'] = (results_df['Davies-Bouldin Score'].max() - results_df['Davies-Bouldin Score']) / (results_df['Davies-Bouldin Score'].max() - results_df['Davies-Bouldin Score'].min())
results_df['Calinski-Harabasz Score Norm'] = (results_df['Calinski-Harabasz Score'] - results_df['Calinski-Harabasz Score'].min()) / (results_df['Calinski-Harabasz Score'].max() - results_df['Calinski-Harabasz Score'].min())

results_df['Combined Score'] = (results_df['Silhouette Score Norm'] + 
                                 results_df['Dunn Index Norm'] + 
                                 results_df['Davies-Bouldin Score Norm'] + 
                                 results_df['Calinski-Harabasz Score Norm']) / 4

sorted_results_df = results_df.sort_values(by='Combined Score', ascending=False)

In [10]:
print(sorted_results_df[['Scaler', 'Clusters', 'n_init', 'Init', 'Metric', 
                          'Silhouette Score', 'Dunn Index', 
                          'Davies-Bouldin Score', 'Calinski-Harabasz Score', 
                          'Combined Score']])

              Scaler  Clusters  n_init       Init     Metric  \
0       RobustScaler         4      10  k-means++  euclidean   
2       RobustScaler         4      10     random  euclidean   
4       RobustScaler         4      20  k-means++  euclidean   
6       RobustScaler         4      20     random  euclidean   
5       RobustScaler         4      20  k-means++  manhattan   
..               ...       ...     ...        ...        ...   
63  PowerTransformer         5      20     random  manhattan   
18    StandardScaler         4      10     random  euclidean   
22    StandardScaler         4      20     random  euclidean   
56  PowerTransformer         5      10  k-means++  euclidean   
17    StandardScaler         4      10  k-means++  manhattan   

    Silhouette Score  Dunn Index  Davies-Bouldin Score  \
0           0.431224    1.298961              0.825487   
2           0.431224    1.298961              0.825487   
4           0.431224    1.298961              0.825487   

In [11]:
print(sorted_results_df[['Clusters',
                          'Silhouette Score', 'Dunn Index', 
                          'Davies-Bouldin Score', 'Calinski-Harabasz Score', 
                          'Combined Score']])

    Clusters  Silhouette Score  Dunn Index  Davies-Bouldin Score  \
0          4          0.431224    1.298961              0.825487   
2          4          0.431224    1.298961              0.825487   
4          4          0.431224    1.298961              0.825487   
6          4          0.431224    1.298961              0.825487   
5          4          0.348893    0.826500              0.825487   
..       ...               ...         ...                   ...   
63         5          0.233637    0.709457              1.438729   
18         4          0.220626    0.767588              1.499829   
22         4          0.220626    0.767588              1.499829   
56         5          0.216501    0.750439              1.447375   
17         4          0.216957    0.740036              1.499687   

    Calinski-Harabasz Score  Combined Score  
0               4293.438105        1.000000  
2               4293.438105        1.000000  
4               4293.438105        1.000000  

In [12]:
sorted_results_df.to_csv("BOX_KMeans.csv")