<a href="https://colab.research.google.com/github/SuhanaSethi/Clustering-comparison/blob/main/CLUSTERING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install pycaret pandas

from pycaret.datasets import get_data
from pycaret.clustering import *
import pandas as pd

# Load the 'wine' dataset through pycaret.datasets; every experiment clusters it.
data = get_data('wine')

# Labels of the preprocessing pipelines to compare.
preprocessing_techniques = [
    'no_preprocessing',
    'normalize',
    'transformation',
    'pca',
    'transform_and_normalize',
    'transform_normalize_pca',
]

# Cluster counts requested from each algorithm.
cluster_numbers = [3, 4, 5]

# Model identifiers handed to create_model.
models_list = ['kmeans', 'hclust', 'meanshift']

# Maps (preprocessing, clusters, model) -> dict of metric values.
results = dict()

# Which PyCaret preprocessing steps each experiment label switches on.
# Combined labels must enable every step they name; comparing the label
# against a single step name would silently leave them with no preprocessing.
_PREPROCESSING_STEPS = {
    'no_preprocessing': frozenset(),
    'normalize': frozenset({'normalize'}),
    'transformation': frozenset({'transformation'}),
    'pca': frozenset({'pca'}),
    'transform_and_normalize': frozenset({'transformation', 'normalize'}),
    'transform_normalize_pca': frozenset({'transformation', 'normalize', 'pca'}),
}


def evaluate_models(preprocessing, clusters, model_name):
    """Fit one clustering model under one preprocessing pipeline and score it.

    Re-initialises the PyCaret experiment on the module-level ``data`` with
    the steps implied by ``preprocessing``, fits ``model_name`` and reads the
    metrics table that ``create_model`` produced.

    Args:
        preprocessing: One of the keys of ``_PREPROCESSING_STEPS``.
        clusters: Value passed to ``create_model`` as ``num_clusters``.
        model_name: Model identifier passed to ``create_model``.

    Returns:
        Tuple ``(silhouette, calinski_harabasz, davies_bouldin)`` of plain
        floats, so callers can store them directly as table cells.

    Raises:
        ValueError: If ``preprocessing`` is not a known pipeline label.
    """
    try:
        steps = _PREPROCESSING_STEPS[preprocessing]
    except KeyError:
        known = ', '.join(sorted(_PREPROCESSING_STEPS))
        raise ValueError(
            f"Unknown preprocessing {preprocessing!r}; expected one of: {known}"
        ) from None

    # setup() configures PyCaret's global experiment; its return value is not
    # needed here. session_id is kept fixed so repeated runs are comparable.
    setup(data,
          normalize='normalize' in steps,
          transformation='transformation' in steps,
          pca='pca' in steps,
          session_id=123, verbose=False)

    # NOTE(review): the recorded outputs for 'meanshift' are identical for
    # every cluster count, which suggests num_clusters is ignored for that
    # model -- confirm against the PyCaret documentation.
    create_model(model_name, num_clusters=clusters)

    # pull() returns the metrics grid as a one-row DataFrame; take the scalar
    # from that row instead of returning whole Series objects.
    metrics = pull()
    silhouette = float(metrics['Silhouette'].iloc[0])
    calinski_harabasz = float(metrics['Calinski-Harabasz'].iloc[0])
    davies_bouldin = float(metrics['Davies-Bouldin'].iloc[0])

    return silhouette, calinski_harabasz, davies_bouldin

# Score every (preprocessing, cluster count, model) combination and store the
# three metrics under that combination's key.
metric_labels = ('Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin')
for preprocess in preprocessing_techniques:
    for c in cluster_numbers:
        for model in models_list:
            run_key = (preprocess, c, model)
            silhouette, calinski, davies_bouldin = evaluate_models(*run_key)
            results[run_key] = dict(
                zip(metric_labels, (silhouette, calinski, davies_bouldin))
            )

# One row per combination (tuple keys become the row index), one column per metric.
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)

# Save the comparison table to a CSV file.
results_df.to_csv('clustering_results_wine.csv')




Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5036,13953.1224,0.6417,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4593,11671.3325,0.6587,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4514,14160.7156,0.7179,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.437,13305.753,0.7301,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4169,13917.3432,0.774,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3442,11613.0286,0.8048,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2356,2033.7262,1.511,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2149,1877.4671,1.58,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.29,56.5686,0.6327,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.177,1618.5688,1.8377,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1961,1508.5949,1.6739,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.29,56.5686,0.6327,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1564,1319.9421,1.9108,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1469,1307.3101,2.0254,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.29,56.5686,0.6327,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5609,18444.5214,0.5382,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5356,16028.5741,0.5206,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4715,4364.6228,0.3821,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5201,20995.1243,0.5669,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4988,19463.9527,0.5703,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4715,4364.6228,0.3821,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5047,23124.8442,0.5909,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4489,18920.3667,0.6004,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4715,4364.6228,0.3821,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5036,13953.1224,0.6417,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4593,11671.3325,0.6587,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4514,14160.7156,0.7179,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.437,13305.753,0.7301,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4169,13917.3432,0.774,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3442,11613.0286,0.8048,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5036,13953.1224,0.6417,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4593,11671.3325,0.6587,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4514,14160.7156,0.7179,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.437,13305.753,0.7301,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4169,13917.3432,0.774,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3442,11613.0286,0.8048,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5036,13953.1224,0.6417,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4593,11671.3325,0.6587,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4514,14160.7156,0.7179,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.437,13305.753,0.7301,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4169,13917.3432,0.774,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3442,11613.0286,0.8048,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3846,3625.9526,0.735,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

                                                                       Silhouette  \
no_preprocessing        3 kmeans     0    0.5036
Name: Silhouette, dtype: float64   
                          hclust     0    0.4593
Name: Silhouette, dtype: float64   
                          meanshift  0    0.3846
Name: Silhouette, dtype: float64   
                        4 kmeans     0    0.4514
Name: Silhouette, dtype: float64   
                          hclust      0    0.437
Name: Silhouette, dtype: float64   
                          meanshift  0    0.3846
Name: Silhouette, dtype: float64   
                        5 kmeans     0    0.4169
Name: Silhouette, dtype: float64   
                          hclust     0    0.3442
Name: Silhouette, dtype: float64   
                          meanshift  0    0.3846
Name: Silhouette, dtype: float64   
normalize               3 kmeans     0    0.2356
Name: Silhouette, dtype: float64   
                          hclust     0    0.2149
Name: Silhouette