In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.pipeline import Pipeline
import numpy as np

In [34]:
df = pd.read_csv('/content/jewelleryDataSet.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,SpendingScore,Savings
0,0,58,77769,0.791329,6559.829923
1,1,59,81799,0.791082,5417.661426
2,2,62,74751,0.702657,9258.992965
3,3,59,74373,0.76568,7346.334504
4,4,87,17760,0.348778,16869.50713


In [35]:
X=df.values

In [36]:
X

array([[0.00000000e+00, 5.80000000e+01, 7.77690000e+04, 7.91328777e-01,
        6.55982992e+03],
       [1.00000000e+00, 5.90000000e+01, 8.17990000e+04, 7.91082047e-01,
        5.41766143e+03],
       [2.00000000e+00, 6.20000000e+01, 7.47510000e+04, 7.02656952e-01,
        9.25899297e+03],
       ...,
       [5.02000000e+02, 9.00000000e+01, 3.52970000e+04, 3.55149019e-01,
        1.60914020e+04],
       [5.03000000e+02, 9.10000000e+01, 2.06810000e+04, 3.54679169e-01,
        1.84010884e+04],
       [5.04000000e+02, 8.90000000e+01, 3.02670000e+04, 2.89310066e-01,
        1.43863519e+04]])

In [37]:
#columns=['Gender','family_history_with_overweight','CAEC','SMOKE','CALC','MTRANS','NObeyesdad']

In [38]:

#from sklearn.preprocessing import LabelEncoder
#le = LabelEncoder()
#for col in columns:
  #df[col] = le.fit_transform(df[col])


In [39]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,SpendingScore,Savings
0,0,58,77769,0.791329,6559.829923
1,1,59,81799,0.791082,5417.661426
2,2,62,74751,0.702657,9258.992965
3,3,59,74373,0.76568,7346.334504
4,4,87,17760,0.348778,16869.50713


In [40]:
def compute_metrics(X, labels):
    n_clusters = len(set(labels))


    if n_clusters > 1:
        silhouette = silhouette_score(X, labels)
        calinski_harabasz = calinski_harabasz_score(X, labels)
        davies_bouldin = davies_bouldin_score(X, labels)
    else:
        silhouette = np.nan
        calinski_harabasz = np.nan
        davies_bouldin = np.nan

    return {
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin
    }

In [41]:
def get_preprocessing_pipeline(with_normalization=False, with_pca=False, with_transform=False):
    steps = []

    if with_normalization:
        steps.append(('scaler', StandardScaler()))

    if with_transform:
        steps.append(('custom_transform', StandardScaler()))

    if with_pca:
        steps.append(('pca', PCA(n_components=2)))

    return Pipeline(steps) if steps else None

In [42]:
clustering_methods = {
    "KMeans": KMeans,
    "Hierarchical": AgglomerativeClustering,
    "MeanShift": MeanShift
}

In [43]:
preprocessing_scenarios = {
    "No Data Processing": (False, False, False),
    "Using Normalization": (True, False, False),
    "Using Transform": (False, False, True),
    "Using PCA": (False, True, False),
    "Using T+N": (True, False, True),
    "T+N+PCA": (True, True, True)
}

In [44]:
cluster_values = [3, 4, 5]

In [45]:
results = []

In [46]:
for clustering_method_name, clustering_method in clustering_methods.items():
    for scenario_name, (with_normalization, with_pca, with_transform) in preprocessing_scenarios.items():

        preprocessing_pipeline = get_preprocessing_pipeline(with_normalization, with_pca, with_transform)


        if preprocessing_pipeline:
            X_transformed = preprocessing_pipeline.fit_transform(X)
        else:
            X_transformed = X


        if clustering_method_name == "MeanShift":
            cluster_model = clustering_method()
            labels = cluster_model.fit_predict(X_transformed)


            metrics = compute_metrics(X_transformed, labels)
            metrics["Parameters"] = clustering_method_name
            metrics["Scenario"] = scenario_name
            metrics["Clusters (c)"] = "Auto"


            results.append(metrics)

        else:

            for c in cluster_values:
                cluster_model = clustering_method(n_clusters=c)
                labels = cluster_model.fit_predict(X_transformed)


                metrics = compute_metrics(X_transformed, labels)
                metrics["Parameters"] = clustering_method_name
                metrics["Scenario"] = scenario_name
                metrics["Clusters (c)"] = c


                results.append(metrics)

In [47]:
df_results = pd.DataFrame(results)

In [48]:
df_results

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Parameters,Scenario,Clusters (c)
0,0.739201,3566.668168,0.343671,KMeans,No Data Processing,3
1,0.721496,5007.199788,0.408418,KMeans,No Data Processing,4
2,0.681269,4628.907459,0.470079,KMeans,No Data Processing,5
3,0.508922,464.62988,0.86045,KMeans,Using Normalization,3
4,0.512855,387.520712,0.783037,KMeans,Using Normalization,4
5,0.426912,386.273544,0.929068,KMeans,Using Normalization,5
6,0.508922,464.62988,0.86045,KMeans,Using Transform,3
7,0.522598,443.796901,0.813131,KMeans,Using Transform,4
8,0.49083,419.92704,0.806659,KMeans,Using Transform,5
9,0.739317,3567.544408,0.343527,KMeans,Using PCA,3


In [49]:
output_file = 'clustering_results.xlsx'
df_results.to_excel(output_file, index=False)