<a href="https://colab.research.google.com/github/Tanya-Sood/clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import estimate_bandwidth
from IPython.display import display
import zipfile
import urllib.request
import io
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Location of the zipped Air Quality CSV on the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"

# Download the archive into memory; the context manager closes the HTTP
# connection as soon as the bytes have been read.
with urllib.request.urlopen(url) as response:
    archive_bytes = response.read()

# The CSV uses ';' as the field separator and ',' as the decimal mark, and
# -200 is read as a missing value. The Date/Time columns are not parsed:
# they are discarded below, and the nested-list form of `parse_dates` is
# deprecated in recent pandas releases.
with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zip_file:
    with zip_file.open('AirQualityUCI.csv') as csv_file:
        raw_df = pd.read_csv(csv_file, sep=';', decimal=',', na_values=-200)

# Build the modelling frame without mutating `raw_df`, so re-running this cell
# is idempotent:
#   1. drop columns that are entirely empty,
#   2. drop every row that still has a missing value in any remaining column,
#   3. keep the contiguous feature columns from 'PT08.S1(CO)' through 'T'
#      (label slices are inclusive), selected by name rather than by position.
df = (
    raw_df
    .dropna(axis=1, how='all')
    .dropna()
    .loc[:, 'PT08.S1(CO)':'T']
)

In [12]:
# Preview the first rows of the cleaned feature table used by every clustering run.
df.head()

Unnamed: 0,PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T
0,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6
1,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3
2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9
3,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0
4,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2


In [13]:
def normalize(data):
    """Return `data` rescaled by a freshly fitted MinMaxScaler (default settings).

    The scaler is fitted on `data` itself and then discarded; only the
    transformed values are returned.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)
    return scaled

def transform(data):
    """Return `data` passed through a freshly fitted PowerTransformer (default settings).

    The transformer is fitted on `data` itself and is not kept afterwards.
    """
    power = PowerTransformer()
    transformed = power.fit_transform(data)
    return transformed

def apply_pca(data, n=2):
    """Project `data` onto `n` principal components (2 by default).

    A new PCA model is fitted on `data` for every call and only the projected
    values are returned.
    """
    reducer = PCA(n_components=n)
    projected = reducer.fit_transform(data)
    return projected

In [14]:
def evaluate_clustering(data, labels):
    """Score a clustering with three internal validity metrics.

    Parameters
    ----------
    data : array-like
        The samples that were clustered.
    labels : sequence
        Cluster label assigned to each sample, in the same order as `data`.

    Returns
    -------
    list
        ``[silhouette, calinski_harabasz, davies_bouldin]`` where the
        silhouette and Davies-Bouldin values are rounded to two decimals and
        the Calinski-Harabasz value is truncated to an int.
        For a degenerate labelling the sentinel ``[0, 0, np.inf]`` is returned
        instead of calling the metrics.
    """
    n_clusters = len(set(labels))
    # The scikit-learn metrics used below are only defined when
    # 2 <= n_clusters <= n_samples - 1 and raise ValueError otherwise, so an
    # empty labelling, a single cluster, or one cluster per sample all map to
    # the same "worst possible" sentinel.
    if n_clusters < 2 or n_clusters >= len(labels):
        return [0, 0, np.inf]
    return [
        round(silhouette_score(data, labels), 2),
        int(calinski_harabasz_score(data, labels)),
        round(davies_bouldin_score(data, labels), 2)
    ]

In [15]:
# Fit the power transform (and the normalisation applied on top of it) once and
# reuse the result in every variant that needs it, instead of refitting the
# same deterministic preprocessing for each dictionary entry.
df_transformed = transform(df)
df_transformed_normalized = normalize(df_transformed)

# Preprocessing variants compared by every clustering algorithm. The insertion
# order is significant: it defines the column order of the result tables.
prep_variants = {
    "No Data Processing": df,
    "Using Normalization": normalize(df),
    "Using Transform": df_transformed,
    "Using PCA": apply_pca(df),
    "Using T+N": df_transformed_normalized,
    "Using T+N+PCA": apply_pca(df_transformed_normalized)
}

In [16]:
def run_clustering(algo, algo_name):
    """Fit one clustering algorithm on every preprocessing variant and score it.

    Parameters
    ----------
    algo : object
        Not used; kept so existing calls of the form
        ``run_clustering(KMeans, 'KMeans')`` keep working. The algorithm is
        selected by `algo_name` only.
    algo_name : str
        One of ``'KMeans'``, ``'Hierarchical'`` or ``'MeanShift'``.

    Returns
    -------
    tuple of dict
        ``(silhouette, ch_score, db_score)``. Each dict maps a key of
        `prep_variants` to a list with one entry per cluster count in
        ``[3, 4, 5]``. An entry is the metric value, or the string ``"NA"``
        when the model could not be fitted or produced at most one cluster.

    Raises
    ------
    ValueError
        If `algo_name` is not one of the supported names.
    """
    supported = ('KMeans', 'Hierarchical', 'MeanShift')
    # Fail fast: previously an unknown name left `model` undefined (or stale
    # from an earlier iteration) and only failed later at `model.labels_`.
    if algo_name not in supported:
        raise ValueError(
            f"Unknown algo_name {algo_name!r}; expected one of {supported}"
        )

    cluster_counts = [3, 4, 5]
    not_available = ["NA", "NA", "NA"]

    silhouette = {v: [] for v in prep_variants.keys()}
    ch_score = {v: [] for v in prep_variants.keys()}
    db_score = {v: [] for v in prep_variants.keys()}

    def score(data, labels):
        # A labelling with at most one cluster cannot be scored meaningfully.
        if len(set(labels)) <= 1:
            return not_available
        return evaluate_clustering(data, labels)

    def mean_shift_scores(data):
        # Best-effort: any failure while estimating the bandwidth or fitting is
        # reported as "NA". `Exception` (not a bare except) is caught so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        try:
            bandwidth = estimate_bandwidth(data, quantile=0.1)
            if bandwidth <= 0 or np.isnan(bandwidth):
                return not_available
            model = MeanShift(bandwidth=bandwidth).fit(data)
        except Exception:
            return not_available
        return score(data, model.labels_)

    for variant, processed_data in prep_variants.items():
        if algo_name == 'MeanShift':
            # MeanShift takes no cluster count, so the fit does not depend on
            # k: fit once and report the same scores for every k column.
            per_k_scores = [mean_shift_scores(processed_data)] * len(cluster_counts)
        else:
            per_k_scores = []
            for k in cluster_counts:
                if algo_name == 'KMeans':
                    model = KMeans(n_clusters=k, random_state=42).fit(processed_data)
                else:  # 'Hierarchical'
                    model = AgglomerativeClustering(n_clusters=k).fit(processed_data)
                per_k_scores.append(score(processed_data, model.labels_))

        for s, c, d in per_k_scores:
            silhouette[variant].append(s)
            ch_score[variant].append(c)
            db_score[variant].append(d)

    return silhouette, ch_score, db_score

In [17]:
def format_dataframe(title, sil, ch, db):
    """Assemble the three metric dictionaries into one table, show it, and return it.

    `sil`, `ch` and `db` each map a `prep_variants` key to a list whose first
    three entries are the scores for c=3, c=4 and c=5. The resulting frame has
    one row per metric and a two-level column index (preprocessing variant,
    cluster count). A heading containing `title` is printed before the frame
    is displayed.
    """
    variant_names = list(prep_variants.keys())

    row_index = pd.Index(
        ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins'],
        name='Parameters',
    )
    column_index = pd.MultiIndex.from_product(
        [variant_names, ['c=3', 'c=4', 'c=5']],
        names=['Preprocessing', 'Clusters'],
    )

    # One flat row per metric, ordered variant-major then cluster count, which
    # matches the order produced by MultiIndex.from_product above.
    table_rows = [
        [scores[name][pos] for name in variant_names for pos in range(3)]
        for scores in (sil, ch, db)
    ]

    df_final = pd.DataFrame(table_rows, index=row_index, columns=column_index)
    print(f"\n🔹 {title} Table Format:\n")
    display(df_final)
    return df_final

In [18]:
# Run each algorithm over all preprocessing variants, then render its table.
# Scores are computed and displayed one algorithm at a time, in this order.
kmeans_scores = run_clustering(KMeans, 'KMeans')
k_df = format_dataframe("KMeans Clustering", *kmeans_scores)

hierarchical_scores = run_clustering(AgglomerativeClustering, 'Hierarchical')
h_df = format_dataframe("Hierarchical Clustering", *hierarchical_scores)

mean_shift_scores = run_clustering(MeanShift, 'MeanShift')
m_df = format_dataframe("Mean Shift Clustering", *mean_shift_scores)


🔹 KMeans Clustering Table Format:



Preprocessing,No Data Processing,No Data Processing,No Data Processing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Clusters,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Parameters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Silhouette,0.44,0.39,0.35,0.36,0.31,0.25,0.36,0.29,0.26,0.49,0.44,0.42,0.36,0.29,0.27,0.43,0.37,0.35
Calinski-Harabasz,1399.0,1313.0,1242.0,924.0,809.0,704.0,1046.0,895.0,754.0,1726.0,1717.0,1712.0,1039.0,878.0,799.0,1344.0,1194.0,1162.0
Davies-Bouldins,0.73,0.84,0.89,0.93,1.1,1.2,0.92,1.07,1.25,0.64,0.73,0.74,0.92,1.07,1.17,0.77,0.87,0.93



🔹 Hierarchical Clustering Table Format:



Preprocessing,No Data Processing,No Data Processing,No Data Processing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Clusters,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Parameters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Silhouette,0.43,0.39,0.32,0.35,0.31,0.26,0.35,0.28,0.24,0.47,0.41,0.4,0.31,0.27,0.24,0.38,0.35,0.32
Calinski-Harabasz,1314.0,1133.0,1147.0,826.0,777.0,672.0,988.0,800.0,731.0,1524.0,1622.0,1564.0,897.0,793.0,721.0,1206.0,1032.0,995.0
Davies-Bouldins,0.74,0.82,0.94,0.89,1.11,1.14,0.96,1.11,1.22,0.63,0.73,0.71,1.01,1.08,1.18,0.82,0.96,0.95



🔹 Mean Shift Clustering Table Format:



Preprocessing,No Data Processing,No Data Processing,No Data Processing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Clusters,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Parameters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Silhouette,0.31,0.31,0.31,0.27,0.27,0.27,0.33,0.33,0.33,0.41,0.41,0.41,0.33,0.33,0.33,0.3,0.3,0.3
Calinski-Harabasz,689.0,689.0,689.0,266.0,266.0,266.0,844.0,844.0,844.0,1010.0,1010.0,1010.0,862.0,862.0,862.0,964.0,964.0,964.0
Davies-Bouldins,1.01,1.01,1.01,1.24,1.24,1.24,1.03,1.03,1.03,0.76,0.76,0.76,1.03,1.03,1.03,1.06,1.06,1.06


## Final Observations

1. **Preprocessing has a major impact** on clustering quality.
   - Plain PCA on the raw features gives the best scores in every table. Adding PCA to transformation + normalization (T+N+PCA) improves on T+N alone for KMeans and Hierarchical clustering, but does not beat plain PCA.

2. **KMeans** consistently produced meaningful clusters across all preprocessing variants, achieving the highest **Calinski-Harabasz** scores and **Davies-Bouldin** scores comparable to those of Hierarchical clustering.

3. **Hierarchical Clustering** showed stable and slightly lower performance compared to KMeans, but still formed reasonable groupings.

4. **Mean Shift Clustering** was highly sensitive to data scaling and density. Without tuning:
   - It produced identical results in the c=3, c=4 and c=5 columns, because MeanShift does not take `n_clusters` as a parameter.
   - The number of clusters is determined by the estimated bandwidth, which may need manual tuning for real-world datasets.

5. **PCA preprocessing** helped all clustering algorithms by reducing dimensionality and noise, leading to better separation in most cases.

6. The **Davies-Bouldin score** was lowest (best) for the plain PCA variant in all three models, indicating tighter intra-cluster cohesion and better inter-cluster separation.

**Conclusion**: For this dataset, **KMeans with PCA** preprocessing yielded the best balance of interpretability and performance.