<a href="https://colab.research.google.com/github/Preetish2603/Clustering---Assignment-4--UCS654/blob/main/clustering_assignment_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load the Iris dataset and wrap its feature matrix in a DataFrame whose
# columns carry the dataset's own feature names.
data = load_iris()
X = data["data"]
df = pd.DataFrame(data=X, columns=data["feature_names"])

# Define Preprocessing Functions
def normalize(X):
    """Return X rescaled by a MinMaxScaler (default settings) fitted on X itself."""
    scaler = MinMaxScaler()
    return scaler.fit_transform(X)

def standardize(X):
    """Return X rescaled by a StandardScaler (default settings) fitted on X itself."""
    scaler = StandardScaler()
    return scaler.fit_transform(X)

def apply_pca(X, n_components=2):
    """Fit a PCA with ``n_components`` components on X and return X projected onto them."""
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(X)

def log_transform(X):
    """Return the element-wise ``log(1 + x)`` of X, computed with ``np.log1p``."""
    transformed = np.log1p(X)
    return transformed

# Define evaluation function
def evaluate_clustering(X, labels):
    """Score a clustering of X with three internal validity metrics.

    Args:
        X: Feature matrix that was clustered, one row per sample.
        labels: Cluster label assigned to each row of X.

    Returns:
        A dict with keys 'Silhouette', 'Calinski-Harabasz' and
        'Davies-Bouldin', each rounded to 2 decimals. When the labelling
        has fewer than 2 distinct clusters, or as many clusters as samples,
        every value is NaN instead.
    """
    n_labels = len(np.unique(labels))
    n_samples = len(labels)

    # scikit-learn's metrics raise ValueError unless
    # 2 <= n_labels <= n_samples - 1. A degenerate labelling (e.g. an
    # algorithm that merges everything into one cluster) should not abort
    # a whole batch of experiments, so report NaN scores for it.
    if not 2 <= n_labels <= n_samples - 1:
        return {
            'Silhouette': float('nan'),
            'Calinski-Harabasz': float('nan'),
            'Davies-Bouldin': float('nan'),
        }

    return {
        'Silhouette': round(silhouette_score(X, labels), 2),
        'Calinski-Harabasz': round(calinski_harabasz_score(X, labels), 2),
        'Davies-Bouldin': round(davies_bouldin_score(X, labels), 2)
    }

# Preprocessing pipelines, keyed by the label used in the results table.
# In the labels, "T" is the log transform and "N" is min-max normalization.
def _log_then_normalize(X):
    """Log-transform X, then min-max normalize the result."""
    return normalize(log_transform(X))


def _log_normalize_pca(X):
    """Log-transform, normalize, then project onto the PCA components."""
    return apply_pca(_log_then_normalize(X))


scenarios = {
    "Raw": lambda X: X,
    "Normalized": normalize,
    "Log Transformed": log_transform,
    "PCA": apply_pca,
    "T+N": _log_then_normalize,
    "T+N+PCA": _log_normalize_pca,
}

# Factories that map a requested cluster count k to an unfitted estimator.
clustering_algorithms = {
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' in 1.4 (and emits a FutureWarning in 1.2/1.3), so relying on
    # the default makes results depend on the installed version.
    "KMeans": lambda k: KMeans(n_clusters=k, n_init=10, random_state=42),
    "Hierarchical": lambda k: AgglomerativeClustering(n_clusters=k),
    # MeanShift is built without a cluster count, so the argument is ignored.
    "MeanShift": lambda _: MeanShift(),
}

# One record (metric scores + configuration) per experiment run.
results = []

# Evaluate every algorithm on every preprocessed variant of the data.
for method_name, preprocess_func in scenarios.items():
    X_proc = preprocess_func(df.values)

    for algo_name, algo_func in clustering_algorithms.items():
        is_mean_shift = algo_name == "MeanShift"
        # MeanShift is run once with no requested cluster count; the other
        # algorithms are tried with 3, 4 and 5 clusters.
        candidate_ks = [None] if is_mean_shift else [3, 4, 5]

        for k in candidate_ks:
            model = algo_func(k)
            labels = model.fit_predict(X_proc)

            # For MeanShift, report the number of clusters actually found;
            # otherwise report the requested k.
            if is_mean_shift:
                n_clusters = len(np.unique(labels))
            else:
                n_clusters = k

            scores = evaluate_clustering(X_proc, labels)
            scores["Preprocessing"] = method_name
            scores["Algorithm"] = algo_name
            scores["Clusters"] = n_clusters
            results.append(scores)

# Tabulate the collected records, indexed by algorithm / preprocessing /
# cluster count, and print the table.
results_df = pd.DataFrame(results)
metric_columns = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
config_columns = ['Algorithm', 'Preprocessing', 'Clusters']
pivot_table = results_df.pivot_table(values=metric_columns, index=config_columns)

print("\n=== Evaluation Results ===")
print(pivot_table)

# Pick the configuration with the best combined score.

# Min-max scale each metric across all runs so the three are comparable.
normalized = results_df.copy()
for metric_name, scaled_name in (
    ('Silhouette', 'Silhouette_N'),
    ('Calinski-Harabasz', 'CH_N'),
    ('Davies-Bouldin', 'DB_N'),
):
    metric = normalized[metric_name]
    normalized[scaled_name] = (metric - metric.min()) / (metric.max() - metric.min())

# Davies-Bouldin is "lower is better", so flip it to match the other two.
normalized['DB_N'] = 1 - normalized['DB_N']

# Overall score is the plain average of the three scaled metrics.
normalized['Overall_Score'] = (normalized['Silhouette_N'] + normalized['CH_N'] + normalized['DB_N']) / 3

# The highest overall score wins.
ranked = normalized.sort_values(by='Overall_Score', ascending=False)
best_row = ranked.iloc[0]
print("\n=== 🔍 Best Clustering Configuration ===")
print(f"Algorithm     : {best_row['Algorithm']}")
print(f"Preprocessing : {best_row['Preprocessing']}")
print(f"Clusters      : {best_row['Clusters']}")
print(f"Silhouette    : {best_row['Silhouette']}")
print(f"Calinski-Harabasz : {best_row['Calinski-Harabasz']}")
print(f"Davies-Bouldin    : {best_row['Davies-Bouldin']}")
print(f"Overall Score     : {round(best_row['Overall_Score'], 3)}")



=== Evaluation Results ===
                                       Calinski-Harabasz  Davies-Bouldin  \
Algorithm    Preprocessing   Clusters                                      
Hierarchical Log Transformed 3                    974.18            0.63   
                             4                    786.66            0.72   
                             5                    671.50            0.84   
             Normalized      3                    349.25            0.75   
                             4                    301.10            0.85   
                             5                    272.02            0.91   
             PCA             3                    688.62            0.56   
                             4                    673.95            0.65   
                             5                    665.88            0.65   
             Raw             3                    558.06            0.66   
                             4                    515.08    