In [None]:
# 📦 KMeans Clustering on PCA(2) data
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import ParameterGrid

# Load the encoded dataset and keep only the feature columns.
full_data = pd.read_csv('3_direct_encoded.csv')
data_v2 = full_data.iloc[:, 2:]  # Exclude ID & Parameters (the first two columns)

# Standardize every feature, then wrap the result back into a DataFrame
# so the original column names are preserved.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_v2)
scaled_data = pd.DataFrame(scaled_values, columns=data_v2.columns)

# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_data)
pca_data = pd.DataFrame(pca_components, columns=["PC1", "PC2"])

# Elbow & Silhouette Analysis
# For each candidate K: fit KMeans on the first 80% of the rows, record the
# training inertia, and score the held-out 20% with the silhouette coefficient.
params = {'n_clusters': range(2, 20)}
k_values = []
inertia_values = []
silhouette_values = []

# Positional 80/20 hold-out split.
# NOTE(review): the rows are not shuffled; if the CSV is ordered, the last 20%
# may not be representative of the rest — confirm this is intended.
divide = int(len(pca_data) * 0.8)
pca_data_train = pca_data[:divide]
pca_data_test = pca_data[divide:]

for param in ParameterGrid(params):
    kmeans = KMeans(n_clusters=param['n_clusters'], random_state=42)
    kmeans.fit(pca_data_train)
    preds = kmeans.predict(pca_data_test)

    # silhouette_score only accepts 2 <= n_labels <= n_samples - 1. The
    # hold-out predictions may occupy fewer clusters than K, so record NaN
    # for such a K instead of letting a ValueError abort the whole sweep.
    n_labels = len(set(preds))
    if 2 <= n_labels <= len(pca_data_test) - 1:
        score = silhouette_score(pca_data_test, preds)
    else:
        score = float('nan')

    k_values.append(param['n_clusters'])
    inertia_values.append(kmeans.inertia_)
    silhouette_values.append(score)
    print(f"K={param['n_clusters']}, Silhouette Score={score:.4f}")

# Elbow Plot
# Build the frame from the K values actually swept (not a second hard-coded
# range) so it stays in sync with `params`; the silhouette scores are kept
# alongside the inertia for later inspection.
elbow_df = pd.DataFrame({
    'K': k_values,
    'Inertia': inertia_values,
    'Silhouette': silhouette_values,
})
sns.lineplot(data=elbow_df, x="K", y="Inertia", marker="o")
plt.title("Elbow Method")
plt.show()


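🔹 Clustering and Visualization for K = 5, 6, 7: fit KMeans on the full PCA(2) data for each K, print its silhouette score, and plot the resulting clusters.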

In [None]:
def run_kmeans_and_plot(pca_data, k, title):
    """Cluster ``pca_data`` with KMeans, report the silhouette score and plot.

    Parameters
    ----------
    pca_data : DataFrame
        Data to cluster. The scatter plot reads its ``PC1`` and ``PC2`` columns.
    k : int
        Number of clusters passed to ``KMeans``.
    title : str
        Title of the scatter plot.

    Returns
    -------
    tuple
        ``(model, pca_clustered)``: the fitted ``KMeans`` estimator and a copy
        of ``pca_data`` with an added ``cluster`` column of predicted labels.
    """
    estimator = KMeans(n_clusters=k, random_state=42)
    estimator.fit(pca_data)
    labels = estimator.predict(pca_data)
    silhouette = silhouette_score(pca_data, labels)

    # `assign` returns a new frame, so the caller's data is left untouched.
    pca_clustered = pca_data.assign(cluster=labels)

    print(f"\nSilhouette Score (K={k}): {silhouette:.4f}")

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(
        data=pca_clustered,
        x='PC1',
        y='PC2',
        hue='cluster',
        palette='tab10',
        s=60,
        ax=ax,
    )
    ax.set_title(title)
    ax.grid(True)
    plt.show()

    return estimator, pca_clustered

# Run for K = 5, 6, 7 (in that order), then expose each result under its own name.
cluster_runs = {
    k: run_kmeans_and_plot(pca_data, k, f"KMeans Clustering (K={k})")
    for k in (5, 6, 7)
}
kmeans_model_5, pca_clustered_5 = cluster_runs[5]
kmeans_model_6, pca_clustered_6 = cluster_runs[6]
kmeans_model_7, pca_clustered_7 = cluster_runs[7]
