In [None]:
# 🔍 t-SNE + KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

# Embed the scaled feature matrix into two dimensions with t-SNE, cluster the
# embedding with KMeans, then score and plot the clustering.
# NOTE: the embedding columns keep the names "PC1"/"PC2" they were created
# with, even though they are t-SNE dimensions rather than principal components.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
tsne_data = pd.DataFrame(tsne.fit_transform(scaled_data), columns=["PC1", "PC2"])

k = 6
kmeans_tsne = KMeans(n_clusters=k, random_state=42)
# Fit on the two embedding columns explicitly; the label column is attached
# to the same frame only after the fit.
tsne_data["cluster"] = kmeans_tsne.fit_predict(tsne_data[["PC1", "PC2"]])

# Scores
# Evaluate on the embedding coordinates only. Passing the whole frame would
# feed the "cluster" label column in as a third feature, leaking the labels
# into the distance computations and distorting all three scores.
tsne_coords = tsne_data[["PC1", "PC2"]]
sil = silhouette_score(tsne_coords, tsne_data["cluster"])
dbi = davies_bouldin_score(tsne_coords, tsne_data["cluster"])
ch = calinski_harabasz_score(tsne_coords, tsne_data["cluster"])
print(f"K={k} | Silhouette={sil:.4f} | DBI={dbi:.4f} | CH={ch:.4f}")

# Plot
sns.scatterplot(data=tsne_data, x='PC1', y='PC2', hue='cluster', palette='tab10')
plt.title(f'tSNE + KMeans Clustering (K={k})')
plt.grid(True)
plt.show()


🔹 GMM (Soft Clustering) on Output Features Only

In [None]:
from sklearn.mixture import GaussianMixture

# Project the selected feature columns onto two principal components, compare
# BIC/AIC across candidate mixture sizes, then fit and plot a Gaussian mixture
# with the chosen number of components.
X = scaled_data.iloc[:, :-3]  # Drop the last three columns (C, S, G)
pca_out = PCA(n_components=2)
X_pca_out = pd.DataFrame(pca_out.fit_transform(X), columns=["pca1", "pca2"])

# Candidate component counts, defined once and shared by the loop and the plot.
# The loop variable is deliberately not called `k`, so it does not overwrite
# the notebook-level `k` assigned in the earlier t-SNE + KMeans cell.
component_range = range(1, 10)
bics, aics = [], []
for n_comp in component_range:
    candidate = GaussianMixture(n_components=n_comp, random_state=42)
    candidate.fit(X_pca_out)
    bics.append(candidate.bic(X_pca_out))
    aics.append(candidate.aic(X_pca_out))

plt.plot(component_range, bics, label="BIC", marker='o')
plt.plot(component_range, aics, label="AIC", marker='o')
plt.title("GMM Clustering - BIC & AIC")
plt.xlabel("Number of components")
plt.ylabel("Information criterion")
plt.legend()
plt.show()

# Fit optimal
# NOTE(review): 5 is hard-coded, presumably read off the BIC/AIC curve above —
# confirm it still matches the plot if the data changes.
optimal_k = 5
gmm = GaussianMixture(n_components=optimal_k, random_state=42)
# Fit before attaching the label column, so only pca1/pca2 are used as features.
labels = gmm.fit_predict(X_pca_out)
X_pca_out["cluster"] = labels

sns.scatterplot(data=X_pca_out, x='pca1', y='pca2', hue='cluster', palette="tab10")
plt.title(f'GMM Clustering (K={optimal_k})')
plt.grid(True)
plt.show()
