In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Location of the input CSV. Edit this single constant to analyse another
# dataset; the file must provide every column listed in `features` below.
DATA_PATH = "C:/Users/varsh/Downloads/wine-clustering-extended.csv"  # <-- change dataset here

df = pd.read_csv(DATA_PATH)

# Columns that are scaled, projected with PCA and clustered further down.
features = ['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium',
            'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols',
            'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline']

X = df[features]

# StandardScaler lets NaN pass through untouched, and the failure would then
# only surface later (in PCA) with a generic message. Fail here instead and
# name the offending columns so the data can be cleaned or imputed.
missing_counts = X.isna().sum()
if missing_counts.any():
    raise ValueError(
        "Missing values in feature columns: "
        f"{missing_counts[missing_counts > 0].to_dict()}"
    )

# Standardise each feature (zero mean, unit variance) so that columns with
# large numeric ranges do not dominate the PCA directions.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Project the standardised features with PCA. 'mle' asks PCA to estimate the
# number of components itself rather than fixing it up front, so the width of
# X_pca is only known after fitting.
pca = PCA(svd_solver='full', n_components='mle')
X_pca = pca.fit_transform(X_scaled)

# Report how many components were kept and how much variance they carry.
explained = pca.explained_variance_ratio_
print("Number of components selected by PCA:", pca.n_components_)
print("Explained variance ratio:", explained)
print("Cumulative explained variance:", explained.cumsum())


# Cluster the PCA-projected samples into 4 groups. random_state pins the
# initialisation so repeated runs give the same labels; n_init=20 keeps the
# best of 20 differently-seeded runs.
kmeans = KMeans(n_clusters=4, n_init=20, random_state=42)
kmeans.fit(X_pca)
labels = kmeans.labels_

# Silhouette score of the resulting partition, computed in the same PCA space
# that was used for clustering.
score = silhouette_score(X_pca, labels)
print("Silhouette Score:", score)


# Visualise the clusters in PCA space. PCA with n_components='mle' may keep
# fewer than two components, so the 1-D case is drawn as a strip plot instead
# of silently producing no figure at all.
if X_pca.shape[1] >= 2:
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="viridis")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("KMeans Clustering after PCA")
    plt.show()
elif X_pca.shape[1] == 1:
    # Only PC1 is available: place every sample on a common baseline so the
    # cluster colouring along PC1 is still visible.
    plt.scatter(X_pca[:, 0], [0] * X_pca.shape[0], c=labels, cmap="viridis")
    plt.xlabel("PC1")
    plt.yticks([])  # the vertical position carries no information here
    plt.title("KMeans Clustering after PCA")
    plt.show()
else:
    print("PCA retained no components; nothing to plot.")

Number of components selected by PCA: 1
Explained variance ratio: [0.09102601]
Cumulative explained variance: [0.09102601]
Silhouette Score: 0.5310479464440097
