# Unsupervised clustering on Wine Quality (Red)

This notebook documents EDA, preprocessing, KMeans baseline training (silhouette-based k selection), and saving artifacts.


In [None]:
%%bash
# Install the third-party packages used by the notebook; --quiet trims pip's output.
# NOTE(review): `python` here is whichever interpreter the bash subprocess finds on
# PATH, which may not be the one backing this kernel -- confirm they match (TODO).
# No versions are pinned, so results can differ between environments.
python -m pip install scikit-learn matplotlib seaborn joblib --quiet

# Verify installations
# The quoted heredoc ('PY') is passed to python on stdin without shell expansion;
# the script imports each package and prints its version.
python - <<'PY'
import sklearn, matplotlib, seaborn, joblib
print('sklearn', sklearn.__version__)
print('matplotlib', matplotlib.__version__)
print('seaborn', seaborn.__version__)
print('joblib', joblib.__version__)
PY

In [None]:
# Imports and notebook config
%matplotlib inline
import os
import logging
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import joblib

# Logging: show INFO-level messages and above.
logging.basicConfig(level=logging.INFO)

# Tunable constants and locations used by the rest of the notebook.
RANDOM_SEED = 42
ROOT = os.getcwd()
DATA_PATH = os.path.join('Mock_student_packet_v4', 'winequality-red.csv')
MODELS_DIR, RESULTS_DIR = 'models', 'results'

# Create the output folders up front; exist_ok makes re-running this cell safe.
for out_dir in (MODELS_DIR, RESULTS_DIR):
    os.makedirs(out_dir, exist_ok=True)

print(f'ROOT: {ROOT}')
print(f'Data path: {DATA_PATH}')


In [None]:
# Helper functions
from typing import Tuple

def load_wine(path: str, sep: str = ';') -> pd.DataFrame:
    """Read the wine-quality CSV at ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    sep : str, default ';'
        Field delimiter passed to ``pandas.read_csv``. The default is the
        semicolon this notebook has always used; pass ``','`` for a
        comma-separated copy of the file.

    Returns
    -------
    pd.DataFrame
        The parsed table, which contains a ``quality`` column.

    Raises
    ------
    AssertionError
        If the parsed table has no ``quality`` column. Reading a file with
        the wrong ``sep`` fails here too, because the header is then parsed
        as a single column.
    """
    df = pd.read_csv(path, sep=sep)
    assert 'quality' in df.columns, 'Expected quality column'
    return df


def preprocess(df: pd.DataFrame) -> Tuple[np.ndarray, list, "StandardScaler"]:
    """Standardise every column of ``df`` except ``quality``.

    Parameters
    ----------
    df : pd.DataFrame
        Input table; must contain a ``quality`` column, which is dropped
        before scaling.

    Returns
    -------
    X_scaled : np.ndarray
        Output of ``StandardScaler.fit_transform`` on the remaining columns.
    feature_names : list
        Names of the scaled columns, in column order.
    scaler : StandardScaler
        The fitted scaler, returned so the same transform can be reused.

    Raises
    ------
    KeyError
        If ``df`` has no ``quality`` column (raised by ``DataFrame.drop``).
    """
    # Drop the label once and derive both the matrix and the names from it.
    features = df.drop(columns=['quality'])
    feature_names = features.columns.tolist()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features.values)
    return X_scaled, feature_names, scaler


# Smoke test: run preprocess on a two-row frame and check what comes back.
# Named variables are used instead of `_`, which IPython reserves for the
# last cell output.
_example = pd.DataFrame({'a':[1,2], 'quality':[0,1]})
try:
    _X_smoke, _names_smoke, _scaler_smoke = preprocess(_example)
    assert _names_smoke == ['a'], _names_smoke
    assert _X_smoke.shape == (2, 1), _X_smoke.shape
    # Standardised columns are centred on zero.
    assert np.allclose(_X_smoke.mean(axis=0), 0.0)
    assert hasattr(_scaler_smoke, 'transform')
    print('Preprocess smoke test: OK')
except Exception as e:
    print('Preprocess smoke test: FAILED', e)
    # Re-raise so a broken preprocess stops the run here instead of failing
    # later in the training cell.
    raise


In [None]:
# Load the dataset, print a few sanity summaries, and save a correlation heatmap.

df = load_wine(DATA_PATH)

print('Shape:', df.shape)
missing_per_column = df.isnull().sum()
print('\nMissing values per column:\n', missing_per_column)
quality_counts = df['quality'].value_counts().sort_index()
print('\nQuality distribution:\n', quality_counts)

# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr = df.corr()
corr_png = os.path.join(RESULTS_DIR, 'correlation.png')
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Feature correlation')
fig.tight_layout()
fig.savefig(corr_png)
print('Saved correlation heatmap to', corr_png)


In [None]:
# Preprocessing, k selection, training, and saving artifacts

# This cell writes df['cluster'] further down. Drop that column (if present)
# before preprocessing so that re-running the cell never feeds the previous
# run's cluster labels back in as a feature.
X_scaled, feature_names, scaler = preprocess(df.drop(columns=['cluster'], errors='ignore'))

# PCA for visualization only; clustering below runs on the full scaled matrix.
pca = PCA(n_components=2, random_state=RANDOM_SEED)
X_pca = pca.fit_transform(X_scaled)

# silhouette-based k selection: fit KMeans for k = 2..8 and keep each score
scores = {}
for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=10)
    labels = km.fit_predict(X_scaled)
    scores[k] = silhouette_score(X_scaled, labels)

# The k with the highest silhouette score wins (first one on ties).
best_k = max(scores, key=scores.get)
print('Silhouette scores:', scores)
print('Best k:', best_k)

# fit final model with the selected k; fit_predict does the fit and the
# labelling of the training data in one call
kmeans = KMeans(n_clusters=best_k, random_state=RANDOM_SEED, n_init=10)
labels = kmeans.fit_predict(X_scaled)
df['cluster'] = labels  # later cells read df['cluster']

# save artifacts: scaler + model + feature order together, PCA separately
joblib.dump({'scaler':scaler, 'kmeans':kmeans, 'feature_names':feature_names}, os.path.join(MODELS_DIR, 'kmeans_pipeline.pkl'))
joblib.dump(pca, os.path.join(MODELS_DIR, 'pca.pkl'))

# plot clusters in PCA space
plt.figure(figsize=(8,6))
for cluster in np.unique(labels):
    # Plain NumPy boolean mask: positional, independent of df's index.
    mask = labels == cluster
    plt.scatter(X_pca[mask,0], X_pca[mask,1], label=f'Cluster {cluster}', alpha=0.6)
plt.legend(); plt.title(f'KMeans clusters (k={best_k}) in PCA space')
plt.xlabel('PCA1'); plt.ylabel('PCA2'); plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'cluster_pca.png'))
print('Saved cluster image to', os.path.join(RESULTS_DIR, 'cluster_pca.png'))


In [None]:
# Report how many rows landed in each cluster and the silhouette score of the
# final assignment, then list follow-up ideas.
cluster_sizes = df['cluster'].value_counts().sort_index()
print('\nCluster sizes:\n', cluster_sizes)
final_silhouette = silhouette_score(X_scaled, df['cluster'])
print('\nFinal silhouette score:', final_silhouette)

# Follow-up ideas, printed one per line.
print('\nSuggested next steps:')
for suggestion in (
    '- Try GaussianMixture or DBSCAN to find non-convex clusters',
    '- Experiment with feature selection or transformations for skewed features',
    '- Compare clusters to quality labels and compute stats per cluster',
):
    print(suggestion)


In [None]:
# Save summary and how to load model
print('Models saved to', MODELS_DIR)

print('\nExample: load model and predict on new data')
# The printed snippet is meant to be copy-pasted into a fresh session, so it
# has to define everything it uses: it reads the data itself and selects the
# columns in the order stored under 'feature_names'. Paths are interpolated
# with !r so they appear as valid Python string literals.
_pipeline_path = os.path.join(MODELS_DIR, 'kmeans_pipeline.pkl')
print(f'''
import joblib
import pandas as pd
m = joblib.load({_pipeline_path!r})
scaler = m['scaler']
kmeans = m['kmeans']
X = pd.read_csv({DATA_PATH!r}, sep=';')[m['feature_names']].values
X_new_scaled = scaler.transform(X[:5])
print(kmeans.predict(X_new_scaled))
''')
