# In [None]:
# Fashion MNIST Full Unsupervised Pipeline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.datasets import fashion_mnist
import warnings
warnings.filterwarnings("ignore")

# Phase 1: Data Loading & Preprocessing
# Only the training split is kept; the second (test) tuple is discarded.
(x_train, y_train), (_, _) = fashion_mnist.load_data()

# Divide every pixel by 255.0, then collapse each image into one row vector
x = x_train / 255.0
n_images = x.shape[0]
x_flat = x.reshape(n_images, -1)

# Class id -> readable class name (used for plot titles)
class_names = [
    "T-shirt", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]
label_map = dict(enumerate(class_names))

# Phase 1: EDA
# Show the first ten training images on a 2x5 grid, each titled with its class name
fig, axes = plt.subplots(2, 5, figsize=(10, 6))
for ax, image, class_id in zip(axes.flat, x, y_train):
    ax.imshow(image, cmap="gray")
    ax.set_title(label_map[class_id])
    ax.axis("off")
fig.suptitle("Sample Images")
fig.tight_layout()
plt.show()

# Pixel distribution: one histogram over every pixel of every image
hist_ax = plt.gca()
hist_ax.hist(x_flat.ravel(), bins=50)
hist_ax.set_title("Pixel Value Distribution")
hist_ax.set_xlabel("Pixel Intensity")
hist_ax.set_ylabel("Frequency")
plt.show()

# Phase 2: PCA
# Standardize each pixel column (zero mean, unit variance) before PCA.
# `scaler` is kept so that data can later be mapped back to pixel space.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_flat)

# Fit PCA with every component retained so the full variance spectrum is available
pca = PCA()
x_pca = pca.fit_transform(x_scaled)
explained_var = np.cumsum(pca.explained_variance_ratio_)

# explained_var[k - 1] is the variance captured by the first k components, so the
# x axis must be 1-based; plotting against the raw index would shift the curve
# by one component relative to the axis label.
n_shown = min(100, explained_var.size)
component_counts = np.arange(1, n_shown + 1)
plt.plot(component_counts, explained_var[:n_shown])
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Explained Variance")
plt.grid(True)
plt.show()

# Reconstruction
def reconstruct_image(n_components, data=None, random_state=42):
    """Project data onto ``n_components`` principal components and back.

    Args:
        n_components: Number of principal components to keep.
        data: 2-D array (samples x features) to compress. Defaults to the
            module-level ``x_scaled`` matrix, which keeps existing
            ``reconstruct_image(n)`` calls working unchanged.
        random_state: Seed forwarded to ``PCA``. It only has an effect when
            scikit-learn selects a randomized solver; fixing it makes repeated
            runs produce the same reconstruction, in line with the seeded
            KMeans runs elsewhere in this script.

    Returns:
        A ``(x_reconstructed, mse)`` tuple: the reconstruction in the same
        feature space as ``data`` (standardized space for the default input),
        and the mean squared error between ``data`` and that reconstruction.
    """
    if data is None:
        data = x_scaled

    # A dedicated model is fitted per call; the local name avoids shadowing
    # the module-level full-rank `pca`.
    model = PCA(n_components=n_components, random_state=random_state)
    x_reduced = model.fit_transform(data)
    x_reconstructed = model.inverse_transform(x_reduced)
    mse = mean_squared_error(data, x_reconstructed)
    return x_reconstructed, mse

# For each component budget: report the reconstruction error and compare the
# first five originals (top row) with their reconstructions (bottom row).
for n in [10, 50, 100]:
    x_rec, mse = reconstruct_image(n)
    print(f"Reconstruction MSE with {n} components: {mse:.4f}")

    # reconstruct_image returns data in the standardized space produced by
    # `scaler`; undo that scaling so the bottom row is in the same pixel units
    # as the originals. Only the five displayed rows are converted.
    x_rec_pixels = scaler.inverse_transform(x_rec[:5])

    plt.figure(figsize=(6, 3))
    for i in range(5):
        # Both rows share a fixed intensity range so their grey levels are
        # directly comparable (assumes x lies in [0, 1] after the /255 scaling).
        plt.subplot(2, 5, i + 1)
        plt.imshow(x[i], cmap="gray", vmin=0, vmax=1)
        plt.axis("off")
        plt.title("Original")

        plt.subplot(2, 5, i + 6)
        plt.imshow(x_rec_pixels[i].reshape(28, 28), cmap="gray", vmin=0, vmax=1)
        plt.axis("off")
        plt.title(f"{n} comps")
    plt.suptitle(f"Reconstruction with {n} PCA Components")
    plt.tight_layout()
    plt.show()

# SVD
# Truncated SVD of the standardized data, keeping 100 components.
# The seed is fixed so the result is reproducible between runs, matching the
# seeded KMeans runs in this script.
svd = TruncatedSVD(n_components=100, random_state=42)
x_svd = svd.fit_transform(x_scaled)
print(f"SVD Explained Variance (100 comps): {np.sum(svd.explained_variance_ratio_):.4f}")

# Phase 3: Clustering
# KMeans on the first 50 principal components, tried for several cluster counts
x_kmeans = x_pca[:, :50]
for k in (10, 15, 20):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_kmeans)
    preds = kmeans.labels_
    score = silhouette_score(x_kmeans, preds)
    print(f"KMeans (k={k}) Silhouette Score: {score:.4f}, Inertia: {kmeans.inertia_:.2f}")

# DBSCAN
# Density-based clustering on the first 20 principal components for several radii
x_dbscan = x_pca[:, :20]
for eps in [2, 5, 7]:
    db = DBSCAN(eps=eps, min_samples=5)
    preds = db.fit_predict(x_dbscan)

    # DBSCAN labels noise points -1. Noise is not a cluster, so it is left out
    # of both the cluster count and the silhouette computation; scoring it as
    # a cluster would distort the result.
    clustered = preds != -1
    n_clustered = int(clustered.sum())
    labels_unique = np.unique(preds[clustered]).size

    # silhouette_score needs at least 2 clusters and fewer clusters than points
    if 1 < labels_unique < n_clustered:
        score = silhouette_score(x_dbscan[clustered], preds[clustered])
        print(f"DBSCAN (eps={eps}) Silhouette Score: {score:.4f}, Clusters: {labels_unique}")
    else:
        print(f"DBSCAN (eps={eps}) failed to form clusters")

# Hierarchical
# Without a connectivity constraint, agglomerative clustering needs memory that
# grows quadratically with the number of rows, which is impractical for the
# full dataset. The model is therefore fitted on a fixed-seed random subsample.
AGG_SAMPLE_SIZE = 10_000
x_agg_all = x_pca[:, :50]
agg_rng = np.random.default_rng(42)
agg_idx = agg_rng.choice(
    x_agg_all.shape[0],
    size=min(AGG_SAMPLE_SIZE, x_agg_all.shape[0]),
    replace=False,
)
x_agg = x_agg_all[agg_idx]

agg = AgglomerativeClustering(n_clusters=10)
agg_labels = agg.fit_predict(x_agg)

# The score describes the subsample the model was actually fitted on
score = silhouette_score(x_agg, agg_labels)
print(f"Agglomerative Clustering Score: {score:.4f}")

# Keep `preds` at one label per row of x_pca: every row gets the label of the
# nearest subsample-cluster centroid. Because ||x||^2 is constant per row,
# argmin ||x - c||^2 equals argmin (||c||^2 - 2 x.c), which avoids building a
# rows x clusters x features array.
agg_cluster_ids = np.unique(agg_labels)
agg_centroids = np.vstack(
    [x_agg[agg_labels == cluster_id].mean(axis=0) for cluster_id in agg_cluster_ids]
)
agg_dist_proxy = (agg_centroids ** 2).sum(axis=1) - 2.0 * (x_agg_all @ agg_centroids.T)
preds = agg_cluster_ids[np.argmin(agg_dist_proxy, axis=1)]

# Phase 4: Visualization
# NOTE(review): Axes3D is not referenced below; presumably the import is kept
# so that older Matplotlib releases register the '3d' projection - confirm
# before removing.
from mpl_toolkits.mplot3d import Axes3D

# 3-component projection used only for plotting; seeded so the figure is
# reproducible if scikit-learn selects a randomized solver.
pca_vis = PCA(n_components=3, random_state=42)
x_vis = pca_vis.fit_transform(x_scaled)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(x_vis[:, 0], x_vis[:, 1], x_vis[:, 2], c=y_train, cmap='tab10', s=10)

# num=None yields one legend handle per distinct value of `c`, in ascending
# order, so the handles line up with the sorted class ids. Class names from
# label_map replace the default numeric labels.
legend_handles, _default_labels = scatter.legend_elements(num=None)
legend_names = [label_map[int(class_id)] for class_id in np.unique(y_train)]
ax.legend(legend_handles, legend_names, loc="best", title="Classes")
ax.set_title("3D PCA Visualization (True Labels)")
plt.show()

# Phase 5: Reporting Notes (for slide/report)
# - Natural number of clusters is approx. 10 (aligns with true labels)
# - Common confusion: Shirt vs T-shirt, Sneaker vs Ankle boot
# - System can auto-group new images if projected to PCA space and assigned via trained cluster model

# Phase 6: Optional Deployment with Streamlit (not included here)

# END OF PIPELINE
