# Unsupervised Learning

This notebook applies unsupervised learning techniques to identify antimicrobial resistance (AMR) patterns.

## Objectives
- Perform clustering (K-Means, Hierarchical, DBSCAN)
- Apply dimensionality reduction (PCA, t-SNE, UMAP)
- Discover association rules for antibiotic resistance
- Visualize patterns and clusters
- Interpret findings

## 1. Setup and Imports

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Custom modules
import sys
sys.path.append('..')
from src.models.unsupervised import (
    perform_kmeans_clustering,
    perform_hierarchical_clustering,
    perform_dbscan_clustering,
    perform_pca,
    perform_tsne,
    perform_umap,
    find_optimal_clusters,
    apply_association_rules
)
from src.visualization.plots import (
    plot_cluster_visualization,
    plot_dimensionality_reduction
)

%matplotlib inline

## 2. Load Processed Data

In [None]:
# Read the cleaned dataset from data/processed/ (relative path).
# NOTE(review): the commented templates in later cells reference a feature
# matrix `X`; nothing in the visible notebook defines it, so derive it from
# `df` before enabling them.
df = pd.read_csv(
    '../data/processed/cleaned_data.csv'
)

# Report the shape, then leave the first five rows as the cell's output.
print(f"Dataset shape: {df.shape}")
df.head(n=5)

## 3. Dimensionality Reduction

### 3.1 PCA (Principal Component Analysis)

In [None]:
# TODO: Apply PCA
# NOTE(review): `X` is not defined anywhere in the visible notebook — build the
# feature matrix from `df` first. PCA is scale-sensitive, so standardizing with
# the already-imported StandardScaler is presumably intended; confirm.
# pca = PCA(n_components=2)
# X_pca = pca.fit_transform(X)

# TODO: Visualize explained variance
# Plot cumulative explained variance
# NOTE: the 2-component fit above exposes only two variance ratios; fit a
# separate PCA with all components to draw the cumulative curve, e.g.
# pca_full = PCA().fit(X)
# plt.plot(np.cumsum(pca_full.explained_variance_ratio_), marker='o')

### 3.2 t-SNE

In [None]:
# TODO: Apply t-SNE
# tsne = TSNE(n_components=2, random_state=42)
# X_tsne = tsne.fit_transform(X)

### 3.3 UMAP

In [None]:
# TODO: Apply UMAP
# import umap
# reducer = umap.UMAP(n_components=2, random_state=42)
# X_umap = reducer.fit_transform(X)

## 4. Clustering

### 4.1 Find Optimal Number of Clusters

In [None]:
# TODO: Use elbow method to find optimal k
# NOTE: n_init is set explicitly because scikit-learn changed its default
# (10 -> 'auto' in 1.4); pinning it keeps the inertia curve comparable across
# installed versions.
# inertias = []
# for k in range(2, 11):
#     kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
#     kmeans.fit(X)
#     inertias.append(kmeans.inertia_)

# plt.plot(range(2, 11), inertias, marker='o')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Inertia')
# plt.title('Elbow Method')
# plt.show()

In [None]:
# TODO: Calculate silhouette scores
# n_init is pinned so the scores do not depend on the installed scikit-learn
# version (the default changed from 10 to 'auto' in 1.4).
# for k in range(2, 11):
#     kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
#     labels = kmeans.fit_predict(X)
#     score = silhouette_score(X, labels)
#     print(f"k={k}: Silhouette Score = {score:.3f}")

### 4.2 K-Means Clustering

In [None]:
# TODO: Apply K-Means with optimal k
# NOTE(review): n_clusters=3 is a placeholder — replace it with the k chosen
# from the elbow / silhouette analysis in section 4.1. n_init is pinned because
# its scikit-learn default changed from 10 to 'auto' in 1.4.
# kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# kmeans_labels = kmeans.fit_predict(X)

# TODO: Visualize clusters on PCA/t-SNE/UMAP

### 4.3 Hierarchical Clustering

In [None]:
# TODO: Apply Hierarchical Clustering
# hierarchical = AgglomerativeClustering(n_clusters=3, linkage='ward')
# hierarchical_labels = hierarchical.fit_predict(X)

# TODO: Create dendrogram

### 4.4 DBSCAN

In [None]:
# TODO: Apply DBSCAN
# dbscan = DBSCAN(eps=0.5, min_samples=5)
# dbscan_labels = dbscan.fit_predict(X)

## 5. Association Rule Mining

In [None]:
# TODO: Apply Apriori algorithm for association rules
# from mlxtend.frequent_patterns import apriori, association_rules

# Discover patterns like: if resistant to antibiotic A, likely resistant to B

## 6. Cluster Interpretation

In [None]:
# TODO: Analyze cluster characteristics
# Calculate mean feature values for each cluster
# Identify distinguishing features

## 7. Save Results

In [None]:
# TODO: Save cluster labels and reduced dimensions
# NOTE(review): the t-SNE / UMAP embeddings (X_tsne, X_umap) are not written
# here; add columns for them if they should be persisted as well.
# results_df = pd.DataFrame({
#     'kmeans_cluster': kmeans_labels,
#     'hierarchical_cluster': hierarchical_labels,
#     'dbscan_cluster': dbscan_labels,
#     'pca_1': X_pca[:, 0],
#     'pca_2': X_pca[:, 1]
# })
# results_df.to_csv('../data/processed/unsupervised_results.csv', index=False)