# Phase 1: Unsupervised Pattern Recognition

This notebook applies unsupervised learning (clustering, dimensionality reduction, and association rule mining) to identify antimicrobial resistance (AMR) patterns.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler

import sys
sys.path.append('..')

# Import unsupervised learning functions
from src.models.unsupervised import (
    perform_kmeans, perform_hierarchical, perform_dbscan,
    find_optimal_clusters, get_cluster_summary,
    perform_pca, perform_tsne, perform_umap, get_pca_loadings
)

# Import association rule mining functions
from src.models.association_rules import (
    prepare_binary_resistance, mine_frequent_itemsets,
    generate_association_rules, filter_top_rules, interpret_rules,
    get_resistance_frequency
)

# Import visualization functions
from src.visualization.plots import (
    plot_elbow_curve, plot_silhouette_scores, plot_dendrogram,
    plot_cluster_distribution, plot_2d_scatter, plot_pca_variance,
    plot_pca_loadings_heatmap, plot_clustering_comparison,
    plot_reduction_comparison
)

sns.set_style('whitegrid')
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler

import sys
sys.path.append('..')
from src.models.unsupervised import *
from src.models.association_rules import *
from src.visualization.plots import *

sns.set_style('whitegrid')
%matplotlib inline

## 2. Load Processed Data

In [None]:
# Read the processed dataset from disk and preview its size and first rows.
data_path = '../data/processed/cleaned_data.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Feature columns are selected purely by name: anything ending in '_encoded'.
resistance_cols = list(
    filter(lambda name: name.endswith('_encoded'), df.columns)
)
print(f"Resistance features: {len(resistance_cols)}")

In [None]:
# Build the feature matrix: missing values become -1, then every column is
# standardised with StandardScaler before clustering.
raw_features = df[resistance_cols]
X = raw_features.fillna(-1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Feature matrix shape: {X_scaled.shape}")

## 3. Clustering Analysis

### 3.1 Determine Optimal Number of Clusters

In [None]:
# Sweep candidate cluster counts (max_k=10) and list the silhouette score
# reported for each K.
cluster_search = find_optimal_clusters(X_scaled, max_k=10)
k_range, inertias, silhouette_scores = cluster_search
for k, s in zip(k_range, silhouette_scores):
    print(f"K={k}: Silhouette={s:.3f}")

In [None]:
# Elbow curve of inertia against K.
# NOTE(review): save_path presumably makes the helper write the PNG — confirm
# in src.visualization.plots.
fig = plot_elbow_curve(
    k_range,
    inertias,
    save_path='../reports/figures/elbow_plot.png',
)
plt.show()

In [None]:
# Silhouette score for each candidate K, from the sweep above.
fig = plot_silhouette_scores(
    k_range,
    silhouette_scores,
    save_path='../reports/figures/silhouette_plot.png',
)
plt.show()

In [None]:
# Choose the K with the highest silhouette score; np.argmax returns the
# first position of the maximum, so ties resolve to the smallest index.
best_position = np.argmax(silhouette_scores)
optimal_k = k_range[best_position]
print(f"Optimal K: {optimal_k}")

### 3.2 K-Means Clustering

In [None]:
# Run K-Means with the selected K, then report the number of samples assigned
# to each cluster label.
kmeans_result = perform_kmeans(X_scaled, n_clusters=optimal_k)
kmeans_labels, kmeans_model = kmeans_result
print(f"K-Means with K={optimal_k}")
print(f"Cluster sizes: {np.bincount(kmeans_labels)}")

In [None]:
# Plot the K-Means label distribution; no save_path is passed for this figure.
fig = plot_cluster_distribution(kmeans_labels)
plt.show()

In [None]:
# Summarise the resistance columns per K-Means cluster. The result is kept in
# kmeans_summary because it is exported to Excel later in the notebook.
kmeans_summary = get_cluster_summary(df, kmeans_labels, resistance_cols)
# Bare expression so the notebook renders the summary as the cell output.
kmeans_summary

### 3.3 Hierarchical Clustering

In [None]:
# Dendrogram of the scaled features using Ward linkage.
# NOTE(review): p=5 looks like a truncation depth for the tree — confirm
# against plot_dendrogram.
fig = plot_dendrogram(
    X_scaled,
    method='ward',
    p=5,
    save_path='../reports/figures/dendrogram.png',
)
plt.show()

In [None]:
# Hierarchical clustering with the same K as K-Means so the two label sets
# can be compared, followed by its per-cluster summary.
hierarchical_result = perform_hierarchical(X_scaled, n_clusters=optimal_k)
hierarchical_labels, hierarchical_model = hierarchical_result
hierarchical_summary = get_cluster_summary(
    df, hierarchical_labels, resistance_cols
)
hierarchical_summary

### 3.4 DBSCAN (Outlier Detection)

In [None]:
# DBSCAN run; the label -1 is treated as "outlier" here, so it is excluded
# from the cluster count and tallied separately.
dbscan_labels, dbscan_model = perform_dbscan(X_scaled, eps=3.5, min_samples=5)
n_clusters = len(set(dbscan_labels) - {-1})
n_outliers = sum(1 for label in dbscan_labels if label == -1)
print(f"Clusters: {n_clusters}, Outliers: {n_outliers}")

In [None]:
# Summarise the resistance columns per DBSCAN label.
# NOTE(review): dbscan_labels may contain -1 (outliers); check how
# get_cluster_summary treats that label.
dbscan_summary = get_cluster_summary(df, dbscan_labels, resistance_cols)
# Bare expression so the notebook renders the summary as the cell output.
dbscan_summary

### 3.5 Save Cluster Results

In [None]:
# Persist one row per isolate with its identifying columns and the label
# assigned by each of the three clustering methods.
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the results folder exists before writing into it.
results_dir = Path('../reports/results')
results_dir.mkdir(parents=True, exist_ok=True)

cluster_results = df[['isolate_code', 'bacterial_species', 'MAR_index']].copy()
cluster_results['kmeans_cluster'] = kmeans_labels
cluster_results['hierarchical_cluster'] = hierarchical_labels
cluster_results['dbscan_cluster'] = dbscan_labels
cluster_results.to_csv(results_dir / 'cluster_labels.csv', index=False)
print('Saved cluster labels')

In [None]:
# Write the three cluster summaries into one workbook, one sheet per method.
summary_sheets = {
    'KMeans': kmeans_summary,
    'Hierarchical': hierarchical_summary,
    'DBSCAN': dbscan_summary,
}
with pd.ExcelWriter('../reports/results/cluster_summary.xlsx') as writer:
    for sheet_name, summary in summary_sheets.items():
        summary.to_excel(writer, sheet_name=sheet_name, index=False)
print('Saved cluster summaries')

## 4. Dimensionality Reduction

### 4.1 PCA Analysis

In [None]:
# Fit PCA with as many components as the data allows (the smaller of the
# sample and feature counts) to inspect the full variance profile.
max_components = min(X_scaled.shape)
X_pca_full, pca_full = perform_pca(X_scaled, n_components=max_components)
fig = plot_pca_variance(
    pca_full,
    save_path='../reports/figures/pca_variance_explained.png',
)
plt.show()

In [None]:
# Number of leading components needed to reach 80% and 90% cumulative
# explained variance. np.argmax on a boolean array gives the first True
# position; +1 converts that zero-based index into a component count.
cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_comp_80, n_comp_90 = (
    np.argmax(cumsum_var >= threshold) + 1 for threshold in (0.8, 0.9)
)
print(f"80% variance: {n_comp_80} components")
print(f"90% variance: {n_comp_90} components")

In [None]:
# Two-component PCA used for the 2-D scatter plots; report how much of the
# total variance those two components retain.
pca_2d_result = perform_pca(X_scaled, n_components=2)
X_pca, pca_model = pca_2d_result
print(f"PCA variance explained: {pca_model.explained_variance_ratio_.sum():.3f}")

In [None]:
# PCA projection with points labelled by bacterial species.
species_labels = df['bacterial_species'].values
fig = plot_2d_scatter(
    X_pca,
    labels=species_labels,
    title='PCA - By Species',
    save_path='../reports/figures/pca_by_species.png',
)
plt.show()

In [None]:
# PCA projection labelled by MAR index with the 'RdYlGn_r' palette; this
# figure is only displayed (no save_path is passed).
mar_values = df['MAR_index'].values
fig = plot_2d_scatter(
    X_pca,
    labels=mar_values,
    title='PCA - By MAR Index',
    palette='RdYlGn_r',
)
plt.show()

In [None]:
# PCA projection labelled by K-Means cluster assignment.
fig = plot_2d_scatter(
    X_pca,
    labels=kmeans_labels,
    title='PCA - By Clusters',
    save_path='../reports/figures/kmeans_clusters_pca.png',
)
plt.show()

In [None]:
# Strip the '_encoded' marker so the heatmap shows readable feature names,
# then plot loadings from the full PCA fit for the first 5 components.
feature_names = [name.replace('_encoded', '') for name in resistance_cols]
loadings = get_pca_loadings(pca_full, feature_names)
fig = plot_pca_loadings_heatmap(
    loadings,
    feature_names,
    n_components=5,
    save_path='../reports/figures/pca_loadings_heatmap.png',
)
plt.show()

In [None]:
# Rank features by the magnitude of their PC1 loading (sign ignored) and show
# the five largest.
print("Top 5 PC1 loadings:")
pc1_magnitudes = loadings['PC1'].abs()
print(pc1_magnitudes.sort_values(ascending=False).head())

### 4.2 t-SNE Visualization

In [None]:
# 2-D t-SNE embedding of the scaled features; the progress messages bracket
# the call so the start and end of the computation are visible.
tsne_params = dict(n_components=2, perplexity=30)
print('Computing t-SNE...')
X_tsne = perform_tsne(X_scaled, **tsne_params)
print('Complete!')

In [None]:
# t-SNE embedding with points labelled by bacterial species.
species_labels = df['bacterial_species'].values
fig = plot_2d_scatter(
    X_tsne,
    labels=species_labels,
    title='t-SNE - By Species',
    save_path='../reports/figures/tsne_by_species.png',
)
plt.show()

In [None]:
# t-SNE embedding labelled by K-Means cluster; displayed only, not saved.
fig = plot_2d_scatter(
    X_tsne,
    labels=kmeans_labels,
    title='t-SNE - By Clusters',
)
plt.show()

### 4.3 UMAP Visualization

In [None]:
# 2-D UMAP embedding of the scaled features, bracketed by progress messages.
umap_params = dict(n_components=2, n_neighbors=15, min_dist=0.1)
print('Computing UMAP...')
X_umap = perform_umap(X_scaled, **umap_params)
print('Complete!')

In [None]:
# UMAP embedding with points labelled by bacterial species.
species_labels = df['bacterial_species'].values
fig = plot_2d_scatter(
    X_umap,
    labels=species_labels,
    title='UMAP - By Species',
    save_path='../reports/figures/umap_by_species.png',
)
plt.show()

In [None]:
# UMAP embedding labelled by K-Means cluster; displayed only, not saved.
fig = plot_2d_scatter(
    X_umap,
    labels=kmeans_labels,
    title='UMAP - By Clusters',
)
plt.show()

### 4.4 Comparison Plots

In [None]:
# Compare the PCA, t-SNE and UMAP embeddings in one figure, all labelled by
# bacterial species.
species_labels = df['bacterial_species'].values
fig = plot_reduction_comparison(
    X_pca,
    X_tsne,
    X_umap,
    labels=species_labels,
    save_path='../reports/figures/reduction_comparison.png',
)
plt.show()

In [None]:
# Map each clustering method's display name to its labels (insertion order
# preserved) and compare them on the shared PCA projection.
labels_dict = {}
labels_dict['K-Means'] = kmeans_labels
labels_dict['Hierarchical'] = hierarchical_labels
labels_dict['DBSCAN'] = dbscan_labels
fig = plot_clustering_comparison(
    X_pca,
    labels_dict,
    save_path='../reports/figures/clustering_comparison.png',
)
plt.show()

### 4.5 Save Embeddings

In [None]:
# Export each 2-D embedding as a two-column CSV without the row index, then
# the PCA loadings table with its index kept.
embedding_exports = (
    (X_pca, ['PC1', 'PC2'], '../reports/results/pca_embeddings.csv'),
    (X_tsne, ['tSNE1', 'tSNE2'], '../reports/results/tsne_embeddings.csv'),
    (X_umap, ['UMAP1', 'UMAP2'], '../reports/results/umap_embeddings.csv'),
)
for coords, axis_names, out_path in embedding_exports:
    pd.DataFrame(coords, columns=axis_names).to_csv(out_path, index=False)
loadings.to_csv('../reports/results/pca_loadings.csv')
print('All embeddings saved')

## 5. Association Rule Mining

### 5.1 Prepare Binary Resistance Data

In [None]:
# Convert the resistance columns into the binary matrix used for itemset
# mining. NOTE(review): the binarisation rule lives in
# src.models.association_rules — confirm which encoded values map to 1.
df_binary = prepare_binary_resistance(df, resistance_cols)
print(f"Binary matrix shape: {df_binary.shape}")
# Bare expression so the notebook renders the first rows as the cell output.
df_binary.head()

In [None]:
# Frequency table derived from the binary matrix; show its first 10 rows.
# NOTE(review): presumably one row per antibiotic — confirm against
# get_resistance_frequency.
freq_df = get_resistance_frequency(df_binary)
freq_df.head(10)

### 5.2 Mine Frequent Itemsets

In [None]:
# Mine itemsets with min_support=0.02, then display the ten itemsets with
# the highest support.
print('Mining frequent itemsets...')
frequent_itemsets = mine_frequent_itemsets(df_binary, min_support=0.02)
print(f"Found {len(frequent_itemsets)} frequent itemsets")
itemsets_by_support = frequent_itemsets.sort_values('support', ascending=False)
itemsets_by_support.head(10)

### 5.3 Generate Association Rules

In [None]:
# Derive association rules from the frequent itemsets using the confidence
# and lift thresholds below.
rule_thresholds = dict(min_confidence=0.6, min_lift=1.0)
print('Generating association rules...')
rules = generate_association_rules(frequent_itemsets, **rule_thresholds)
print(f"Found {len(rules)} rules")

In [None]:
# Keep the top 20 rules ranked by lift and preview the first ten with their
# key metrics.
top_rules = filter_top_rules(rules, n=20, sort_by='lift')
preview_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_rules[preview_columns].head(10)

### 5.4 Interpret Co-Resistance Patterns

In [None]:
# Attach a human-readable interpretation to each top rule and print the first
# five as a numbered list.
interpreted_rules = interpret_rules(top_rules)
# Number by display position rather than by DataFrame index label: iterrows()
# yields index labels, which need not be 0..4 if the rules kept their original
# index after being sorted/filtered by lift.
for rank, (_, row) in enumerate(interpreted_rules.head(5).iterrows(), start=1):
    print(f"{rank}. {row['interpretation']}")

In [None]:
# Export the interpreted rules (string forms of both sides, the metrics, and
# the interpretation text) to CSV without the row index.
export_columns = [
    'antecedents_str',
    'consequents_str',
    'support',
    'confidence',
    'lift',
    'interpretation',
]
rules_export = interpreted_rules[export_columns]
rules_export.to_csv('../reports/results/association_rules.csv', index=False)
print('Association rules saved')

## 6. Key Findings Summary

In [None]:
# Recap of the headline numbers computed in the earlier cells, assembled as a
# list of lines and emitted in a single print so the output is unchanged.
banner = '='*80
report_lines = [
    banner,
    'KEY FINDINGS',
    banner,
    f"\n1. CLUSTERING",
    f"   Optimal K: {optimal_k}",
    f"   K-Means clusters: {len(np.unique(kmeans_labels))}",
    f"   DBSCAN outliers: {n_outliers}",
    f"\n2. DIMENSIONALITY REDUCTION",
    f"   PCA 80% variance: {n_comp_80} components",
    f"   PCA 90% variance: {n_comp_90} components",
    f"\n3. ASSOCIATION RULES",
    f"   Frequent itemsets: {len(frequent_itemsets)}",
    f"   Association rules: {len(rules)}",
    '\n' + banner,
]
print('\n'.join(report_lines))

## 7. Summary

Phase 1 is complete. All results and figures are saved to the `reports/` directory.