# Supervised Learning

This notebook trains and evaluates supervised learning models for AMR prediction.

## Objectives
- Split data into train/validation/test sets
- Train multiple classifiers (Random Forest, XGBoost, Logistic Regression, SVM, K-Nearest Neighbors, Naive Bayes)
- Perform hyperparameter tuning
- Evaluate model performance
- Analyze feature importance
- Save best models

## 1. Setup and Imports

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Custom modules
import sys
sys.path.append('..')
from src.data.splitting import split_data, create_stratified_folds
from src.models.supervised import (
    train_random_forest,
    train_xgboost,
    train_logistic_regression,
    train_svm,
    train_knn,
    train_naive_bayes,
    perform_hyperparameter_tuning
)
from src.models.evaluation import (
    calculate_classification_metrics,
    create_confusion_matrix,
    plot_confusion_matrix,
    calculate_roc_auc,
    plot_roc_curve,
    perform_cross_validation
)
from src.visualization.plots import plot_feature_importance

%matplotlib inline

## 2. Load and Prepare Data

In [None]:
# Load the processed dataset from the project's data directory.
df = pd.read_csv('../data/processed/cleaned_data.csv')

# TODO: Set TARGET_COLUMN to the name of the label column in the processed data.
TARGET_COLUMN = 'target_column'

# Fail early with an actionable message instead of a NameError/KeyError further
# down when the placeholder above has not been replaced yet.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in the loaded data; "
        f"available columns: {list(df.columns)}"
    )

# Features are every column except the target; the target is the label series.
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

## 3. Train/Validation/Test Split

In [None]:
# TODO: Switch to the project helper once it is wired in:
# X_train, X_val, X_test, y_train, y_val, y_test = split_data(
#     X, y, test_size=0.2, val_size=0.1, random_state=42
# )

# Until then, split in two stratified stages with train_test_split.
# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Stage 2: halve the hold-out into validation and test (about 15% of the data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the shape of each partition as a quick sanity check.
print(f"Train set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

## 4. Train Models

### 4.1 Random Forest

In [None]:
# TODO: Train Random Forest
# NOTE(review): rf_model is referenced again by the tuning, feature-importance
# and save cells further down, so this cell has to be enabled before those.
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# TODO: Evaluate on validation set
# NOTE(review): y_val_pred is also the input of the confusion-matrix cell in
# section 6; the return type of calculate_classification_metrics is defined in
# src.models.evaluation and is not visible here — confirm before relying on it.
# y_val_pred = rf_model.predict(X_val)
# rf_metrics = calculate_classification_metrics(y_val, y_val_pred)

### 4.2 XGBoost

In [None]:
# TODO: Train XGBoost
# xgb_model = XGBClassifier(n_estimators=100, random_state=42)
# xgb_model.fit(X_train, y_train)

### 4.3 Logistic Regression

In [None]:
# TODO: Train Logistic Regression
# lr_model = LogisticRegression(max_iter=1000, random_state=42)
# lr_model.fit(X_train, y_train)

### 4.4 Support Vector Machine

In [None]:
# TODO: Train SVM
# svm_model = SVC(kernel='rbf', probability=True, random_state=42)
# svm_model.fit(X_train, y_train)

### 4.5 K-Nearest Neighbors

In [None]:
# TODO: Train KNN
# knn_model = KNeighborsClassifier(n_neighbors=5)
# knn_model.fit(X_train, y_train)

### 4.6 Naive Bayes

In [None]:
# TODO: Train Naive Bayes
# nb_model = GaussianNB()
# nb_model.fit(X_train, y_train)

## 5. Hyperparameter Tuning

In [None]:
# TODO: Tune best performing model
# NOTE(review): the example reuses rf_model from section 4.1, so that cell must
# be enabled first. The grid below has 3 * 4 * 3 = 36 combinations, i.e. 180
# fits with cv=5 (plus a final refit, assuming GridSearchCV's default refit).
# Example for Random Forest:
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, 30, None],
#     'min_samples_split': [2, 5, 10]
# }
# grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train, y_train)
# print(f"Best parameters: {grid_search.best_params_}")

## 6. Model Evaluation

In [None]:
# TODO: Evaluate all models on validation set
# Create comparison table with metrics

In [None]:
# TODO: Plot confusion matrices
# plot_confusion_matrix(y_val, y_val_pred)

In [None]:
# TODO: Plot ROC curves
# NOTE(review): y_val_pred_proba is not defined in any cell above. It presumably
# holds positive-class scores such as rf_model.predict_proba(X_val)[:, 1] —
# confirm against the signature of plot_roc_curve in src.models.evaluation.
# plot_roc_curve(y_val, y_val_pred_proba)

## 7. Feature Importance Analysis

In [None]:
# TODO: Plot feature importance for tree-based models
# feature_importance = rf_model.feature_importances_
# plot_feature_importance(X.columns, feature_importance)

## 8. Save Best Models

In [None]:
# TODO: Save trained models
# NOTE(review): both models are dumped unconditionally rather than only the
# best-performing one, and the ../models/ directory presumably has to exist
# before joblib.dump is called — confirm before enabling this cell.
# joblib.dump(rf_model, '../models/random_forest_model.pkl')
# joblib.dump(xgb_model, '../models/xgboost_model.pkl')
# print("Models saved successfully!")