# In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import joblib

# --- Data loading -----------------------------------------------------------
# The feature matrix and label vector are read from previously saved .npy files.
X_features = np.load("data/features/X_features.npy")
y_labels = np.load("data/features/y_labels.npy")

# Sanity check: array shapes and the distinct label values present.
print("X_features shape:", X_features.shape)
print("y shape:", y_labels.shape)
print("Unique labels:", np.unique(y_labels))

# --- Train / validation / test split ----------------------------------------
# Stage 1: hold out 30% of the samples, stratified on the labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_features,
    y_labels,
    test_size=0.30,
    random_state=42,
    stratify=y_labels,
)

# Stage 2: cut the held-out part in half, giving validation and test splits
# of roughly 15% of the full data each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

# Report how many samples ended up in each split.
for split_label, split_X in (
    ("Train", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_label} size:", split_X.shape[0])

# Candidate classifiers keyed by a short model name. The insertion order is
# the order in which the pipelines dict is iterated later.
_classifiers = {
    "knn": KNeighborsClassifier(),
    "svm_rbf": SVC(kernel="rbf"),
    # NOTE(review): n_jobs=-1 here is nested inside GridSearchCV(n_jobs=-1)
    # further down -- confirm this does not oversubscribe the CPU.
    "rf": RandomForestClassifier(random_state=42, n_jobs=-1),
}

# Each candidate is wrapped as "scaler" -> "clf". Scaling is not mandatory for
# the random forest; it is kept so that all pipelines share the same layout.
pipelines = {
    model_name: Pipeline([("scaler", StandardScaler()), ("clf", classifier)])
    for model_name, classifier in _classifiers.items()
}

# Hyperparameter grids, one per pipeline. The "clf__" prefix addresses the
# classifier step of the pipeline.
param_grids = {
    "knn": dict(
        clf__n_neighbors=[3, 5, 7, 9],
        clf__weights=["uniform", "distance"],
    ),
    "svm_rbf": dict(
        clf__C=[0.1, 1, 10],
        clf__gamma=[0.001, 0.01, 0.1, 1],
    ),
    "rf": dict(
        clf__n_estimators=[100, 200],
        clf__max_depth=[None, 10, 20],
    ),
}

# One shuffled, seeded 5-fold stratified splitter shared by every search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# results:     model name -> fitted GridSearchCV object
# best_models: model name -> that search's best_estimator_
results = {}
best_models = {}

# Options that are identical for every grid search.
_search_options = dict(cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)

for name, pipeline in pipelines.items():
    print(f"\n=== Training {name} ===")

    # Exhaustive search over this model's grid, fitted on the training split only.
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        **_search_options,
    )
    gs.fit(X_train, y_train)

    print("Best CV accuracy:", gs.best_score_)
    print("Best params:", gs.best_params_)

    results[name], best_models[name] = gs, gs.best_estimator_

# Score every tuned model on the validation split and print a per-class report.
for name in best_models:
    model = best_models[name]

    y_val_pred = model.predict(X_val)
    acc_val = accuracy_score(y_val, y_val_pred)

    print(f"\n--- {name.upper()} on Validation ---")
    print("Accuracy:", acc_val)
    print(classification_report(y_val, y_val_pred, digits=3))

# Select the winning model by its grid search's best cross-validated accuracy
# (GridSearchCV.best_score_), not by the validation numbers printed earlier.
best_name = max(results, key=lambda m: results[m].best_score_)
best_model = best_models[best_name]

print(f"Best model: {best_name} with CV accuracy = {results[best_name].best_score_:.3f}")

# Final check on the held-out test split. The visible script uses X_test/y_test
# for neither fitting nor model selection, so this is the only estimate that
# the selection step has not influenced.
y_test_pred = best_model.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
print(f"\n--- {best_name.upper()} on Test ---")
print("Accuracy:", acc_test)
print(classification_report(y_test, y_test_pred, digits=3))

# Save best model. The path is built once so the file written and the path
# reported cannot drift apart.
os.makedirs("models", exist_ok=True)
model_path = f"models/best_model_{best_name}.joblib"
joblib.dump(best_model, model_path)
print("Saved:", model_path)