In [None]:
import pandas as pd
import numpy as np
import joblib

from pathlib import Path
from typing import Dict, Any, Tuple, Optional

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    recall_score,
    f1_score,
    roc_auc_score,
    precision_score,
    precision_recall_curve,
    auc,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# ===================== BASIC CONFIG (EDIT THIS ONLY) =====================
# Filename prefix shared by the six input CSVs that main() loads:
#   <prefix>-Xtrain.csv, <prefix>-Xtrain_.csv, <prefix>-Xtest.csv,
#   <prefix>-Ytrain.csv, <prefix>-Ytrain_.csv, <prefix>-Ytest.csv
# The files are opened by relative path, so they are looked up in the
# current working directory.
DATA_PREFIX = "event_7"  # <<< change to your file prefix (no .csv)
# ========================================================================


In [None]:
# -------------------- Utilities -------------------- #
def safe_index_col(path: str) -> pd.DataFrame:
    """
    Load a CSV file into a DataFrame, using its first column as the row index.

    Treating the leading column as the index keeps a previously saved index
    (e.g. one that would otherwise show up as an 'Unnamed: 0' column) out of
    the data columns.

    Parameters
    ----------
    path : location of the CSV file; handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    DataFrame indexed by the file's first column.
    """
    first_column = 0
    frame = pd.read_csv(path, index_col=first_column)
    return frame


def get_scores(model, X: pd.DataFrame) -> np.ndarray:
    """
    Return one continuous score per sample for threshold-free metrics (ROC / PR AUC).

    Score sources are tried in this order:
    1. ``predict_proba(X)[:, 1]`` when the model has it and it returns two columns.
    2. ``decision_function(X)`` (second column if it returns shape (n_samples, 2)).
    3. ``predict(X)`` hard labels - not ideal for AUC, but prevents crashes.

    A source that raises is skipped, and a ``RuntimeWarning`` reports the
    exception so the downgrade is visible instead of silent. Reaching the
    label fallback also emits a ``RuntimeWarning``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and optionally
        ``predict_proba`` / ``decision_function``.
    X : samples to score.

    Returns
    -------
    Whatever the selected source returns for ``X`` (one value per sample).
    """
    import warnings  # local import so the notebook's import cell stays untouched

    if hasattr(model, "predict_proba"):
        try:
            proba = model.predict_proba(X)
            if proba.shape[1] == 2:
                return proba[:, 1]
            # Any other column count is not a binary probability table: fall through.
        except Exception as exc:
            warnings.warn(
                f"get_scores: predict_proba failed ({exc!r}); trying the next score source.",
                RuntimeWarning,
                stacklevel=2,
            )
    if hasattr(model, "decision_function"):
        try:
            scores = model.decision_function(X)
            # If decision_function returns shape (n_samples, 2), take positive class
            if scores.ndim == 2 and scores.shape[1] == 2:
                return scores[:, 1]
            return scores
        except Exception as exc:
            warnings.warn(
                f"get_scores: decision_function failed ({exc!r}); trying the next score source.",
                RuntimeWarning,
                stacklevel=2,
            )
    # Fallback: predicted labels (AUC will be less informative)
    warnings.warn(
        "get_scores: no usable continuous score; falling back to predict() labels.",
        RuntimeWarning,
        stacklevel=2,
    )
    return model.predict(X)


def evaluate_split(y_true: pd.Series, y_pred: np.ndarray, y_scores: Optional[np.ndarray]) -> Dict[str, Any]:
    """
    Compute confusion-matrix counts, threshold metrics and (when possible) AUC metrics.

    Label 1 is treated as the positive class, matching the default
    ``pos_label`` of the scikit-learn precision / recall / F1 functions used here.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : hard label predictions aligned with ``y_true``.
    y_scores : continuous positive-class scores, or None to skip the AUC metrics.

    Returns
    -------
    dict with keys ``fn``, ``fp``, ``tn``, ``tp`` (int), ``accuracy``,
    ``precision``, ``recall``, ``f1`` (float) and ``roc_auc``, ``pr_auc``
    (float, or None when they cannot be computed).

    Raises
    ------
    ValueError : if the labels do not form a binary (or single-class) problem.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
    elif cm.shape == (1, 1):
        # Only one label occurs in y_true and y_pred combined, so the matrix
        # collapses to 1x1 and cannot be unpacked into four counts. Every
        # sample is then a correct prediction of that single label.
        n_samples = int(cm[0, 0])
        sole_label = np.unique(np.asarray(y_true))[0]
        if sole_label == 1:
            tn, fp, fn, tp = 0, 0, 0, n_samples
        else:
            tn, fp, fn, tp = n_samples, 0, 0, 0
    else:
        raise ValueError(
            f"evaluate_split expects binary labels; got a confusion matrix of shape {cm.shape}"
        )

    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # AUC metrics: require both classes present and a continuous score
    roc = None
    pr_auc = None
    if y_scores is not None and len(np.unique(y_true)) == 2:
        # Best effort: a failing AUC computation leaves that entry as None
        # rather than aborting the whole evaluation.
        try:
            roc = roc_auc_score(y_true, y_scores)
        except Exception:
            roc = None
        try:
            precision, recall, _ = precision_recall_curve(y_true, y_scores)
            # Area under the precision-recall curve, integrated over recall.
            pr_auc = auc(recall, precision)
        except Exception:
            pr_auc = None

    return dict(
        fn=int(fn), fp=int(fp), tn=int(tn), tp=int(tp),
        accuracy=float(acc), precision=float(pre), recall=float(rec), f1=float(f1),
        roc_auc=(None if roc is None else float(roc)),
        pr_auc=(None if pr_auc is None else float(pr_auc)),
    )


def run_model(
    name: str,
    estimator,
    param_grid: Dict[str, Any],
    Xtrain_res: pd.DataFrame,
    Ytrain_res: pd.Series,
    Xtrain: pd.DataFrame,
    Ytrain: pd.Series,
    Xtest: pd.DataFrame,
    Ytest: pd.Series,
    cv_splits: int = 10,
    scoring: str = "f1",
    n_jobs: int = -1,
    save_dir: Path = Path("."),
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Grid-search a model on the resampled training set, then evaluate and persist it.

    Steps: stratified K-fold grid search on ``(Xtrain_res, Ytrain_res)``, refit
    with the best parameters, write the CV table and the fitted model to
    ``save_dir``, and score the refit model on the original train and test splits.

    Parameters
    ----------
    name : label used in log lines and output file names
        (``cv_results_{name}.csv``, ``model_{name}.pkl``).
    estimator : unfitted scikit-learn estimator to tune.
    param_grid : grid passed to ``GridSearchCV``.
    Xtrain_res, Ytrain_res : resampled training features / labels used for the search.
    Xtrain, Ytrain : original training split, used for evaluation only.
    Xtest, Ytest : held-out test split, used for evaluation only.
    cv_splits : number of stratified folds (shuffled, ``random_state=42``).
    scoring : metric name optimised by the grid search.
    n_jobs : parallelism passed to ``GridSearchCV``.
    save_dir : output directory; created if it does not exist.

    Returns
    -------
    ``(train_metrics, test_metrics)`` as produced by ``evaluate_split``.
    """
    # Prepare the output location first, so a missing directory fails (or is
    # fixed) before the expensive grid search rather than after it.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n===== Begin Train: {name} =====")
    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    gs = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=n_jobs, cv=skf, refit=True)
    # Flatten the labels to 1-D, whatever container they arrived in.
    gs.fit(Xtrain_res, np.array(Ytrain_res).ravel())

    print(f"Best {scoring}: {gs.best_score_:.6f} using {gs.best_params_}")
    cv_df = pd.DataFrame(gs.cv_results_)
    cv_path = save_dir / f"cv_results_{name}.csv"
    cv_df.to_csv(cv_path, index=False)
    print(f"Saved CV results to: {cv_path}")

    best = gs.best_estimator_
    # Save model
    model_path = save_dir / f"model_{name}.pkl"
    joblib.dump(best, model_path)
    print(f"Saved best model to: {model_path}")

    # Evaluate on original training set
    pred_tr = best.predict(Xtrain)
    scores_tr = get_scores(best, Xtrain)
    metrics_tr = evaluate_split(Ytrain, pred_tr, scores_tr)
    print(f"[{name}] Training metrics: {metrics_tr}")

    # Evaluate on test set
    pred_te = best.predict(Xtest)
    scores_te = get_scores(best, Xtest)
    metrics_te = evaluate_split(Ytest, pred_te, scores_te)
    print(f"[{name}] Test metrics: {metrics_te}")

    return metrics_tr, metrics_te

In [None]:
# -------------------- Main -------------------- #
def main():
    """
    Tune and evaluate every candidate classifier, then save a test-set summary.

    Reads the six ``{DATA_PREFIX}-*.csv`` split files from the working
    directory, runs ``run_model`` for each classifier (per-model artefacts go
    to ``model_outputs/``), prints the test metrics as a table and writes
    them to ``predictive performance.csv``.
    """
    # Create output directory
    out_dir = Path("model_outputs")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load data produced by your preprocessing pipeline
    Xtrain = safe_index_col(f"{DATA_PREFIX}-Xtrain.csv")
    Xtrain_ = safe_index_col(f"{DATA_PREFIX}-Xtrain_.csv")  # SMOTE-resampled features
    Xtest = safe_index_col(f"{DATA_PREFIX}-Xtest.csv")

    Ytrain = safe_index_col(f"{DATA_PREFIX}-Ytrain.csv").iloc[:, 0]
    # The resampled label file may or may not carry a saved index column.
    # Taking the LAST column yields the labels in both layouts: it is the only
    # column when there is no index, and the one after the index otherwise.
    # (Taking the first column would return the saved index instead.)
    # NOTE(review): assumes the labels are the final column - confirm against
    # the preprocessing step that writes this file.
    ytrain_res_frame = pd.read_csv(f"{DATA_PREFIX}-Ytrain_.csv")
    Ytrain_ = ytrain_res_frame.iloc[:, -1]
    Ytest = safe_index_col(f"{DATA_PREFIX}-Ytest.csv").iloc[:, 0]

    print("Shapes:",
          "Xtrain", Xtrain.shape, "| Ytrain", Ytrain.shape,
          "| Xtrain_", Xtrain_.shape, "| Ytrain_", Ytrain_.shape,
          "| Xtest", Xtest.shape, "| Ytest", Ytest.shape)

    # One (name, estimator, param_grid) entry per model; order defines the
    # row order of the summary table.
    model_specs = [
        (
            "RandomForest",
            RandomForestClassifier(),
            # NOTE(review): scikit-learn documents "entropy" and "log_loss" as the
            # same Shannon information gain criterion - one could be dropped to
            # shrink this grid (here and for the decision tree); confirm first.
            dict(
                n_estimators=[100, 200, 300, 500],
                criterion=["gini", "entropy", "log_loss"],
                max_depth=[None, 10, 20, 50, 100],
                min_samples_split=[2, 5, 10],
                min_samples_leaf=[1, 2, 4],
                n_jobs=[-1],
                random_state=[42],
            ),
        ),
        (
            "LinearSVC",
            # LinearSVC does not support probability; decision_function is used for AUC.
            # Valid combos: penalty='l2', loss in {'hinge','squared_hinge'}.
            # random_state is fixed so repeated runs give the same model, in line
            # with the seeded tree-based grids.
            LinearSVC(dual=True, random_state=42),
            dict(
                loss=["hinge", "squared_hinge"],
                C=[2.0, 1.0, 0.5, 0.2, 0.1],
                tol=[1e-4, 1e-3],
                max_iter=[1000, 2000, 5000],
            ),
        ),
        (
            "DecisionTree",
            DecisionTreeClassifier(),
            dict(
                criterion=["gini", "entropy", "log_loss"],
                splitter=["best", "random"],
                max_depth=[None, 10, 20, 50, 100, 200],
                min_samples_split=[2, 5, 10],
                min_samples_leaf=[1, 2, 4],
                random_state=[42],
            ),
        ),
        (
            "GaussianNB",
            GaussianNB(),
            {},  # empty grid: no hyperparameters tuned for the basic case
        ),
        (
            "KNeighbors",
            KNeighborsClassifier(),
            dict(
                n_neighbors=[3, 5, 7, 9, 11, 15, 21],
                algorithm=["auto", "ball_tree", "kd_tree", "brute"],
                weights=["uniform", "distance"],
                n_jobs=[-1],
            ),
        ),
    ]

    # Containers for aggregated results (keep your original column names)
    out_rows = []
    for model_name, estimator, param_grid in model_specs:
        # Training-split metrics are printed by run_model; only the test-split
        # metrics go into the summary.
        _, test_metrics = run_model(
            name=model_name,
            estimator=estimator,
            param_grid=param_grid,
            Xtrain_res=Xtrain_, Ytrain_res=Ytrain_,
            Xtrain=Xtrain, Ytrain=Ytrain,
            Xtest=Xtest, Ytest=Ytest,
            save_dir=out_dir
        )
        out_rows.append((model_name, test_metrics))

    # ---------------- Aggregate & Save ---------------- #
    # Keep your original column order and naming
    columns = ['fn', 'fp', 'tn', 'tp', 'accuracy', 'f1-score', 'precision', 'recall', 'roc_auc', 'pr_auc']
    # Metric-dict keys in the same order as `columns` ('f1' is shown as 'f1-score').
    metric_keys = ['fn', 'fp', 'tn', 'tp', 'accuracy', 'f1', 'precision', 'recall', 'roc_auc', 'pr_auc']

    # Row labels come from out_rows itself, so labels and data cannot drift apart.
    index_names = [model_name for model_name, _ in out_rows]
    data = [[metrics[key] for key in metric_keys] for _, metrics in out_rows]

    result_df = pd.DataFrame(data, index=index_names, columns=columns)
    print("\n===== Test Set Summary =====")
    print(result_df)

    out_path = Path("predictive performance.csv")
    result_df.to_csv(out_path)
    print(f"\nSaved summary to: {out_path.resolve()}")

    # Quick success guard (as in your original script)
    if len(data) == len(model_specs):
        print("success!")


# Entry point: run the full pipeline when this code is executed as the main
# program; the guard keeps main() from running if the file is imported.
if __name__ == "__main__":
    main()