In [None]:
import os
from collections import defaultdict
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    confusion_matrix,
    roc_auc_score
)
from sklearn.utils import shuffle


# ============================================================
# 0) CPU Random Forest
# ============================================================

print("Using scikit-learn RandomForest on CPU.")


# ============================================================
# 1) Paths and basic settings
# ============================================================

# The defaults are the original Windows locations. Set the NLNSO_DATA_PATH /
# NLNSO_MODELS_DIR environment variables before starting the kernel to run the
# notebook on another machine without editing this cell.
DATA_PATH = Path(os.environ.get(
    "NLNSO_DATA_PATH",
    r"C:\Users\LENOVO\Desktop\sen\sen\DATA_MARGED\FUSED_ALL_FINAL_FROM_DATA_ALL.csv",
))
MODELS_DIR = Path(os.environ.get(
    "NLNSO_MODELS_DIR",
    r"C:\Users\LENOVO\Desktop\sen\sen\models_nested_rf_nlnso_cpu",
))

SUBJECT_COL = "subject"  # column used to build subject-wise folds
LABEL_COL   = "label"    # column holding the class label

RANDOM_STATE = 42        # base seed for fold splitting, undersampling and the forests
N_OUTER_FOLDS = 10
N_INNER_FOLDS = 10

# Create the output directory (and any missing parents) up front so that the
# model / summary writes later in the notebook cannot fail on a missing folder.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
np.random.seed(RANDOM_STATE)


# ============================================================
# 2) Helper functions
# ============================================================

def make_subject_folds(subject_ids, n_folds=10, random_state=42):
    """Partition the distinct subject IDs into `n_folds` shuffled groups.

    The distinct IDs are sorted before the seeded shuffle, so the outcome
    depends only on which subjects are present and on `random_state`, not on
    the order of `subject_ids`. Returns the list of arrays produced by
    `np.array_split`.
    """
    ordered_ids = sorted(np.unique(subject_ids))
    pool = np.array(ordered_ids)
    np.random.RandomState(random_state).shuffle(pool)  # seeded, in place
    return np.array_split(pool, n_folds)


def undersample_multiclass(X, y, max_ratio=3.0, random_state=42):
    """Cap every class at `int(smallest_class_size * max_ratio)` samples.

    Classes at or below the cap keep all their samples; larger classes are
    randomly subsampled without replacement. The retained row indices are
    shuffled before indexing, and the matching rows of `X` and `y` are
    returned as a `(X_kept, y_kept)` pair of arrays.
    """
    features = np.asarray(X)
    targets = np.asarray(y)
    rng = np.random.RandomState(random_state)

    class_values, class_sizes = np.unique(targets, return_counts=True)
    cap = int(class_sizes.min() * max_ratio)  # same limit for every class

    per_class_picks = []
    for value in class_values:
        members = np.nonzero(targets == value)[0]
        take = min(len(members), cap)
        per_class_picks.append(rng.choice(members, size=take, replace=False))

    selected = shuffle(np.concatenate(per_class_picks), random_state=random_state)
    return features[selected], targets[selected]


def get_cpu_rf(params):
    """Build an unfitted RandomForestClassifier from a hyperparameter dict.

    `params` must contain "n_estimators", "max_depth", "max_features" and
    "bootstrap". The Gini criterion, the use of all cores (`n_jobs=-1`) and the
    notebook-wide `RANDOM_STATE` seed are fixed here.
    """
    tuned = {
        name: params[name]
        for name in ("n_estimators", "max_depth", "max_features", "bootstrap")
    }
    return RandomForestClassifier(
        criterion="gini",
        n_jobs=-1,
        random_state=RANDOM_STATE,
        **tuned,
    )


def rf_fit_predict_proba_cpu(X_train, y_train, X_val, params):
    """Fit a forest on the training data and score `X_val`.

    Returns `(fitted_model, class_probabilities_for_X_val)`.
    """
    forest = get_cpu_rf(params)
    forest.fit(X_train, y_train)
    val_proba = forest.predict_proba(X_val)
    return forest, val_proba


def rf_fit_predict_cpu(X_train, y_train, X_test, params):
    """Fit a forest on the training data and classify `X_test`.

    Returns `(fitted_model, predicted_labels_for_X_test)`.
    """
    forest = get_cpu_rf(params)
    forest.fit(X_train, y_train)
    test_labels = forest.predict(X_test)
    return forest, test_labels


def evaluate_metrics(y_true, y_pred, label_set=None):
    """Collect per-class and aggregate classification metrics in one dict.

    `label_set` fixes the class order of the per-class arrays and of the
    confusion matrix; when omitted, the distinct values of `y_true` are used.
    Accuracy, balanced accuracy and the averaged F1 scores are computed without
    an explicit label list.
    """
    class_order = np.unique(y_true) if label_set is None else label_set

    per_class = precision_recall_fscore_support(
        y_true, y_pred, labels=class_order, zero_division=0
    )

    report = {"labels": class_order}
    report.update(zip(("precision", "recall", "f1", "support"), per_class))

    report["accuracy"] = accuracy_score(y_true, y_pred)
    report["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)

    f1_variants = (
        ("micro_f1", "micro"),
        ("macro_f1", "macro"),
        ("weighted_f1", "weighted"),
    )
    for key, averaging in f1_variants:
        report[key] = f1_score(y_true, y_pred, average=averaging)

    report["cm"] = confusion_matrix(y_true, y_pred, labels=class_order)
    return report


def print_metrics(m, header=""):
    """Pretty-print a metrics dictionary such as the one built by `evaluate_metrics`.

    Parameters
    ----------
    m : dict
        Must provide the keys read below: "labels", "precision", "recall",
        "f1", "support", "accuracy", "balanced_accuracy", "micro_f1",
        "macro_f1", "weighted_f1" and "cm" (square confusion matrix whose rows
        and columns follow the order of "labels").
    header : str
        Title printed between two separator lines.

    Labels are rendered with `str()` and right-aligned to width 5, so integer
    labels print exactly as before while string or float labels no longer
    raise a formatting error.
    """
    print("\n" + "=" * 70)
    print(header)
    print("=" * 70)

    labels = m["labels"]
    cm = np.asarray(m["cm"])

    print("\nPer-class metrics:")
    print("Label | Precision | Recall | F1 | Support")
    for i, c in enumerate(labels):
        print(f"{str(c):>5} | {m['precision'][i]:.4f} | {m['recall'][i]:.4f} | {m['f1'][i]:.4f} | {m['support'][i]}")

    print("\nGlobal metrics:")
    print(f"Accuracy          : {m['accuracy']:.4f}")
    print(f"Balanced Accuracy : {m['balanced_accuracy']:.4f}")
    print(f"Micro-F1          : {m['micro_f1']:.4f}")
    print(f"Macro-F1          : {m['macro_f1']:.4f}")
    print(f"Weighted-F1       : {m['weighted_f1']:.4f}")

    print("\nConfusion Matrix:")
    print(cm)

    # One-vs-rest counts per class: the diagonal is TP, the rest of the column
    # is FP, the rest of the row is FN, and everything else is TN.
    total = cm.sum()
    print("\nTP / FP / FN / TN per class:")
    for idx, c in enumerate(labels):
        TP = cm[idx, idx]
        FP = cm[:, idx].sum() - TP
        FN = cm[idx, :].sum() - TP
        TN = total - (TP + FP + FN)
        print(f"Class {c}: TP={TP}, FP={FP}, FN={FN}, TN={TN}")


def compute_inner_auc(
    X, y, subjects, label_set, param_grid, n_inner_folds=10, max_ratio=3.0
):
    """Select the hyperparameter combination with the best inner-loop ROC AUC.

    For every entry of `param_grid`, a subject-wise cross-validation is run:
    the training part of each inner fold is undersampled, a forest is fitted,
    and the macro one-vs-rest ROC AUC is computed on the held-out subjects.
    The combination with the highest mean AUC over its usable folds wins.

    Parameters
    ----------
    X, y, subjects : array-like, indexed with boolean masks
        Row-aligned features, labels and subject IDs of the outer training set.
    label_set : array-like
        Class labels passed to `roc_auc_score` as `labels`.
    param_grid : list of dict
        Candidate hyperparameter dictionaries accepted by `get_cpu_rf`.
    n_inner_folds : int
        Number of subject-wise inner folds.
    max_ratio : float
        Undersampling ratio forwarded to `undersample_multiclass`.

    Returns
    -------
    (best_params, best_auc) : (dict, float)

    Raises
    ------
    ValueError
        If no combination obtained a usable ROC AUC on any inner fold.
    """
    inner_folds = make_subject_folds(subjects, n_folds=n_inner_folds, random_state=RANDOM_STATE)
    combo_scores = []

    print("\n------------------------------------------------------")
    print("Inner loop ROC AUC hyperparameter tuning")
    print("------------------------------------------------------")

    for idx, params in enumerate(param_grid):
        fold_aucs = []

        for inner_idx, val_subj in enumerate(inner_folds):
            is_val = np.isin(subjects, val_subj)
            if not is_val.any():
                # Empty fold (more folds requested than subjects available):
                # there is nothing to validate on, so skip it.
                continue
            is_train = ~is_val

            X_tr = X[is_train]
            y_tr = y[is_train]
            X_val = X[is_val]
            y_val = y[is_val]

            # Undersample the training part only; the seed varies per fold.
            X_tr_bal, y_tr_bal = undersample_multiclass(
                X_tr, y_tr, max_ratio=max_ratio, random_state=RANDOM_STATE + inner_idx
            )

            _, y_proba = rf_fit_predict_proba_cpu(X_tr_bal, y_tr_bal, X_val, params)

            try:
                auc = roc_auc_score(
                    y_val, y_proba, labels=label_set,
                    multi_class="ovr", average="macro"
                )
            except ValueError as exc:
                # The score is undefined for this fold (e.g. a class is missing
                # from the validation subjects). Report it instead of hiding it,
                # and let real interrupts / unrelated errors propagate.
                print(f"  [combo {idx+1}, inner fold {inner_idx+1}] ROC AUC undefined, fold skipped: {exc}")
                continue

            # NOTE(review): some scikit-learn versions may return NaN rather
            # than raising for an undefined score - TODO confirm. Either way a
            # non-finite value must not poison the mean of the other folds.
            if np.isfinite(auc):
                fold_aucs.append(auc)

        mean_auc = np.mean(fold_aucs) if len(fold_aucs) > 0 else np.nan
        combo_scores.append((params, mean_auc))
        print(f"Combo {idx+1}/{len(param_grid)} {params} → mean ROC AUC = {mean_auc:.4f}")

    # choose best among the combinations that produced at least one valid score
    valid = [c for c in combo_scores if not np.isnan(c[1])]
    if not valid:
        raise ValueError(
            "No hyperparameter combination produced a valid inner ROC AUC; "
            "check that param_grid is non-empty and that every class appears "
            "in the inner validation folds."
        )
    best_params, best_auc = max(valid, key=lambda x: x[1])

    print("\nBest inner params:", best_params)
    print("Best inner ROC AUC:", best_auc)

    return best_params, best_auc


# ============================================================
# 3) Load dataset
# ============================================================

print("\nLoading dataset...")
df = pd.read_csv(DATA_PATH)
print("Data shape:", df.shape)


# ============================================================
# 4) Keep only NUMERIC columns as model features
# ============================================================

print("\nFiltering numeric columns only...")

# Non-numeric columns never reach the feature matrix. The subject and label
# columns are removed from the features as well, but are still read from the
# unfiltered frame `df` further down.
df_numeric = df.select_dtypes(include=[np.number]).copy()

meta_cols = [col for col in (SUBJECT_COL, LABEL_COL) if col in df_numeric.columns]
feature_cols = [col for col in df_numeric.columns if col not in meta_cols]

print(f"Numeric feature columns: {len(feature_cols)}")

X_all = df_numeric[feature_cols].values
y_all = df[LABEL_COL].values
subjects_all = df[SUBJECT_COL].values
label_set = np.unique(y_all)  # sorted distinct labels, shared by every fold




# ============================================================
# 5) Hyperparameter grid
# ============================================================

# Every n_estimators x max_depth x max_features combination, with bootstrap
# sampling always enabled. n_estimators varies slowest, max_features fastest.
param_grid = [
    {
        "n_estimators": n_trees,
        "max_depth": depth,
        "max_features": feature_rule,
        "bootstrap": True,
    }
    for n_trees in (300, 400, 500)
    for depth in (16, 24, 32)
    for feature_rule in ("sqrt",)
]

print("\nTotal hyperparameter combinations:", len(param_grid))


# ============================================================
# 6) Outer Loop (N_OUTER_FOLDS subject-wise folds)
# ============================================================

outer_folds = make_subject_folds(subjects_all, n_folds=N_OUTER_FOLDS, random_state=RANDOM_STATE)

# Pooled test labels / predictions over all outer folds (for the global report).
all_true = []
all_pred = []

# Per-fold bookkeeping: winning inner-loop params, their inner AUC, and the
# macro-F1 obtained on the outer test subjects.
outer_best_params = []
outer_best_auc = []
outer_f1 = []

for outer_idx, test_subj in enumerate(outer_folds, start=1):

    print("\n" + "#"*80)
    # Report the real number of folds instead of a hard-coded "10".
    print(f"OUTER Fold {outer_idx}/{len(outer_folds)}")
    print("#"*80)

    # Subject-wise split: every row of a test subject goes to the test set.
    is_test = np.isin(subjects_all, test_subj)
    is_trainval = ~is_test

    X_trval = X_all[is_trainval]
    y_trval = y_all[is_trainval]
    subj_trval = subjects_all[is_trainval]

    X_test = X_all[is_test]
    y_test = y_all[is_test]

    print("Train+Val subjects:", len(np.unique(subj_trval)))
    print("Test subjects:", len(np.unique(test_subj)))

    # -------------------------
    # INNER LOOP (AUC tuning)
    # -------------------------
    best_params, best_auc = compute_inner_auc(
        X_trval, y_trval, subj_trval,
        label_set, param_grid,
        n_inner_folds=N_INNER_FOLDS
    )

    outer_best_params.append(best_params)
    outer_best_auc.append(best_auc)

    # -------------------------
    # Train FINAL model for this outer fold
    # -------------------------
    # Only the training side is undersampled; the test subjects keep their
    # original class distribution.
    X_trval_bal, y_trval_bal = undersample_multiclass(
        X_trval, y_trval, max_ratio=3.0, random_state=RANDOM_STATE + outer_idx
    )

    model, y_pred = rf_fit_predict_cpu(X_trval_bal, y_trval_bal, X_test, best_params)

    # save model
    model_path = MODELS_DIR / f"rf_outer_{outer_idx:02d}.pkl"
    joblib.dump(model, model_path)
    print("Saved model:", model_path)

    # evaluation
    m = evaluate_metrics(y_test, y_pred, label_set)
    print_metrics(m, header=f"OUTER Fold {outer_idx} Test Metrics")

    all_true.append(y_test)
    all_pred.append(y_pred)
    outer_f1.append(m["macro_f1"])


# ============================================================
# 7) GLOBAL METRICS
# ============================================================

# Pool the held-out predictions of every outer fold into one evaluation.
all_true = np.concatenate(all_true)
all_pred = np.concatenate(all_pred)

global_m = evaluate_metrics(all_true, all_pred, label_set)
# The header follows N_OUTER_FOLDS so it stays correct if the fold count changes.
print_metrics(global_m, header=f"GLOBAL Metrics Across All {N_OUTER_FOLDS} OUTER Folds")


# ============================================================
# 8) Save AUC summary
# ============================================================

# One row per outer fold: the winning inner-loop mean ROC AUC followed by the
# hyperparameters that achieved it.
rows = [
    {
        "outer_fold": fold_no,
        "best_inner_mean_roc_auc": inner_auc,
        "n_estimators": chosen["n_estimators"],
        "max_depth": chosen["max_depth"],
        "max_features": chosen["max_features"],
        "bootstrap": chosen["bootstrap"],
    }
    for fold_no, (chosen, inner_auc) in enumerate(
        zip(outer_best_params, outer_best_auc), start=1
    )
]

auc_path = MODELS_DIR / "inner_auc_summary.csv"
auc_df = pd.DataFrame(rows)
auc_df.to_csv(auc_path, index=False)

print("\nSaved inner AUC summary to:", auc_path)


# ============================================================
# 9) FINAL DEPLOYMENT MODEL
# ============================================================

# Vote across the outer folds: each fold contributes its winning parameter
# set (as a hashable, key-sorted tuple) together with its outer macro-F1.
count = defaultdict(int)
sum_f1 = defaultdict(float)

for params, f1_val in zip(outer_best_params, outer_f1):
    key = tuple(sorted(params.items()))
    count[key] += 1
    sum_f1[key] += float(f1_val)

# Most frequently selected combination wins; ties are broken by the larger
# summed macro-F1, and remaining ties by first occurrence.
best_key = max(count, key=lambda k: (count[k], sum_f1[k]))
best_count = count[best_key]
best_f1sum = sum_f1[best_key]

final_params = dict(best_key)

print("\nSelected FINAL deployment hyperparameters:", final_params)

# undersample full data
X_full_bal, y_full_bal = undersample_multiclass(
    X_all, y_all, max_ratio=3.0, random_state=999
)

# Fit directly: nothing is evaluated here, so predicting on the whole training
# set (as the fit-and-predict helper would do) is wasted work.
final_model = get_cpu_rf(final_params)
final_model.fit(X_full_bal, y_full_bal)

final_path = MODELS_DIR / "rf_final_deployment.pkl"
joblib.dump(final_model, final_path)

print("\nFinal deployment model saved to:", final_path)
print("Use this model for real-time predictions.")


Using scikit-learn RandomForest on CPU.

Loading dataset...
Data shape: (1622677, 86)

Filtering numeric columns only...
Numeric feature columns: 83

Total hyperparameter combinations: 9

################################################################################
OUTER Fold 1/10
################################################################################
Train+Val subjects: 101
Test subjects: 12

------------------------------------------------------
Inner loop ROC AUC hyperparameter tuning
------------------------------------------------------
Combo 1/9 {'n_estimators': 300, 'max_depth': 16, 'max_features': 'sqrt', 'bootstrap': True} → mean ROC AUC = 0.6245
Combo 2/9 {'n_estimators': 300, 'max_depth': 24, 'max_features': 'sqrt', 'bootstrap': True} → mean ROC AUC = 0.6225
Combo 3/9 {'n_estimators': 300, 'max_depth': 32, 'max_features': 'sqrt', 'bootstrap': True} → mean ROC AUC = 0.6196
Combo 4/9 {'n_estimators': 400, 'max_depth': 16, 'max_features': 'sqrt', 'bootstrap': True} 

In [2]:
# Re-display the pooled metrics; the header follows N_OUTER_FOLDS rather than a
# hard-coded fold count.
print_metrics(global_m, header=f"GLOBAL Metrics Across All {N_OUTER_FOLDS} OUTER Folds")



GLOBAL Metrics Across All 10 OUTER Folds

Per-class metrics:
Label | Precision | Recall | F1 | Support
    0 | 0.8210 | 0.8730 | 0.8462 | 1312916
    1 | 0.2514 | 0.1511 | 0.1887 | 289844
    2 | 0.1142 | 0.3001 | 0.1654 | 19917

Global metrics:
Accuracy          : 0.7370
Balanced Accuracy : 0.4414
Micro-F1          : 0.7370
Macro-F1          : 0.4001
Weighted-F1       : 0.7204

Confusion Matrix:
[[1146200  128745   37971]
 [ 237664   43787    8393]
 [  12309    1631    5977]]

TP / FP / FN / TN per class:
Class 0: TP=1146200, FP=249973, FN=166716, TN=59788
Class 1: TP=43787, FP=130376, FN=246057, TN=1202457
Class 2: TP=5977, FP=46364, FN=13940, TN=1556396


In [None]:
from pprint import pprint

# Persist the pooled metrics dictionary as pretty-printed text next to the models.
global_metrics_path = MODELS_DIR / "global_metrics_summary.txt"
with global_metrics_path.open("w", encoding="utf-8") as f:
    pprint(global_m, stream=f)

print("\nSaved GLOBAL metrics to:", global_metrics_path)



Saved GLOBAL metrics to: C:\Users\LENOVO\Desktop\sen\sen\models_nested_rf_nlnso_cpu\global_metrics_summary.txt
