In [2]:
# Full corrected end-to-end pipeline (Jupyter cell)
# - Apnea mapped to positive class (1)
# - Imblearn RandomOverSampler used if installed, else manual upsample
# - RandomizedSearchCV optimizing recall (3-fold)
# - Efficient threshold selection using precision_recall_curve
# - Saves plots (.png) and metrics (.json)

import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.calibration import calibration_curve
from sklearn.base import clone
import joblib
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

# ---------------------------
# User-editable paths & options
# ---------------------------
TRAIN_CSV = "/scratch/sshuvo13/project_shared_folder_bspml_1/segments_30s/features/train_test_separated_and_combined/male/feature33/train_data.csv"   # <- change to your training csv path
TEST_CSV  = "/scratch/sshuvo13/project_shared_folder_bspml_1/segments_30s/features/train_test_separated_and_combined/male/feature33/test_data.csv"    # <- change to your test csv path

OUT_DIR   = "model_outputs"
N_JOBS    = -1
RANDOM_STATE = 42
FEATURE_COUNT = 33   # first 33 columns are features

os.makedirs(OUT_DIR, exist_ok=True)

def save_fig(fig, name):
    path = Path(OUT_DIR) / name
    fig.savefig(path, bbox_inches='tight', dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")

def save_json(obj, name):
    path = Path(OUT_DIR) / name
    with open(path, "w") as f:
        json.dump(obj, f, indent=2, default=lambda o: float(o) if isinstance(o, (np.floating, np.integer)) else str(o))
    print(f"Saved: {path}")

# ---------------------------
# Try imblearn
# ---------------------------
USE_IMBLEARN = True
try:
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
    print("imblearn found: using RandomOverSampler inside CV pipelines.")
except Exception:
    print("imblearn not available - will fall back to manual upsampling.")
    USE_IMBLEARN = False

# ---------------------------
# Load data
# ---------------------------
print("Loading data...")
train = pd.read_csv(TRAIN_CSV, low_memory=False)
test  = pd.read_csv(TEST_CSV, low_memory=False)
print(f"Train shape: {train.shape}; Test shape: {test.shape}")

X_train = train.iloc[:, :FEATURE_COUNT].copy()
y_train_raw = train.iloc[:, -1].astype(str).copy()
X_test  = test.iloc[:, :FEATURE_COUNT].copy()
y_test_raw = test.iloc[:, -1].astype(str).copy()

# Cast numeric columns to float32 where possible to save memory
for df in (X_train, X_test):
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype(np.float32)

# ---------------------------
# Map labels so Apnea=1, Normal=0
# ---------------------------
y_train = (y_train_raw == "Apnea").astype(int).values
y_test  = (y_test_raw  == "Apnea").astype(int).values
meta = {
    "timestamp": datetime.utcnow().isoformat() + "Z",
    "train_shape": train.shape,
    "test_shape": test.shape,
    "feature_count": FEATURE_COUNT,
    "label_map": {"Apnea": 1, "Normal": 0},
    "train_pos_fraction": float(y_train.mean()),
    "test_pos_fraction": float(y_test.mean())
}
save_json(meta, "metadata.json")

# ---------------------------
# Manual safe upsampling fallback
# ---------------------------
def simple_upsample(X_df, y_arr, random_state=RANDOM_STATE):
    df = pd.DataFrame(X_df.copy())
    df['_y_'] = y_arr
    counts = df['_y_'].value_counts()
    if len(counts) == 1:
        return X_df.values, y_arr
    max_count = counts.max()
    parts = []
    for cls, cnt in counts.items():
        sub = df[df['_y_'] == cls]
        if cnt < max_count:
            sub_up = sub.sample(n=max_count, replace=True, random_state=random_state)
            parts.append(sub_up)
        else:
            parts.append(sub)
    df_up = pd.concat(parts).sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    y_up = df_up['_y_'].values.astype(int)
    X_up = df_up.drop(columns=['_y_']).values.astype(np.float32)
    return X_up, y_up

# Prepare sampler or balanced data
if USE_IMBLEARN:
    sampler = RandomOverSampler(random_state=RANDOM_STATE)
else:
    X_train_bal, y_train_bal = simple_upsample(X_train, y_train)
    print("Manual upsampled training shape:", X_train_bal.shape, y_train_bal.shape)

# ---------------------------
# Build pipelines and small param grids
# ---------------------------
base_steps = [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]

# Logistic Regression (class_weight balanced)
pipe_lr = Pipeline(base_steps + [("clf", LogisticRegression(solver="liblinear", class_weight="balanced", random_state=RANDOM_STATE, max_iter=200))])
param_dist_lr = {"clf__C": [0.01, 0.1, 1.0, 10.0]}

# Random Forest (class_weight balanced)
pipe_rf = Pipeline(base_steps + [("clf", RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced", n_jobs=1))])
param_dist_rf = {"clf__n_estimators": [50, 100], "clf__max_depth": [5, 10, None]}

# HistGradientBoosting (no class_weight param typically)
pipe_hgb = Pipeline(base_steps + [("clf", HistGradientBoostingClassifier(random_state=RANDOM_STATE))])
param_dist_hgb = {"clf__learning_rate": [0.01, 0.1], "clf__max_iter": [100, 200], "clf__max_depth": [3, 7, None]}

models = [
    ("logreg", pipe_lr, param_dist_lr),
    ("rf",     pipe_rf, param_dist_rf),
    ("hgb",    pipe_hgb, param_dist_hgb),
]

# If using imblearn, create imblearn pipelines that include sampler
if USE_IMBLEARN:
    models_with_sampler = []
    for name, pipe, params in models:
        imb_pipe = ImbPipeline([("sampler", sampler)] + pipe.steps)
        models_with_sampler.append((name, imb_pipe, params))
    models = models_with_sampler

# ---------------------------
# RandomizedSearchCV (3-fold) optimizing recall (Apnea)
# ---------------------------
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
best_estimators = {}
cv_results_summary = {}

for name, pipe, param_dist in models:
    print(f"\nTuning {name} ... (scoring=recall for Apnea)")
    rs = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=min(8, max(3, sum(len(v) for v in param_dist.values()))),
        scoring="roc_auc",
        cv=cv,
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        verbose=1,
        return_train_score=False
    )
    if USE_IMBLEARN:
        rs.fit(X_train, y_train)
    else:
        rs.fit(X_train_bal, y_train_bal)
    print(f"Best {name} recall (CV): {rs.best_score_:.4f}")
    print(f"Best params for {name}: {rs.best_params_}")
    best_estimators[name] = rs.best_estimator_
    # save compact cv results
    cv_res = rs.cv_results_.copy()
    def tidy(v):
        if isinstance(v, np.ndarray): return v.tolist()
        if isinstance(v, (np.float32, np.float64, np.int32, np.int64)): return float(v)
        return v
    cv_results_summary[name] = {k: tidy(v) for k, v in cv_res.items() if k in ["mean_test_score", "params", "rank_test_score", "std_test_score"]}
    save_json({"best_score": float(rs.best_score_), "best_params": rs.best_params_}, f"{name}_best_params.json")

save_json(cv_results_summary, "cv_results_summary.json")

# ---------------------------
# Refit chosen estimators on full training set
# ---------------------------
print("\nRefitting chosen estimators on full training data...")
for name, est in best_estimators.items():
    print(f"Refitting {name} ...")
    if USE_IMBLEARN:
        est.fit(X_train, y_train)
    else:
        est.fit(X_train_bal, y_train_bal)

# Build stacking classifier (use cloned fitted base estimators)
estimators_for_stack = [(name, clone(est)) for name, est in best_estimators.items()]
final_estimator = LogisticRegression(solver="liblinear", class_weight="balanced", random_state=RANDOM_STATE)
stack_clf = StackingClassifier(estimators=estimators_for_stack, final_estimator=final_estimator, n_jobs=N_JOBS, passthrough=False, verbose=0)

# Fit stacking on full balanced data (sampler included inside pipelines if USE_IMBLEARN)
if USE_IMBLEARN:
    stack_clf.fit(X_train, y_train)
else:
    stack_clf.fit(X_train_bal, y_train_bal)

# Save models
joblib.dump(stack_clf, Path(OUT_DIR) / "stacking_model_improved.joblib")
for name, est in best_estimators.items():
    joblib.dump(est, Path(OUT_DIR) / f"{name}_best_improved.joblib")
print("Saved models.")

# ---------------------------
# Evaluate on test set
# ---------------------------
print("\nEvaluating on test set...")
y_proba = stack_clf.predict_proba(X_test)[:, 1]
y_pred_default = (y_proba >= 0.5).astype(int)

metrics_default = {
    "accuracy": accuracy_score(y_test, y_pred_default),
    "precision": precision_score(y_test, y_pred_default, zero_division=0),
    "recall": recall_score(y_test, y_pred_default, zero_division=0),
    "f1": f1_score(y_test, y_pred_default, zero_division=0),
    "roc_auc": float(auc(*roc_curve(y_test, y_proba)[:2])),
    "average_precision": float(average_precision_score(y_test, y_proba)),
    "confusion_matrix": confusion_matrix(y_test, y_pred_default).tolist()
}
metrics_default["classification_report"] = classification_report(y_test, y_pred_default, output_dict=True, zero_division=0)
save_json(metrics_default, "test_metrics_default_threshold.json")
print("Default-threshold metrics saved.")

# ---------------------------
# Efficient threshold selection using precision_recall_curve
# ---------------------------
prec_vals, rec_vals, pr_thresholds = precision_recall_curve(y_test, y_proba)
# align arrays: prec_vals[:-1], rec_vals[:-1] correspond to pr_thresholds
eps = 1e-12
p = prec_vals[:-1]
r = rec_vals[:-1]
f1s = 2 * p * r / (p + r + eps)

best_idx = int(np.nanargmax(f1s))
best_thresh = float(pr_thresholds[best_idx])
best_f1 = float(f1s[best_idx])
best_prec = float(p[best_idx])
best_rec = float(r[best_idx])

save_json({
    "best_threshold_by_f1": best_thresh,
    "best_f1": best_f1,
    "precision_at_best": best_prec,
    "recall_at_best": best_rec,
    "n_pr_thresholds": int(len(pr_thresholds))
}, "best_threshold_summary.json")
print(f"Selected best threshold by PR-F1: {best_thresh:.4f} (F1={best_f1:.4f}, P={best_prec:.4f}, R={best_rec:.4f})")

# Evaluate with chosen threshold
y_pred_best = (y_proba >= best_thresh).astype(int)
metrics_best = {
    "threshold": best_thresh,
    "accuracy": accuracy_score(y_test, y_pred_best),
    "precision": precision_score(y_test, y_pred_best, zero_division=0),
    "recall": recall_score(y_test, y_pred_best, zero_division=0),
    "f1": f1_score(y_test, y_pred_best, zero_division=0),
    "roc_auc": float(auc(*roc_curve(y_test, y_proba)[:2])),
    "average_precision": float(average_precision_score(y_test, y_proba)),
    "confusion_matrix": confusion_matrix(y_test, y_pred_best).tolist()
}
metrics_best["classification_report"] = classification_report(y_test, y_pred_best, output_dict=True, zero_division=0)
save_json(metrics_best, "test_metrics_best_threshold.json")
print("Best-threshold metrics saved.")

# ---------------------------
# Plots: ROC, PR (with best point), Confusion matrix (best threshold), Calibration
# ---------------------------
# ROC
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
fig = plt.figure()
plt.plot(fpr, tpr, lw=2, label=f'Stacking (AUC = {roc_auc:.4f})')
plt.plot([0,1], [0,1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Apnea=1)'); plt.legend(loc='lower right')
save_fig(fig, "roc_curve_improved.png")

# Precision-Recall (plot and mark best)
ap = average_precision_score(y_test, y_proba)
fig = plt.figure()
plt.plot(rec_vals, prec_vals, lw=2, label=f'Stacking (AP={ap:.4f})')
plt.scatter([best_rec], [best_prec], color='red', label=f'Best F1 thr={best_thresh:.3f}')
plt.xlabel('Recall'); plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Apnea=1)'); plt.legend(loc='lower left')
save_fig(fig, "precision_recall_curve_improved.png")

# Confusion matrix for best threshold
cm = confusion_matrix(y_test, y_pred_best)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
ax.set_xlabel('Predicted'); ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix (threshold={best_thresh:.3f})')
save_fig(fig, "confusion_matrix_improved.png")

# Calibration curve
prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)
fig = plt.figure()
plt.plot(prob_pred, prob_true, marker='o', linewidth=1, label='Stacking')
plt.plot([0,1],[0,1], linestyle='--', color='gray')
plt.xlabel('Mean predicted probability'); plt.ylabel('Fraction of positives')
plt.title('Calibration Curve (Apnea=1)'); plt.legend()
save_fig(fig, "calibration_curve_improved.png")

# ---------------------------
# Feature importances from tree-based models (if available)
# ---------------------------
feature_names = list(X_train.columns.astype(str))
fi_summary = {}
for name, est in best_estimators.items():
    try:
        clf = None
        if hasattr(est, 'named_steps'):
            if 'clf' in est.named_steps:
                clf = est.named_steps['clf']
            else:
                clf = list(est.named_steps.values())[-1]
        else:
            clf = est
        if hasattr(clf, "feature_importances_"):
            importances = clf.feature_importances_
            idx = np.argsort(importances)[::-1]
            topk = min(20, len(importances))
            top_features = [{"feature": feature_names[i], "importance": float(importances[i])} for i in idx[:topk]]
            fi_summary[name] = top_features
            fig, ax = plt.subplots(figsize=(8, max(3, topk * 0.3)))
            sns.barplot(x=[f["importance"] for f in top_features], y=[f["feature"] for f in top_features], ax=ax)
            ax.set_title(f"Top {topk} feature importances: {name}")
            save_fig(fig, f"{name}_feature_importances_improved.png")
    except Exception as e:
        print(f"No feature importances for {name} or failed: {e}")

if fi_summary:
    save_json(fi_summary, "feature_importances_improved.json")

# Save predictions
pred_df = pd.DataFrame({"true": y_test, "pred_default": y_pred_default, "pred_best": y_pred_best, "proba": y_proba})
pred_df.to_csv(Path(OUT_DIR) / "test_predictions_improved.csv", index=False)
print(f"Saved predictions to {Path(OUT_DIR) / 'test_predictions_improved.csv'}")

# Final summary
final_summary = {
    "timestamp": datetime.utcnow().isoformat() + "Z",
    "default_threshold_metrics": metrics_default,
    "best_threshold_metrics": metrics_best,
    "best_threshold": best_thresh,
    "cv_results_summary": cv_results_summary,
    "used_imblearn": USE_IMBLEARN
}
save_json(final_summary, "final_summary_improved.json")

print("\nDone. All outputs are in:", OUT_DIR)


imblearn found: using RandomOverSampler inside CV pipelines.
Loading data...
Train shape: (103302, 34); Test shape: (27491, 34)
Saved: model_outputs/metadata.json

Tuning logreg ... (scoring=recall for Apnea)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best logreg recall (CV): 0.6412
Best params for logreg: {'clf__C': 10.0}
Saved: model_outputs/logreg_best_params.json

Tuning rf ... (scoring=recall for Apnea)
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best rf recall (CV): 0.8292
Best params for rf: {'clf__n_estimators': 100, 'clf__max_depth': None}
Saved: model_outputs/rf_best_params.json

Tuning hgb ... (scoring=recall for Apnea)
Fitting 3 folds for each of 7 candidates, totalling 21 fits
Best hgb recall (CV): 0.8074
Best params for hgb: {'clf__max_iter': 200, 'clf__max_depth': 7, 'clf__learning_rate': 0.1}
Saved: model_outputs/hgb_best_params.json
Saved: model_outputs/cv_results_summary.json

Refitting chosen estimators on full training data...
Refitt