# RUN EXPERIMENTS

In [None]:
import sys
import numpy as np
import pandas as pd

# Report which interpreter and which library versions this kernel is using.
for label, detail in (
    ("python:", sys.executable),
    ("numpy:", np.__version__),
    ("pandas:", pd.__version__),
):
    print(label, detail)


In [None]:
#!pip install xgboost
!pip install -U xgboost matplotlib shap


In [None]:
!pip install matplotlib


In [None]:
from pathlib import Path

from modeling_pipeline_utils import load_feature_csv, run_experiments

# Locations of the input CSVs. These absolute paths are machine-specific:
# edit them to wherever the two files live on your machine.
train_csv = r"C:\Users\albav\Desktop\3004 Project\train_merged_LE_RE.csv"
test_csv = r"C:\Users\albav\Desktop\3004 Project\external_test.csv"

# Fail fast with a readable message rather than an error raised from deep
# inside the loader when a path is wrong.
missing_csvs = [p for p in (train_csv, test_csv) if not Path(p).is_file()]
if missing_csvs:
    raise FileNotFoundError(
        "Input CSV not found; update train_csv/test_csv in this cell: "
        + ", ".join(missing_csvs)
    )

X_train, y_train, X_test, y_test = load_feature_csv(train_csv, test_csv)

# Set FAST_RUN = True for a quicker run: it passes nsamples_ci=200 instead of
# 2000. (nsamples_ci is presumably the resample count behind the confidence
# intervals - confirm in modeling_pipeline_utils.)
FAST_RUN = False

results, metrics_table, jaccard = run_experiments(
    X_train, y_train, X_test, y_test,
    top_n=10,
    nsamples_ci=200 if FAST_RUN else 2000,
)

# Results table

In [None]:
display(metrics_table)

# Sanity checks

In [None]:

# Sanity check: how many runs are in `results`.
len(results)
# Author's rule of thumb: 4-8 runs for a sanity run,
# 10-20 runs for the full experiment set.

# Information per model
- Change results[0] to inspect the different models

In [None]:
# Pick the run to inspect; change the index to look at a different one.
res = results[0]

# Headline fields of the chosen run, one per line.
for caption, field_name in (
    ("Model:", "model_name"),
    ("Selector:", "selector_name"),
    ("Optimal threshold:", "optimal_threshold"),
):
    print(caption, getattr(res, field_name))

# Selected feature names, one per line.
print("\nSelected features:")
feature_lines = "\n".join(res.selected_features)
print(feature_lines)

# Test-set metrics, rendered by the notebook's rich display.
print("\nMetrics:")
display(res.metrics_test)


In [None]:
# List the attribute names stored on the selected run object.
print(vars(res).keys())


In [None]:
# Show the type of the run's `extra` payload, then the payload itself.
extra_payload = res.extra
print(type(extra_payload), extra_payload, sep="\n")


# Confusion matrices

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# One figure per run, drawn from the run's stored `confusion_matrix_norm`.
# The loop variable is deliberately not called `res`: that name holds the run
# picked in the inspection cell, and reusing it here would silently leave it
# pointing at the last run once the loop finishes.
for run in results:
    fig, ax = plt.subplots(figsize=(4, 4))
    ConfusionMatrixDisplay(
        confusion_matrix=run.confusion_matrix_norm,
        display_labels=[0, 1],
    ).plot(ax=ax, values_format=".2f")  # two decimals per cell
    ax.set_title(f"{run.model_name} | {run.selector_name}")
    fig.tight_layout()
    plt.show()


In [None]:
!pip install shap

In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
import numpy as np
import shap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# SHAP explanation of a logistic regression refit on one run's selected features.
#
# 1) choose which run supplies the feature set
# NOTE: only res.selected_features is used below. The model being explained is
# always the logistic regression refit in step 3, whatever res.model_name says.
res = results[0]  # change index to use a different run's selected features

# 2) build selected-feature matrix (a copy, so X_train itself is not modified)
# X_train is presumably a pandas DataFrame (column-list indexing, .values, .columns).
X_train_sel = X_train[res.selected_features].copy()

# 3) refit a logistic regression (features standardised first) on those selected features
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=5000, solver="liblinear"))
])
clf.fit(X_train_sel, y_train)

# 4) SHAP background: 100 sampled training rows (fixed seed) to keep it manageable
X_bg = shap.utils.sample(X_train_sel, 100, random_state=27)

# 5) explain predict_proba (binary classification -> 2 classes)
# Every row of the selected-feature training matrix is explained.
explainer = shap.Explainer(clf.predict_proba, X_bg)
shap_values = explainer(X_train_sel)

# 6) Robustly extract "class 1" SHAP values and wrap into an Explanation object
vals = shap_values.values
base = shap_values.base_values

# Printed so the layout handled by the branch below can be checked by eye.
print("shap_values.values shape:", vals.shape)

# vals can be (n_samples, n_features, n_classes) OR (n_samples, n_features)
if vals.ndim == 3:
    # Keep only the last-axis entry at index 1 (the second class).
    shap_class1 = vals[:, :, 1]
    if np.ndim(base) == 2:
        # Base values carry one column per class as well; take the same class.
        base_class1 = base[:, 1]
    else:
        base_class1 = base
else:
    # Already one value per (sample, feature); use as-is.
    shap_class1 = vals
    base_class1 = base

# Re-wrap the class-1 slice so the plotting functions receive an Explanation.
sv1 = shap.Explanation(
    values=shap_class1,
    base_values=base_class1,
    data=X_train_sel.values,
    feature_names=list(X_train_sel.columns)
)

# 7) Plots
# Beeswarm over all explained rows, showing at most 15 features.
shap.plots.beeswarm(sv1, max_display=15)

# Waterfall for a single row; the fixed seed picks the same row on every run.
idx = np.random.RandomState(27).randint(0, X_train_sel.shape[0])
print("case index:", idx)
shap.plots.waterfall(sv1[idx])


In [None]:
#SHAP VALUES VANILLE

#X100 = shap.utils.sample(reduced_features_train_rfecv, 100, random_state=27) # I am not yet sure what the optimal number for distribution is. Standard (explained in documentation is 100.
#explainer_xgb = shap.Explainer(rfecv.estimator_, X100) #This utilises the rfe xgboost with 10 features
#shap_values_xgb = explainer_xgb(reduced_features_train_rfecv) #based on training dataset of model, since that is what controls final model architecture
#shap.plots.beeswarm(shap_values_xgb, max_display=10)
#X100 = shap.utils.sample(reduced_features_train, 100, random_state=27) # I am not yet sure what the optimal number for distribution is. Standard (explained in documentation is 100.
#explainer_xgb = shap.Explainer(rfe.estimator_, X100) #This utilises the rfe xgboost with 10 features
#shap_values_xgb = explainer_xgb(reduced_features_train) #based on training dataset of model, since that is what controls final model architecture
#shap.plots.beeswarm(shap_values_xgb, max_display=len(reduced_features_train))
#import random
#random_case = random.randint(0, len(reduced_features_train + 1))
#print("case index: " + str(random_case))

#shap.plots.waterfall(shap_values_xgb[random_case])

XGBoost Models
----

In [None]:
def list_run_combinations(results):
    """Print one line per run: its position, model name and selector name.

    The position is right-aligned to width 2 and the model name is padded to
    width 10 so the columns line up.
    """
    row_template = "{:2d} | {:10s} | {}"
    for position, run in enumerate(results):
        print(row_template.format(position, run.model_name, run.selector_name))

list_run_combinations(results)


general SHAP function (XGBoost)
----

In [None]:
import numpy as np
import shap
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

SEED = 27

def shap_xgb_for_run(
    results,
    X_train,
    y_train,
    model_name: str,
    selector_name: str,
    *,
    background_n: int = 100,
    explain_n: int | None = 300,     # keep this smaller for speed
    max_display: int = 15,
    case_idx: int | None = None,
    seed: int = SEED,
):
    """Refit XGBoost on one run's selected features and draw its SHAP plots.

    The run is looked up in ``results`` by its ``model_name`` /
    ``selector_name`` labels (first match wins). Only the run's
    ``selected_features`` are reused: the model explained is always the
    XGBClassifier fitted inside this function, not the run's own estimator.

    Args:
        results: Iterable of run objects exposing ``model_name``,
            ``selector_name`` and ``selected_features``.
        X_train: Training features, indexed with a list of column names
            (presumably a pandas DataFrame).
        y_train: Training labels passed to ``XGBClassifier.fit``.
        model_name: Label of the run to look up.
        selector_name: Selector label of the run to look up.
        background_n: Maximum number of training rows sampled as the SHAP
            background set.
        explain_n: Maximum number of training rows to explain; ``None``
            explains every row.
        max_display: Maximum number of features shown in the beeswarm plot.
        case_idx: Row (within the explained set) for the waterfall plot.
            ``None`` picks one with a generator seeded by ``seed``.
        seed: Seed for sampling, the XGBoost refit and the default case pick.

    Returns:
        ``(sv1, res)``: the class-1 ``shap.Explanation`` and the matched run.

    Raises:
        ValueError: If no run matches ``model_name`` / ``selector_name``.
        IndexError: If ``case_idx`` is outside the explained set.
    """
    # 1) find the first run carrying both labels
    match = next(
        (
            (i, r)
            for i, r in enumerate(results)
            if r.model_name == model_name and r.selector_name == selector_name
        ),
        None,
    )
    if match is None:
        raise ValueError(f"No run found for: {model_name} | {selector_name}")

    run_idx, res = match
    print("Selected run:", run_idx, "|", res.model_name, "|", res.selector_name)

    # 2) selected feature matrix (a copy, so X_train itself is not modified)
    X_train_sel = X_train[res.selected_features].copy()

    # subset to explain (speed)
    if explain_n is not None and explain_n < X_train_sel.shape[0]:
        X_explain = shap.utils.sample(X_train_sel, explain_n, random_state=seed)
    else:
        X_explain = X_train_sel

    # 3) refit XGBoost on the full selected-feature training matrix
    xgb = XGBClassifier(
        n_estimators=400,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="binary:logistic",
        random_state=seed,
        n_jobs=-1,
        eval_metric="logloss",
    )
    xgb.fit(X_train_sel, y_train)

    # 4) SHAP with the permutation algorithm on predict_proba
    X_bg = shap.utils.sample(
        X_train_sel, min(background_n, X_train_sel.shape[0]), random_state=seed
    )
    explainer = shap.Explainer(
        xgb.predict_proba,
        X_bg,
        algorithm="permutation",
    )
    sv = explainer(X_explain)

    # 5) Reduce to a "class 1" explanation.
    # values may be (n_samples, n_features, n_classes) or already
    # (n_samples, n_features); base values may or may not carry a class axis.
    values = np.asarray(sv.values)
    base_values = getattr(sv, "base_values", None)
    if values.ndim == 3:
        class1_values = values[:, :, 1]
        if base_values is not None and np.ndim(base_values) == 2:
            base_values = np.asarray(base_values)[:, 1]
    else:
        class1_values = values

    sv1 = shap.Explanation(
        values=class1_values,
        base_values=base_values,
        data=sv.data,
        feature_names=sv.feature_names,
    )

    # 6) Beeswarm (class 1); show=False so a title can be added before display
    shap.plots.beeswarm(sv1, max_display=max_display, show=False)
    ax = plt.gca()
    ax.set_title(
        f"{model_name} | {selector_name} \u2014 SHAP beeswarm (global)",
        fontsize=12,
    )
    plt.tight_layout()
    plt.show()

    # 7) Waterfall (class 1) for a single explained row
    n_explained = X_explain.shape[0]
    if case_idx is None:
        case_idx = np.random.RandomState(seed).randint(0, n_explained)
    elif not -n_explained <= case_idx < n_explained:
        raise IndexError(
            f"case_idx {case_idx} is out of range for {n_explained} explained rows"
        )
    print("Waterfall case index (within explained set):", case_idx)
    shap.plots.waterfall(sv1[case_idx])

    return sv1, res


XGBoost + RFECV
--

In [None]:
# SHAP plots for the run labelled model "xgb" / selector "rfecv_rank_top10".
sv_rfecv, res_rfecv = shap_xgb_for_run(
    results, X_train, y_train,
    model_name="xgb",
    selector_name="rfecv_rank_top10",
    explain_n=500  # optional; caps the rows explained (function default is 300; lower is faster)
)


XGBoost + RFE (no CV)
-

In [None]:
# SHAP plots for the run labelled model "xgb" / selector "rfe_no_cv_top10".
sv_rfe, res_rfe = shap_xgb_for_run(
    results, X_train, y_train,
    model_name="xgb",
    selector_name="rfe_no_cv_top10",
    explain_n=500  # optional; caps the number of training rows explained
)


XGBoost + ANOVA-CV
-

In [None]:
# SHAP plots for the run labelled model "xgb" / selector "anova_cv".
sv_anova, res_anova = shap_xgb_for_run(
    results, X_train, y_train,
    model_name="xgb",
    selector_name="anova_cv",
    explain_n=500  # optional; set to 300 if you want faster
)


XGBoost + Embedded Method
-

In [None]:
# SHAP plots for the run labelled model "xgb" / selector "embedded_method".
sv_embedded, res_embedded = shap_xgb_for_run(
    results, X_train, y_train,
    model_name="xgb",
    selector_name="embedded_method",
    explain_n=500  # optional; reduce if it gets slow
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

SEED = 27


# -------- helpers --------

def _extract_scores_from_extra(res):
    """
    Look for test-set scores/probabilities that the pipeline may have stored in res.extra.

    Known key names are tried in a fixed order. The first one whose value is
    either a 1D array (returned as-is) or a 2D array with at least two columns
    (column 1 is returned) wins. Returns None when res.extra is missing, is not
    a dict, or holds no usable entry.
    """
    stash = getattr(res, "extra", None)
    if not isinstance(stash, dict):
        return None

    # key names that pipelines commonly use for stored test scores
    known_keys = (
        "y_proba_test", "y_pred_proba_test", "proba_test", "y_prob_test",
        "y_score_test", "scores_test", "decision_test", "yhat_proba_test",
    )

    for key in known_keys:
        if key not in stash:
            continue
        arr = np.asarray(stash[key])
        if arr.ndim == 1:
            # already a single score per sample
            return arr
        if arr.ndim == 2 and arr.shape[1] >= 2:
            # one column per class: keep the class-1 column
            return arr[:, 1]
        # any other shape is ignored; keep looking under the remaining keys

    return None


def _fit_model_for_run(model_name: str, seed: int = SEED):
    """
    Build (but do not fit) the estimator for one of the model_name labels used in results.

    Supported labels: "logreg", "svm_linear", "random_forest", "xgb" and
    "lasso_logregcv". The linear models are wrapped in a Pipeline that
    standardises the features first. The returned estimator offers
    predict_proba or, for the linear SVM, decision_function.

    Raises ValueError for any other label.
    """

    def scaled(classifier):
        # Put a StandardScaler in front of the given classifier.
        return Pipeline([
            ("scaler", StandardScaler()),
            ("clf", classifier),
        ])

    # One zero-argument builder per label; only the requested one is invoked.
    builders = {
        "logreg": lambda: scaled(
            LogisticRegression(max_iter=5000, solver="liblinear", random_state=seed)
        ),
        # LinearSVC has no predict_proba, but its decision_function works for ROC.
        "svm_linear": lambda: scaled(LinearSVC(random_state=seed)),
        "random_forest": lambda: RandomForestClassifier(
            n_estimators=500,
            random_state=seed,
            n_jobs=-1,
        ),
        # Same hyper-parameters as the XGBoost refit used for the SHAP plots.
        "xgb": lambda: XGBClassifier(
            n_estimators=400,
            max_depth=3,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            objective="binary:logistic",
            random_state=seed,
            n_jobs=-1,
            eval_metric="logloss",
        ),
        "lasso_logregcv": lambda: scaled(
            LogisticRegressionCV(
                penalty="l1",
                solver="saga",
                max_iter=10000,
                cv=5,
                random_state=seed,
                n_jobs=-1,
            )
        ),
    }

    build = builders.get(model_name)
    if build is None:
        raise ValueError(f"Unknown model_name: {model_name}")
    return build()


def _get_test_scores(res, X_train, y_train, X_test, y_test, seed: int = SEED):
    """
    Returns (y_score_test, X_test_sel) where y_score_test is 1D numeric scores for ROC.

    Scores stored in res.extra are preferred, but only when there is exactly
    one score per entry of y_test; in that case X_test_sel is returned as None.
    Otherwise the model named by res.model_name is rebuilt, fitted on the run's
    selected features, and scored on the matching columns of X_test.

    Raises RuntimeError if the refit estimator exposes neither predict_proba
    nor decision_function.
    """
    # 1) Try to use stored scores, provided they line up with y_test.
    stored = _extract_scores_from_extra(res)
    if stored is not None:
        if y_test is None or len(stored) == len(y_test):
            return stored, None
        # Scores of another length cannot belong to this test set and would
        # make roc_curve fail on a sample-count mismatch, so refit instead.
        print(
            f"Stored scores for {res.model_name} | {res.selector_name} have "
            f"{len(stored)} entries but y_test has {len(y_test)}; refitting instead."
        )

    # 2) Otherwise refit on the run's selected features.
    X_train_sel = X_train[res.selected_features].copy()
    X_test_sel = X_test[res.selected_features].copy()

    est = _fit_model_for_run(res.model_name, seed=seed)
    est.fit(X_train_sel, y_train)

    # prefer predict_proba if available; otherwise use decision_function
    if hasattr(est, "predict_proba"):
        y_score = est.predict_proba(X_test_sel)[:, 1]  # class-1 column
    elif hasattr(est, "decision_function"):
        y_score = est.decision_function(X_test_sel)
    else:
        raise RuntimeError(f"Model {res.model_name} has neither predict_proba nor decision_function")

    return np.asarray(y_score), X_test_sel


# -------- main plotting --------

def plot_roc_for_all_runs(results, X_train, y_train, X_test, y_test, seed: int = SEED):
    """
    Draws a separate ROC figure for every (model, selector) run in results.

    Each figure shows the run's ROC curve on the test set with its AUC in the
    legend, plus the dashed diagonal as a reference line.
    """
    for run_no, run in enumerate(results):
        scores, _unused = _get_test_scores(run, X_train, y_train, X_test, y_test, seed=seed)

        false_pos, true_pos, _thresholds = roc_curve(y_test, scores)
        area = auc(false_pos, true_pos)

        fig, ax = plt.subplots()
        ax.plot(false_pos, true_pos, label=f"AUC = {area:.3f}")
        ax.plot([0, 1], [0, 1], linestyle="--")  # reference diagonal
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title(f"ROC: {run.model_name} | {run.selector_name} (run {run_no})")
        ax.legend(loc="lower right")
        fig.tight_layout()
        plt.show()


# Run this:
plot_roc_for_all_runs(results, X_train, y_train, X_test, y_test)
