In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    balanced_accuracy_score, brier_score_loss, classification_report, confusion_matrix
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# XGBoost is an optional dependency: record whether the import succeeded so the
# pipeline can skip that model instead of failing. Any exception (not only
# ImportError) is treated as "not available".
try:
    from xgboost import XGBClassifier
    xgb_available = True
except Exception:
    xgb_available = False

# Seed handed to the estimators and to permutation importance as random_state.
RND = 1706
# Shared NumPy generator used by the bootstrap resampling.
# NOTE(review): seeded with 42 rather than RND — confirm the two seeds are meant to differ.
rng = np.random.default_rng(42)

def bootstrap_ci_classification(y_true, y_pred, y_proba, metric="f1_macro", n_boot=2000, stratified=True,
                                random_state=None):
    """Percentile-bootstrap mean and 95% interval for a binary-classification metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Stratified resampling draws from the rows equal to
        1 and the rows equal to 0; rows carrying any other label are never drawn
        in that mode.
    y_pred : array-like
        Hard predictions aligned with ``y_true`` (scored for ``"f1_macro"``).
    y_proba : array-like or None
        Positive-class scores aligned with ``y_true``. Only read when
        ``metric="auc"``.
    metric : {"f1_macro", "auc"}
        Metric evaluated on every bootstrap replicate.
    n_boot : int
        Number of bootstrap replicates.
    stratified : bool
        If True, resample the 1-rows and the 0-rows separately so each replicate
        keeps the original class counts; otherwise resample all rows jointly.
    random_state : int, numpy.random.Generator or None
        ``None`` (default) keeps using the module-level ``rng`` generator, so
        results depend on how many draws were made before this call. Pass a seed
        or a Generator to make a single call reproducible on its own.

    Returns
    -------
    tuple of float
        ``(mean, lower, upper)`` where lower/upper are the 2.5th and 97.5th
        percentiles of the replicate scores. ``(nan, nan, nan)`` when no
        replicate could be scored (empty input, ``n_boot <= 0``, or every AUC
        replicate contained a single class).

    Raises
    ------
    ValueError
        If ``metric`` is not supported or the inputs are not aligned in length.
    """
    # Validate once, up front, instead of on every replicate.
    if metric not in ("f1_macro", "auc"):
        raise ValueError("metric must be 'f1_macro' or 'auc'")

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if len(y_pred) != n_samples:
        raise ValueError("y_true and y_pred must have the same length")
    if metric == "auc":
        y_proba = np.asarray(y_proba)
        if y_proba.ndim == 0 or len(y_proba) != n_samples:
            raise ValueError("y_proba must have the same length as y_true for metric='auc'")

    nan_result = (float("nan"), float("nan"), float("nan"))
    if n_samples == 0 or n_boot <= 0:
        return nan_result

    if random_state is None:
        generator = rng  # shared module-level generator (original behaviour)
    elif isinstance(random_state, np.random.Generator):
        generator = random_state
    else:
        generator = np.random.default_rng(random_state)

    idx_pos = np.where(y_true == 1)[0]
    idx_neg = np.where(y_true == 0)[0]
    scores = []
    for _ in range(n_boot):
        if stratified:
            # Positives are drawn before negatives; keeping this order keeps the
            # random stream identical to the previous implementation.
            samp_pos = generator.choice(idx_pos, size=len(idx_pos), replace=True)
            samp_neg = generator.choice(idx_neg, size=len(idx_neg), replace=True)
            idx = np.concatenate([samp_pos, samp_neg])
        else:
            idx = generator.choice(n_samples, size=n_samples, replace=True)
        yt = y_true[idx]
        if metric == "f1_macro":
            scores.append(f1_score(yt, y_pred[idx], average="macro"))
        else:
            # ROC AUC is undefined for a single-class replicate; skip it.
            if len(np.unique(yt)) < 2:
                continue
            scores.append(roc_auc_score(yt, y_proba[idx]))

    if not scores:
        return nan_result
    lo, hi = np.percentile(scores, [2.5, 97.5])
    return float(np.mean(scores)), float(lo), float(hi)

def icc2_1_with_ci(y_probs1, y_probs2, n_boot=2000):
    """Agreement between two probability vectors as ICC(2,1) with its 95% interval.

    The two vectors are treated as two "raters" scoring the same subjects
    (element ``i`` of both vectors belongs to subject ``i``). The estimate and
    interval are read from the ``ICC2`` row of ``pingouin.intraclass_corr``.

    Parameters
    ----------
    y_probs1, y_probs2 : array-like
        Predicted probabilities from two models for the same samples, in the
        same order.
    n_boot : int
        Accepted for interface compatibility; not used by this implementation
        (the interval comes from pingouin, not from a bootstrap).

    Returns
    -------
    tuple of float
        ``(icc, lower, upper)``. ``(nan, nan, nan)`` — with a printed warning —
        when an input is missing, empty, of mismatched length or constant, or
        when pingouin is unavailable or fails.
    """
    nan_result = (np.nan, np.nan, np.nan)

    if y_probs1 is None or y_probs2 is None:
        print("Warning: ICC(2,1) needs two probability vectors; got None.")
        return nan_result

    probs1 = np.asarray(y_probs1, dtype=float).ravel()
    probs2 = np.asarray(y_probs2, dtype=float).ravel()
    n = probs1.size
    if n == 0 or probs2.size != n:
        print("Warning: ICC(2,1) needs two non-empty probability vectors of equal length.")
        return nan_result

    if np.allclose(probs1, probs1[0]) or np.allclose(probs2, probs2[0]):
        print("Warning: One of the probability vectors is constant. ICC(2,1) is undefined (NaN).")
        return nan_result

    try:
        import pingouin as pg

        # Long format: all ratings of rater 0, then all ratings of rater 1.
        # subject_id / method must follow that same layout (tile / repeat);
        # otherwise ratings get attached to the wrong subject and rater.
        df_long = pd.DataFrame({
            "subject_id": np.tile(np.arange(n), 2),
            "method": np.repeat([0, 1], n),
            "value": np.concatenate([probs1, probs2]),
        })
        icc_tbl = pg.intraclass_corr(data=df_long, targets="subject_id", raters="method", ratings="value")
        row = icc_tbl.loc[icc_tbl["Type"] == "ICC2"]
        icc = float(row["ICC"].values[0])
        # Locate the interval column by prefix so a differently suffixed label is still found.
        ci_col = next(c for c in icc_tbl.columns if str(c).startswith("CI95"))
        ci = row[ci_col].values[0]
        if isinstance(ci, str):
            lo, hi = [float(x) for x in ci.replace("[", "").replace("]", "").split(",")]
        else:
            lo, hi = float(ci[0]), float(ci[1])
        return icc, lo, hi
    except Exception as exc:
        # Best-effort metric: report why it is missing instead of failing silently.
        print(f"Warning: ICC(2,1) could not be computed ({type(exc).__name__}: {exc}).")
        return nan_result

def run_baseline_classification_pipeline(
    dataset_path,
    target_col="Depression_label",
    drop_cols=None,
    train_value="train",
    test_value="test"
):
    """Fit and evaluate baseline tree classifiers on a CSV with a predefined train/test split.

    The CSV is read with pandas and must contain a ``split`` column plus the
    target column. Rows are assigned to train/test by comparing the lower-cased,
    stripped ``split`` value with ``train_value`` / ``test_value`` (normalised
    the same way). A decision tree, a random forest and — when the import
    succeeded — an XGBoost classifier are fitted behind a shared preprocessing
    step (most-frequent imputation + one-hot encoding for object/category
    columns, median imputation for the rest). For every model the function
    prints metrics, a classification report and a confusion matrix, and shows
    importance plots.

    Parameters
    ----------
    dataset_path : str or path-like
        Location of the CSV file.
    target_col : str
        Name of the label column; values are cast to ``int``.
    drop_cols : sequence of str or None
        Columns excluded from the features (missing ones are ignored). ``None``
        selects the default list defined in the body.
    train_value, test_value : str
        Values of the ``split`` column that mark training / test rows.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated model with point metrics, bootstrap intervals for
        macro-F1 and ROC AUC, and ICC(2,1) agreement of predicted probabilities
        with a reference model.

    Raises
    ------
    ValueError
        If no rows match ``train_value`` or ``test_value``.
    """
    if drop_cols is None:
        drop_cols = ["PTSD_severity", "PTSD_label", "split", "Participant", "semantic_perplexity"]
    # list() so tuples (or other sequences) are accepted as well.
    excluded_cols = list(drop_cols) + [target_col]

    df = pd.read_csv(dataset_path)
    split_lower = df["split"].astype(str).str.lower().str.strip()
    # Normalise the requested values like the column so "Train" / " test " still match.
    is_train = split_lower == str(train_value).lower().strip()
    is_test = split_lower == str(test_value).lower().strip()
    if not is_train.any():
        raise ValueError(f"No rows with split == {train_value!r} in {dataset_path}")
    if not is_test.any():
        raise ValueError(f"No rows with split == {test_value!r} in {dataset_path}")

    X_train = df.loc[is_train].drop(columns=excluded_cols, errors="ignore")
    y_train = df.loc[is_train, target_col].astype(int)
    X_test = df.loc[is_test].drop(columns=excluded_cols, errors="ignore")
    y_test = df.loc[is_test, target_col].astype(int)

    categorical_cols = [c for c in X_train.columns if X_train[c].dtype.name in ["object", "category"]]
    numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

    cat_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("enc", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    num_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median"))
    ])
    pre = ColumnTransformer([
        ("cat", cat_pipe, categorical_cols),
        ("num", num_pipe, numeric_cols),
    ], remainder="drop")

    def metrics_from_preds(y_true, y_pred, y_proba=None, y_proba_other=None):
        """Collect point metrics, bootstrap intervals and ICC into one dict.

        Probability-based metrics and the bootstrap intervals are only filled
        when ``y_proba`` is given and ``y_true`` holds exactly two classes; the
        ICC entries are only filled when both probability vectors are given.
        Anything that cannot be computed stays NaN and a warning is printed.
        """
        out = {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1_macro": f1_score(y_true, y_pred, average="macro"),
            "balanced_acc": balanced_accuracy_score(y_true, y_pred),
            "roc_auc": np.nan,
            "pr_auc": np.nan,
            "brier": np.nan,
            "f1_lo": np.nan,
            "f1_hi": np.nan,
            "auc_lo": np.nan,
            "auc_hi": np.nan,
            "icc2_1": np.nan,
            "icc_lo": np.nan,
            "icc_hi": np.nan,
        }
        if y_proba is not None and len(np.unique(y_true)) == 2:
            proba_scorers = (
                ("roc_auc", roc_auc_score),
                ("pr_auc", average_precision_score),
                ("brier", brier_score_loss),
            )
            for key, scorer in proba_scorers:
                try:
                    out[key] = scorer(y_true, y_proba)
                except Exception as exc:
                    print(f"Warning: {key} could not be computed: {exc}")
            # The two intervals are computed independently so that a failure of
            # one does not discard the other.
            for boot_metric, lo_key, hi_key in (("f1_macro", "f1_lo", "f1_hi"), ("auc", "auc_lo", "auc_hi")):
                try:
                    _, lo, hi = bootstrap_ci_classification(y_true, y_pred, y_proba, metric=boot_metric)
                    out[lo_key], out[hi_key] = lo, hi
                except Exception as exc:
                    print(f"Warning: bootstrap CI for {boot_metric} could not be computed: {exc}")
        if y_proba is not None and y_proba_other is not None:
            icc, lo, hi = icc2_1_with_ci(y_proba, y_proba_other)
            out["icc2_1"], out["icc_lo"], out["icc_hi"] = icc, lo, hi
        return out

    def plot_top_importances(table, value_col, title, xlabel, top_k=15):
        """Horizontal bar chart of the first ``top_k`` rows of an importance table sorted descending."""
        top = table.head(min(top_k, table.shape[0])).iloc[::-1]  # reversed: largest bar on top
        plt.figure(figsize=(8, max(5, 0.3 * top.shape[0])))
        plt.barh(top["feature"], top[value_col])
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("Feature")
        plt.tight_layout()
        plt.show()

    def explain_model(model, model_name, X_test, y_test, feature_names):
        """Print and plot impurity-based and permutation importances of a fitted pipeline.

        ``feature_names`` labels the columns produced by the preprocessing step
        and is used for the impurity importances of the final estimator.
        Permutation importances are computed on the whole pipeline, so they are
        labelled with the raw ``X_test`` columns.
        """
        clf = model.named_steps["clf"]
        if hasattr(clf, "feature_importances_"):
            imp = np.asarray(clf.feature_importances_)
            names = list(feature_names)
            if len(names) != len(imp):
                # Possible when the name lookup fell back to raw column names
                # while the preprocessor emitted a different number of columns.
                print(f"[{model_name}] Warning: {len(names)} feature names for {len(imp)} importances; "
                      "using positional names.")
                names = [f"feature_{i}" for i in range(len(imp))]
            imp_df = pd.DataFrame({"feature": names, "importance": imp}) \
                .sort_values("importance", ascending=False).reset_index(drop=True)
            print(f"\n[{model_name}] Top-10 impurity importances:")
            print(imp_df.head(10))
            plot_top_importances(imp_df, "importance",
                                 f"Impurity Importances (top-15) — {model_name}", "Importance")
        try:
            # Score by ROC AUC when the pipeline can output probabilities,
            # otherwise fall back to balanced accuracy.
            try:
                model.predict_proba(X_test)
                scoring = "roc_auc"
            except Exception:
                scoring = "balanced_accuracy"
            perm = permutation_importance(model, X_test, y_test,
                                          n_repeats=10, random_state=RND, scoring=scoring)
            # The pipeline receives the raw frame, so columns are permuted (and
            # importances reported) per raw input column — not per one-hot column.
            perm_df = pd.DataFrame({
                "feature": list(X_test.columns),
                "importance_mean": perm.importances_mean,
                "importance_std": perm.importances_std
            }).sort_values("importance_mean", ascending=False).reset_index(drop=True)
            print(f"\n[{model_name}] Top-10 permutation importances:")
            print(perm_df.head(10))
            plot_top_importances(perm_df, "importance_mean",
                                 f"Permutation Importances (top-15) — {model_name}", "Mean importance (test)")
        except Exception as e:
            print(f"[{model_name}] Permutation importance failed: {e}")

    def fit_eval(model_name, estimator, y_proba_other=None):
        """Fit preprocessing + estimator on the training rows, report on the test rows.

        Returns the metric dict extended with the model name and the test-set
        probabilities under ``"y_proba"`` (``None`` if the model offers neither
        ``predict_proba`` nor ``decision_function``).
        """
        pipe = Pipeline([("pre", pre), ("clf", estimator)])
        pipe.fit(X_train, y_train)

        # Names of the transformed columns: one-hot names first, then numeric
        # columns, mirroring the ColumnTransformer order.
        try:
            encoder = pipe.named_steps["pre"].named_transformers_["cat"].named_steps["enc"]
            cat_names = encoder.get_feature_names_out(categorical_cols)
            feature_names = list(cat_names) + numeric_cols
        except Exception:
            feature_names = categorical_cols + numeric_cols

        y_pred = pipe.predict(X_test)
        try:
            y_proba = pipe.predict_proba(X_test)[:, 1]
        except Exception:
            try:
                from scipy.special import expit
                # Squash decision scores into (0, 1) as a probability stand-in.
                y_proba = expit(pipe.decision_function(X_test))
            except Exception:
                y_proba = None

        uniq, counts = np.unique(y_proba, return_counts=True) if y_proba is not None else ([], [])
        print(f"[{model_name}] unique predicted probabilities: {dict(zip(uniq, counts))}")

        mets = metrics_from_preds(y_test, y_pred, y_proba, y_proba_other)
        print(f"\n=== {model_name} ===")
        print("Metrics:",
              {k: round(v, 4) if isinstance(v, (int, float)) and not np.isnan(v) else v for k, v in mets.items()})
        print("\nClassification report:\n", classification_report(y_test, y_pred, digits=3))
        print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
        explain_model(pipe, model_name, X_test, y_test, feature_names)
        return {"model": model_name, **mets, "y_proba": y_proba}

    # Decision Tree
    dt = DecisionTreeClassifier(
        random_state=RND,
        class_weight="balanced",
        max_depth=10,
        min_samples_leaf=4,
    )
    res_dt = fit_eval("DecisionTree", dt)

    # Random Forest (ICC vs DT)
    rf = RandomForestClassifier(
        n_estimators=300,
        random_state=RND,
        max_depth=10,
        min_samples_leaf=4,
        class_weight="balanced",
        n_jobs=1
    )
    res_rf = fit_eval("RandomForest", rf, y_proba_other=res_dt["y_proba"])

    # Summary rows in reporting order: DT, RF, then the optional XGBoost rows.
    results = [res_dt, res_rf]

    # XGBoost (ICC vs RF)
    if xgb_available:
        pos = int((y_train == 1).sum())
        neg = int((y_train == 0).sum())
        # Negative/positive ratio as class weight; 1.0 when there are no positives.
        spw = float(neg / max(pos, 1)) if pos > 0 else 1.0

        xgb = XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=RND,
            n_estimators=300,
            max_depth=10,
            scale_pos_weight=spw
        )
        res_xgb = fit_eval("XGBoost", xgb, y_proba_other=res_rf["y_proba"])
        results.append(res_xgb)
        # Additionally: DT vs XGB. This refits the decision tree and repeats its
        # report so that its ICC can be computed against the XGBoost probabilities.
        res_dt_xgb = fit_eval("DecisionTree vs XGBoost", dt, y_proba_other=res_xgb["y_proba"])
        results.append(res_dt_xgb)
    else:
        print("⚠️ XGBoost is not installed — skipping this model.")

    res_df = pd.DataFrame(results)[[
        "model", "accuracy", "f1_macro", "f1_lo", "f1_hi",
        "balanced_acc", "roc_auc", "auc_lo", "auc_hi",
        "pr_auc", "brier", "icc2_1", "icc_lo", "icc_hi"
    ]]
    print("\n=== Summary metrics on TEST ===")
    print(res_df.round(4).to_string(index=False))
    return res_df

In [None]:
# Baseline run on full_dataset_eng_gemma.csv.
# NOTE(review): absolute, machine-specific path; `summary` is reassigned by the
# next run cell before it is displayed.
summary = run_baseline_classification_pipeline(
    dataset_path="/Users/pelmeshek1706/Desktop/projects/airest_notebooks/data/full_dataset_eng_gemma.csv",
    target_col="Depression_label",
)

In [None]:
# Baseline run on full_dataset_eng.csv.
# NOTE(review): absolute, machine-specific path.
summary = run_baseline_classification_pipeline(
    dataset_path="/Users/pelmeshek1706/Desktop/projects/airest_notebooks/data/full_dataset_eng.csv",
    target_col="Depression_label",
)

In [None]:
# Display the metrics table currently bound to `summary`.
summary

In [None]:
# Baseline run on full_dataset_ukr_gemma.csv.
# NOTE(review): absolute, machine-specific path.
summary = run_baseline_classification_pipeline(
    dataset_path="/Users/pelmeshek1706/Desktop/projects/airest_notebooks/data/full_dataset_ukr_gemma.csv",
    target_col="Depression_label",
)

In [None]:
# Display the metrics table currently bound to `summary`.
summary

In [None]:
# Baseline run on full_dataset_ukr_bert.csv.
# NOTE(review): absolute, machine-specific path.
summary = run_baseline_classification_pipeline(
    dataset_path="/Users/pelmeshek1706/Desktop/projects/airest_notebooks/data/full_dataset_ukr_bert.csv",
    target_col="Depression_label",
)

In [None]:
# Display the metrics table currently bound to `summary`.
summary