# Model experiments

This notebook is for experimenting with the choice of model and performance variations between different time periods and clinics.

1. Train and evaluate current best model on total dataset (without differentiating between clinics)
2. Apply internal-external cross-validation (IECV) to different models and check heterogeneity with respect to clinics
3. Train models per clinic and check performance
4. Use clinic as a categorical feature in tree-based models and check performance

## Load packages

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the project's `noshow` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    GroupKFold,
    StratifiedGroupKFold,
    TimeSeriesSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, SplineTransformer

from noshow.features.feature_pipeline import create_features
from noshow.preprocessing.load_data import (
    load_appointment_csv,
    process_appointments,
    process_postal_codes,
)

## Load data and split into train and test

In [None]:
# Pre-computed feature table. Its index is expected to carry the "pseudo_id"
# and "start" levels, which later cells use for grouping and time-ordering.
featuretable = pd.read_parquet(
    "/mapr/no_show/no_show_onderzoeker/rpeters7/No_Show/data/processed/featuretable.parquet"
)

# Encode the target as integers. The explicit cast keeps the dtype numeric
# even when `replace` leaves an object column behind, and mirrors the encoding
# used for `appointments_features` in the IECV section.
featuretable["no_show"] = (
    featuretable["no_show"].replace({"no_show": 1, "show": 0}).astype(int)
)
featuretable["hour"] = featuretable["hour"].astype("category")
featuretable["weekday"] = featuretable["weekday"].astype("category")

print(featuretable.dtypes)

X, y = featuretable.drop(columns="no_show"), featuretable["no_show"]

# shuffle=False keeps the stored row order: the last 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=False
)
# Group labels are taken from the FULL table on purpose: the cross-validation
# cells pass X and y (not X_train), so the groups must line up with X.
train_groups = X.index.get_level_values("pseudo_id")

In [None]:
# Columns handled as categorical by both models; every other column of X is
# treated as continuous.
categorical_features = ["hour", "weekday"]
continuous_features = X.columns.difference(categorical_features)

# Gradient-boosted trees with the categorical columns declared explicitly.
lgboost_model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=300,
    categorical_features=categorical_features,
)

# Logistic regression needs explicit preprocessing: continuous columns are
# scaled and expanded with splines, categorical columns are one-hot encoded.
spline_pipeline = Pipeline([("scaler", RobustScaler()), ("spline", SplineTransformer())])
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("continuous", spline_pipeline, continuous_features),
        ("categorical", one_hot_encoder, categorical_features),
    ]
)

log_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(penalty=None)),
    ]
)

## Train and evaluate current best model on total dataset (without differentiating between clinics)

In [None]:
def cv_auc_curve(X_train, y_train, model, cv, train_groups=None, title=None):
    """Cross-validate `model` and draw one ROC curve per fold.

    For every split produced by `cv.split(X_train, y_train, train_groups)` the
    model is refitted on the training rows and scored on the held-out rows via
    `predict_proba`. All fold curves share one axes; the legend reports the
    mean and standard deviation of the fold AUCs.

    Parameters
    ----------
    X_train, y_train
        Features and target; rows are selected positionally with `.iloc`.
    model
        Estimator exposing `fit` and `predict_proba`. It is refitted in place
        on every fold.
    cv
        Splitter exposing `split(X, y, groups)`.
    train_groups
        Optional group labels forwarded to `cv.split`.
    title
        Optional axes title.

    Returns
    -------
    tuple of (dict, dict)
        Fold number -> AUC, and fold number -> positional test indices.
    """
    roc_auc = {}
    test_indices = {}

    fig, ax = plt.subplots()

    splits = cv.split(X_train, y_train, train_groups)
    for fold, (fit_rows, eval_rows) in enumerate(splits):
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        # Score the held-out rows with the probability of the positive class.
        y_true = y_train.iloc[eval_rows]
        y_score = model.predict_proba(X_train.iloc[eval_rows])[:, 1]

        false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
        roc_auc[fold] = roc_auc_score(y_true, y_score)
        test_indices[fold] = eval_rows

        ax.plot(false_pos_rate, true_pos_rate, c="b", alpha=0.15)

    # Chance-level diagonal for reference.
    ax.plot([0, 1], [0, 1], "k--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")

    # Summarise the fold AUCs in the legend.
    fold_aucs = list(roc_auc.values())
    mean_auc, std_auc = np.mean(fold_aucs), np.std(fold_aucs)
    ax.legend([f"ROC curve (AUC = {mean_auc:.3f} +/- {std_auc:.3f})"])
    if title:
        ax.set_title(title)
    fig.show()

    return roc_auc, test_indices

In [None]:
# Gradient boosting on the full table: 5 stratified folds, grouped by
# pseudo_id. The returned dictionaries are discarded; only the plot is of
# interest.
_, _ = cv_auc_curve(
    X_train=X,
    y_train=y,
    model=lgboost_model,
    cv=StratifiedGroupKFold(n_splits=5),
    train_groups=train_groups,
)

In [None]:
# Same grouped 5-fold evaluation, now for the logistic regression pipeline.
_, _ = cv_auc_curve(
    X_train=X,
    y_train=y,
    model=log_model,
    cv=StratifiedGroupKFold(n_splits=5),
    train_groups=train_groups,
)

## Check temporal performance

In [None]:
# Order features and target by the "start" index level so that positional
# splits follow time.
X_timesorted, y_timesorted = (
    frame.sort_index(level="start") for frame in (X, y)
)

In [None]:
# Expanding-window evaluation over time; no groups are passed here.
roc_auc, test_indices = cv_auc_curve(
    X_train=X_timesorted,
    y_train=y_timesorted,
    model=lgboost_model,
    cv=TimeSeriesSplit(n_splits=10),
)

In [None]:
# Label every test fold with the earliest and latest "start" date it covers.
# The fold indices are positions in X_timesorted (the frame that was split),
# so the dates must be looked up there and not in the unsorted X.
start_times = X_timesorted.index.get_level_values("start")
fold_times = [
    str(
        (
            start_times[idx].min().strftime("%Y-%m-%d"),
            start_times[idx].max().strftime("%Y-%m-%d"),
        )
    )
    for idx in test_indices.values()
]
fold_times

In [None]:
# One bar per test fold, labelled with the fold's date range.
roc_scores = pd.Series(list(roc_auc.values()), index=fold_times)
roc_scores.plot.bar()

## Apply IECV

In [None]:
# Load the raw appointment CSV and pass it through process_appointments.
appointments_df = process_appointments(
    load_appointment_csv("../data/raw/poliafspraken_no_show.csv")
)
all_postalcodes = process_postal_codes("../data/raw/NL.txt")

# Build the feature table from the appointments and the postal code data.
appointments_features = create_features(
    appointments_df,
    all_postalcodes,
    minutes_early_cutoff=30,
)

In [None]:
# Columns kept for modelling: the "hoofdagenda" label (used as the clinic
# identifier in the sections below), the target and the model features.
selected_columns = [
    "hoofdagenda",
    "hour",
    "weekday",
    "minutesDuration",
    "no_show",
    "prev_no_show",
    "prev_no_show_perc",
    "age",
    "dist_umcu",
    "prev_minutes_early",
    "earlier_appointments",
    "appointments_same_day",
    "appointments_last_days",
    "days_since_created",
    "days_since_last_appointment",
]

# Move "hoofdagenda" into the index next to pseudo_id and start, so it is
# available for splitting but is not a model feature.
appointments_features = (
    appointments_features.loc[:, selected_columns]
    .reset_index()
    .set_index(["pseudo_id", "start", "hoofdagenda"])
)

In [None]:
# Encode the target as 0/1 integers.
appointments_features["no_show"] = (
    appointments_features["no_show"].replace({"no_show": 1, "show": 0}).astype(int)
)

# hour and weekday are declared as categorical features for the tree model,
# so they get the pandas category dtype.
for column in ("hour", "weekday"):
    appointments_features[column] = appointments_features[column].astype("category")

X = appointments_features.drop(columns="no_show")
y = appointments_features["no_show"]

In [None]:
# Group labels for the rebuilt X, so that rows sharing a pseudo_id stay in
# the same fold.
train_groups = X.index.get_level_values("pseudo_id")

cv_auc_curve(
    X_train=X,
    y_train=y,
    model=lgboost_model,
    cv=StratifiedGroupKFold(n_splits=5),
    train_groups=train_groups,
)

In [None]:
def group_leave_one_out(df):
    """Yield leave-one-group-out splits over the "hoofdagenda" index level.

    Each distinct "hoofdagenda" value is held out once, in order of first
    appearance.

    Parameters
    ----------
    df
        Frame whose index has a level named "hoofdagenda".

    Yields
    ------
    tuple
        `(held_out_value, (train_positions, test_positions))`, where both
        position arrays are integer row positions suitable for `.iloc`.
    """
    # The level values do not change between iterations, so look them up once.
    agenda_labels = df.index.get_level_values("hoofdagenda")

    for test_group in agenda_labels.unique():
        train_positions = np.flatnonzero(agenda_labels != test_group)
        test_positions = np.flatnonzero(agenda_labels == test_group)

        yield test_group, (train_positions, test_positions)

In [None]:
def iecv_auc_curve(X_train, y_train, model):
    """Leave-one-group-out evaluation with one ROC curve per held-out group.

    The splits come from `group_leave_one_out(X_train)`, i.e. each distinct
    "hoofdagenda" value is held out once. The model is refitted on the
    remaining rows and scored on the held-out rows via `predict_proba`. The
    mean and standard deviation of the per-group AUCs are printed.

    Parameters
    ----------
    X_train, y_train
        Features and target; rows are selected positionally with `.iloc`.
        The index of `X_train` must have a "hoofdagenda" level.
    model
        Estimator exposing `fit` and `predict_proba`. It is refitted in place
        for every held-out group.

    Returns
    -------
    tuple of (dict, dict)
        Held-out group -> AUC, and held-out group -> positional test indices.
    """
    roc_auc = {}
    test_indices = {}

    fig, ax = plt.subplots()

    for group, (train_idx, test_idx) in group_leave_one_out(X_train):
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])

        # Score the held-out group with the probability of the positive class.
        y_test_cv = y_train.iloc[test_idx]
        y_score = model.predict_proba(X_train.iloc[test_idx])[:, 1]

        fpr, tpr, _ = roc_curve(y_test_cv, y_score)
        roc_auc[group] = roc_auc_score(y_test_cv, y_score)
        test_indices[group] = test_idx

        ax.plot(fpr, tpr, label=group)

    auc_values = list(roc_auc.values())
    # The space before the parenthesis keeps the mean and the spread from
    # running together in the printed summary.
    print(f"Mean AUC: {np.mean(auc_values)} (+/- {np.std(auc_values)})")

    # Chance-level diagonal for reference.
    ax.plot([0, 1], [0, 1], "k--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()
    fig.show()

    return roc_auc, test_indices

In [None]:
# IECV for the gradient boosting model; the cell output shows the returned
# per-group AUCs and test indices.
iecv_auc_curve(X_train=X, y_train=y, model=lgboost_model)

In [None]:
# IECV for the logistic regression pipeline.
iecv_auc_curve(X_train=X, y_train=y, model=log_model)

## CV per clinic (poli)

In [None]:
# Rebuild X and y from the appointment feature table.
X = appointments_features.drop(columns="no_show")
y = appointments_features["no_show"]

# Unshuffled 80/20 hold-out split: the last 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
    random_state=0,
)
# Group labels cover all of X, since the cells below cross-validate on X.
train_groups = X.index.get_level_values("pseudo_id")

In [None]:
# Display the pseudo_id group labels that are passed to the grouped CV splitters.
train_groups

In [None]:
# Run the grouped 5-fold evaluation separately for every "hoofdagenda" value,
# producing one ROC figure per value.
for poli in X.index.get_level_values("hoofdagenda").unique():
    # Rows of this hoofdagenda, for any pseudo_id and any start.
    poli_rows = pd.IndexSlice[:, :, poli]
    X_tmp = X.loc[poli_rows, :]
    y_tmp = y.loc[poli_rows]
    train_groups_tmp = X_tmp.index.get_level_values("pseudo_id")

    # Alternative that was tried here instead of lgboost_model:
    # HistGradientBoostingClassifier(learning_rate=0.05, max_iter=300)
    _, _ = cv_auc_curve(
        X_train=X_tmp,
        y_train=y_tmp,
        model=lgboost_model,
        cv=StratifiedGroupKFold(n_splits=5),
        train_groups=train_groups_tmp,
        title=poli,
    )

## Adding the clinic (poli) as a feature

In [None]:
# Copy the "hoofdagenda" label from the index into a categorical feature
# column, while keeping the original three-level index.
appointments_features_agenda = (
    appointments_features.reset_index()
    .assign(hoofdagenda_cat=lambda df: df["hoofdagenda"].astype("category"))
    .set_index(["pseudo_id", "start", "hoofdagenda"])
)
X, y = (
    appointments_features_agenda.drop(columns="no_show"),
    appointments_features_agenda["no_show"],
)

# hour and weekday stay declared as categorical, exactly as in lgboost_model,
# so that a change in AUC reflects the added hoofdagenda feature only and not
# a different treatment of the existing columns.
model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=300,
    categorical_features=["hour", "weekday", "hoofdagenda_cat"],
)
train_groups = X.index.get_level_values("pseudo_id")

In [None]:
# Display the column dtypes, e.g. to confirm which columns carry the category dtype.
X.dtypes

In [None]:
# Grouped 5-fold evaluation of the model that includes hoofdagenda_cat.
_, _ = cv_auc_curve(
    X_train=X,
    y_train=y,
    model=model,
    cv=StratifiedGroupKFold(n_splits=5),
    train_groups=train_groups,
)