# Model strategy

Notebook to compare model strategies: whether or not to include the clinic as a feature in the model, and the effect this has on the predictions.

## Load packages

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project code are picked up without a restart
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingClassifier

from noshow.features.feature_pipeline import create_features
from noshow.preprocessing.load_data import (
    load_appointment_csv,
    process_appointments,
    process_postal_codes,
)

# Opt in to pandas' copy-on-write mode for every DataFrame used in this notebook
pd.options.mode.copy_on_write = True

## Load data

In [None]:
# Read and clean the raw appointment export, then load the postal-code table
appointments_df = process_appointments(
    load_appointment_csv("../data/raw/poliafspraken_no_show.csv")
)
all_postalcodes = process_postal_codes("../data/raw/NL.txt")

# Build the full feature table from the appointments and postal codes
featuretable = create_features(
    appointments_df, all_postalcodes, minutes_early_cutoff=30
)

# Columns kept for modelling: the clinic ("hoofdagenda"), the target ("no_show")
# and the predictors
model_columns = [
    "hoofdagenda",
    "hour",
    "weekday",
    "minutesDuration",
    "no_show",
    "prev_no_show",
    "prev_no_show_perc",
    "age",
    "dist_umcu",
    "prev_minutes_early",
    "earlier_appointments",
    "appointments_same_day",
    "appointments_last_days",
    "days_since_created",
    "days_since_last_appointment",
]

# Move the clinic into the index so it is not used as a feature by default
featuretable = featuretable[model_columns].reset_index()
featuretable = featuretable.set_index(["pseudo_id", "start", "hoofdagenda"])

In [None]:
# Encode the target as integers: 1 for a no-show, 0 for a show
target_as_text = featuretable["no_show"].replace({"no_show": "1", "show": "0"})
featuretable["no_show"] = target_as_text.astype(int)

# Hour and weekday are passed to the models as categorical features
for categorical_column in ("hour", "weekday"):
    featuretable[categorical_column] = featuretable[categorical_column].astype(
        "category"
    )

print(featuretable.dtypes)

# Split the table into the target and the predictors
y = featuretable["no_show"]
X = featuretable.drop(columns="no_show")

## Train different models and add predictions to data

In [None]:
# Hyperparameters of the general model; the clinic is not among its
# categorical features
general_model_params = {
    "learning_rate": 0.05,
    "max_iter": 300,
    "categorical_features": ["hour", "weekday"],
    "random_state": 42,
}
lgboost_model = HistGradientBoostingClassifier(**general_model_params)

In [None]:
# Fit the general model on the complete feature table and score the same rows,
# i.e. these are in-sample predictions (no train/test split is made here)
fitted_model = lgboost_model.fit(X, y)
preds = fitted_model.predict_proba(X)

In [None]:
# Hyperparameters of the clinic-aware model: identical to the general model,
# except that "clinic" is added as an extra categorical feature
clinic_model_params = {
    "learning_rate": 0.05,
    "max_iter": 300,
    "categorical_features": ["hour", "weekday", "clinic"],
    "random_state": 42,
}
lgboost_poli = HistGradientBoostingClassifier(**clinic_model_params)

In [None]:
# Same predictors as the general model, plus the clinic taken from the index
clinic_values = X.index.get_level_values("hoofdagenda")
X_poli = X.assign(clinic=clinic_values)

# Fit the clinic-aware model and score the same rows (in-sample predictions)
fitted_model_poli = lgboost_poli.fit(X_poli, y)
preds_poli = fitted_model_poli.predict_proba(X_poli)

In [None]:
# Copy of the feature table with the second predict_proba column of both
# models attached
featuretable_preds = featuretable.assign(
    preds=preds[:, 1],
    preds_poli=preds_poli[:, 1],
)

### Add predictions from separate models per clinic (poli)

In [None]:
# Train one separate model per clinic and store its in-sample predictions in
# the "preds_sep" column.
#
# Each clinic gets its own unfitted clone of `lgboost_model` (same
# hyperparameters). Calling `lgboost_model.fit` directly would refit the shared
# estimator in place, so `fitted_model` (the very same object) would silently
# end up being the model of the last clinic in the loop.
clinic_level = X.index.get_level_values("hoofdagenda")
featuretable_preds["preds_sep"] = np.nan
for clinic in clinic_level.unique():
    # Boolean mask of the rows that belong to this clinic; X, y and
    # featuretable_preds share the same row order, so it applies to all three
    clinic_mask = clinic_level == clinic
    X_clinic, y_clinic = X[clinic_mask], y[clinic_mask]
    fitted_model_clinic = clone(lgboost_model).fit(X_clinic, y_clinic)
    preds_clinic = fitted_model_clinic.predict_proba(X_clinic)
    featuretable_preds.loc[clinic_mask, "preds_sep"] = preds_clinic[:, 1]

## Visualise the prediction distribution per clinic and per model

In [None]:
featuretable_preds["clinic"] = featuretable_preds.index.get_level_values("hoofdagenda")
# Plot a histogram of the general model's predictions per clinic
groups = featuretable_preds.groupby("clinic")
# squeeze=False always returns a 2D array of axes, so the loop below also works
# when there is only a single clinic
fig, ax = plt.subplots(
    len(groups), 1, figsize=(15, 20), sharex=True, squeeze=False
)
for axis, (clinic, group) in zip(ax[:, 0], groups):
    group["preds"].plot.hist(bins=100, alpha=0.5, legend=True, title=clinic, ax=axis)
# plt.show() (as used by the other plotting cells) instead of fig.show()
plt.show()

In [None]:
# Plot a histogram of the clinic-aware model's predictions per clinic.
# squeeze=False always returns a 2D array of axes, so the loop below also works
# when there is only a single clinic
fig, ax = plt.subplots(
    len(groups), 1, figsize=(15, 20), sharex=True, squeeze=False
)
for axis, (clinic, group) in zip(ax[:, 0], groups):
    group["preds_poli"].plot.hist(
        bins=100, alpha=0.5, legend=True, title=clinic, ax=axis
    )
# plt.show() (as used by the other plotting cells) instead of fig.show()
plt.show()

## Visualise the distribution of called patients per prediction quantile and per model

In [None]:
# For both models: cut the predictions into 10 quantile bins (0 = lowest,
# 9 = highest) and plot, per bin, the number of predictions per clinic
for score_column, bin_column in (
    ("preds", "preds_bin"),
    ("preds_poli", "preds_poli_bin"),
):
    featuretable_preds[bin_column] = pd.qcut(
        featuretable_preds[score_column], 10, labels=False
    )
    counts_per_bin = (
        featuretable_preds.groupby([bin_column, "clinic"], observed=True)
        .size()
        .unstack()
    )
    counts_per_bin.plot.bar(stacked=True, figsize=(15, 8))
    plt.show()

## Create reclassification table

In [None]:
# Toggle for the reclassification tables below: True reports percentages,
# False reports raw counts
RELATIVE_TABLE = True

In [None]:
# An appointment counts as "called" by a model when its prediction falls in
# bin 8 or 9, i.e. the two highest of the 10 quantile bins
featuretable_preds["called_by_gen_model"] = featuretable_preds["preds_bin"] >= 8
featuretable_preds["called_by_poli_model"] = featuretable_preds["preds_poli_bin"] >= 8

# Cross-tabulate both flags: rows are the general model, columns the
# clinic-aware model. A combination that never occurs would otherwise show up
# as NaN (or shrink the table), so the table is forced to a full 2x2 with zeros.
called_flags = [False, True]
reclassification_table = (
    featuretable_preds.value_counts(
        ["called_by_gen_model", "called_by_poli_model"], normalize=RELATIVE_TABLE
    )
    .unstack(fill_value=0)
    .reindex(index=called_flags, columns=called_flags, fill_value=0)
)

if RELATIVE_TABLE:
    # Express the fractions as percentages with two decimals
    reclassification_table = (reclassification_table * 100).round(2)

reclassification_table

In [None]:
# Create a visualisation of the previous reclassification table
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# NOTE: the table's rows (y-axis) belong to the general model and its columns
# (x-axis) to the clinic model, so `column_labels` ends up on the y-axis and
# `row_labels` on the x-axis. The names are kept because later cells reuse them.
column_labels = ["Not called by general model", "Called by general model"]
row_labels = ["Not called by clinic model", "Called by clinic model"]

ax.imshow(reclassification_table)
ax.set_xticks(np.arange(len(row_labels)), labels=row_labels)
ax.set_yticks(np.arange(len(column_labels)), labels=column_labels)
ax.set_title("Reclassification table")

# Annotate every cell with its value; add a percent sign for relative tables,
# matching the per-clinic plot further down
value_suffix = "%" if RELATIVE_TABLE else ""
for (i, j), value in np.ndenumerate(reclassification_table.to_numpy()):
    ax.text(
        j,
        i,
        f"{value}{value_suffix}",
        ha="center",
        va="center",
        color="darkgrey",
        antialiased=True,
        fontsize=16,
    )
plt.show()

In [None]:
# Count, per clinic, how often each combination of the two "called" flags occurs
flag_columns = ["called_by_gen_model", "called_by_poli_model"]
counts_series = featuretable_preds.groupby(["clinic", *flag_columns]).size()

# Make sure every clinic has all four flag combinations. A combination that
# never occurs is absent from the groupby result, which would leave NaN cells
# or a table smaller than 2x2 for that clinic and break the plot below.
all_combinations = pd.MultiIndex.from_product(
    [
        counts_series.index.get_level_values("clinic").unique(),
        [False, True],
        [False, True],
    ],
    names=["clinic", *flag_columns],
)
counts_series = counts_series.reindex(all_combinations, fill_value=0)

# Number of appointments per clinic, repeated on every row of that clinic
total_series = counts_series.groupby("clinic").transform("sum")
reclassification_table_per_clinic = pd.concat([counts_series, total_series], axis=1)
reclassification_table_per_clinic.columns = ["count", "total"]

if RELATIVE_TABLE:
    # Express the counts as a percentage of the clinic's total, two decimals
    reclassification_table_per_clinic["count"] = (
        reclassification_table_per_clinic["count"]
        / reclassification_table_per_clinic["total"]
        * 100
    ).round(2)

# Rows: (clinic, general model flag); columns: ("count", clinic model flag)
reclassification_table_per_clinic = reclassification_table_per_clinic.drop(
    columns="total"
).unstack()
reclassification_table_per_clinic

In [None]:
# Draw one reclassification heatmap per clinic, three per row. The number of
# rows follows from the number of clinics, so any clinic count is supported.
clinics = reclassification_table_per_clinic.index.get_level_values("clinic").unique()
n_cols = 3
n_rows = max(1, int(np.ceil(len(clinics) / n_cols)))
fig, ax = plt.subplots(
    n_rows, n_cols, figsize=(20, 15), layout="tight", squeeze=False
)
ax = ax.flatten()

value_suffix = "%" if RELATIVE_TABLE else ""
for axis, clinic in zip(ax, clinics):
    clinic_table = reclassification_table_per_clinic.loc[clinic]
    axis.set_title(clinic)
    axis.imshow(clinic_table)
    axis.set_xticks(np.arange(len(row_labels)), labels=row_labels)
    axis.set_yticks(np.arange(len(column_labels)), labels=column_labels)

    # Annotate every cell with its value (x = table column, y = table row)
    for (j, k), value in np.ndenumerate(clinic_table.to_numpy()):
        axis.text(
            k,
            j,
            f"{value}{value_suffix}",
            ha="center",
            va="center",
            color="darkgrey",
            antialiased=True,
            fontsize=18,
        )

# Hide grid positions that have no clinic assigned to them
for unused_axis in ax[len(clinics):]:
    unused_axis.set_visible(False)

fig.suptitle("Reclassification table per clinic", fontsize=20)
# plt.show() (as used by the other plotting cells) instead of fig.show()
plt.show()