In [None]:
# Load IPython's autoreload extension and reload all modules before each cell
# runs (mode 2), so edits to imported project code are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
)

from noshow.features.feature_pipeline import create_features
from noshow.preprocessing.load_data import (
    load_appointment_csv,
    process_appointments,
    process_postal_codes,
)

In [None]:
# Read the raw appointment export, run it through the project's preprocessing
# step, and load the postal-code reference file.
appointments_df = process_appointments(
    load_appointment_csv("../data/raw/poliafspraken_no_show.csv")
)
all_postalcodes = process_postal_codes("../data/raw/NL.txt")

# Columns kept for modelling: the target ("no_show"), the model inputs, and
# "hoofdagenda", which ends up in the index below.
model_columns = [
    "hoofdagenda",
    "hour",
    "weekday",
    "minutesDuration",
    "no_show",
    "prev_no_show",
    "prev_no_show_perc",
    "age",
    "dist_umcu",
    "prev_minutes_early",
    "earlier_appointments",
    "appointments_same_day",
    "appointments_last_days",
    "days_since_created",
    "days_since_last_appointment",
]
index_levels = ["pseudo_id", "start", "hoofdagenda"]

featuretable = create_features(appointments_df, all_postalcodes)
# reset_index() turns the existing index levels into ordinary columns;
# pseudo_id and start are expected to come from there, since they are not in
# model_columns. The frame is then re-indexed on index_levels.
featuretable = featuretable[model_columns].reset_index().set_index(index_levels)

In [None]:
# Encode the target as integers: "no_show" -> 1, "show" -> 0.
featuretable["no_show"] = (
    featuretable["no_show"].replace({"no_show": "1", "show": "0"}).astype(int)
)
# Store the two discrete time features with pandas' category dtype.
featuretable["hour"] = featuretable["hour"].astype("category")
featuretable["weekday"] = featuretable["weekday"].astype("category")

print(featuretable.dtypes)
X, y = featuretable.drop(columns="no_show"), featuretable["no_show"]

# Split on patients instead of on individual appointments. The table is indexed
# by (pseudo_id, start, hoofdagenda), so one patient can own several rows, and
# columns such as prev_no_show presumably summarise that patient's history. A
# row-level random split would put appointments of the same patient in both
# train and test, letting that history leak across the split and making the
# test metrics look better than they are.
# Note: test_size is now the fraction of *patients* held out, so the fraction
# of rows in the test set is only approximately 0.2.
pseudo_ids = X.index.get_level_values("pseudo_id")
train_ids, test_ids = train_test_split(
    pseudo_ids.unique().to_numpy(), test_size=0.2, random_state=0
)
in_test = pseudo_ids.isin(test_ids)  # boolean mask aligned with the rows of X / y
X_train, X_test = X.loc[~in_test], X.loc[in_test]
y_train, y_test = y.loc[~in_test], y.loc[in_test]

In [None]:
# Hyperparameters of the histogram-based gradient boosting classifier.
# "hour" and "weekday" are declared as categorical features by column name;
# random_state pins the estimator's seed.
hgb_params = {
    "learning_rate": 0.05,
    "max_iter": 300,
    "categorical_features": ["hour", "weekday"],
    "random_state": 42,
}
lgboost_model = HistGradientBoostingClassifier(**hgb_params)

In [None]:
# Train on the training split. fit() returns the estimator itself, so
# fitted_model and lgboost_model refer to the same (now fitted) object.
lgboost_model.fit(X_train, y_train)
fitted_model = lgboost_model
# Class probabilities for the test split, one column per class.
preds = fitted_model.predict_proba(X_test)

## ROC curve

In [None]:
# Score the test split. Column 1 of predict_proba is the probability of the
# positive class, i.e. no_show == 1.
y_pred = fitted_model.predict_proba(X_test)
no_show_proba = y_pred[:, 1]

# The ROC thresholds are not plotted; they get their own name so they cannot be
# mistaken for the precision/recall thresholds computed further down.
fpr, tpr, roc_thresholds = roc_curve(y_test, no_show_proba)
auc_score = roc_auc_score(y_test, no_show_proba)

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, label=f"Hist Gradient Boosting (AUC={round(auc_score, 2)})")
# Diagonal reference line: the expected ROC of a random classifier.
ax.plot([0, 1], [0, 1], label="Random (AUC=0.5)", linestyle="dotted")
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve (test set)",
)
ax.legend()
plt.show()

## Precision, recall

In [None]:
# precision and recall each hold one element more than thresholds (the final
# point has no corresponding threshold), hence the [:-1] slices when plotting
# them against the thresholds.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred[:, 1])

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(thresholds, precision[:-1], label="precision")
ax.plot(thresholds, recall[:-1], label="recall")
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="Decision threshold",
    ylabel="Score",
    title="Precision and recall per threshold (test set)",
)
ax.legend()
plt.show()

## Create fixed bins

In [None]:
# Positive-class probability for every test row, keyed by the test index; the
# index levels are then moved into ordinary columns.
positive_proba = y_pred[:, 1]
prediction_df = pd.DataFrame(
    {"prediction": positive_proba}, index=X_test.index
).reset_index()

In [None]:
# n_bins bins need n_bins + 1 edges, taken at the quantiles 0, 1/n_bins, ..., 1.
n_bins = 4
quantiles = np.linspace(0, 1, n_bins + 1)

# Keep a single row per patient: the one with the highest predicted
# probability (presumably so that patients with many appointments do not
# dominate the quantiles -- confirm with the consumer of these bins).
top_prediction_per_patient = prediction_df.sort_values(
    "prediction", ascending=False
).drop_duplicates(subset="pseudo_id", keep="first")

# Determine the quantiles for every hoofdagenda group in prediction_df.
# The quantile level of the resulting index is unnamed; name it explicitly
# instead of relying on the "level_1" column that reset_index() would invent.
edges_long = (
    top_prediction_per_patient.groupby("hoofdagenda")["prediction"]
    .quantile(quantiles)
    .rename_axis(index=["hoofdagenda", "quantile"])
    .reset_index()
)

# Reshape to one row per hoofdagenda and one column per quantile.
edges_wide = pd.pivot_table(
    edges_long, values="prediction", index="hoofdagenda", columns="quantile"
)

# Create a dict where hoofdagendas are the keys and the values are
# {quantile: bin edge} mappings.
bin_edges = edges_wide.to_dict(orient="index")

# Save to JSON. json.dump serialises the float quantile keys as strings
# ("0.0", "0.25", ...).
with open(
    "../data/processed/fixed_pred_score_bin.json", "w", encoding="utf-8"
) as f:
    json.dump(bin_edges, f)
print(bin_edges)