# First model experiments

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, RocCurveDisplay
import matplotlib.pyplot as plt

from sklearn.metrics import auc
from imblearn.over_sampling import SMOTE

from dvclive import Live

In [None]:
# Load the processed feature table and show the dtype of every column.
featuretable = pd.read_parquet(
    "/mapr/no_show/no_show_onderzoeker/rpeters7/No_Show/data/processed/featuretable.parquet"
)
print(featuretable.dtypes)

# Encode the target as integers: "no_show" -> 1 (positive class), "show" -> 0.
target_encoding = {"no_show": 1, "show": 0}
featuretable["no_show"] = featuretable["no_show"].replace(target_encoding)

# Separate the target column from the feature columns.
y = featuretable["no_show"]
X = featuretable.drop(columns="no_show")

# Hold out 20% of the rows for testing. shuffle=False keeps the original row
# order, so the test set is the final 20% of the table.
# NOTE(review): per the scikit-learn docs random_state only controls the
# shuffling, so it should have no effect while shuffle=False.
split = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
X_train, X_test, y_train, y_test = split


In [None]:
# Fit a preprocessing + SMOTE + random forest pipeline with a cross-validated
# grid search and log the best score and parameters to DVCLive.
with Live(save_dvc_exp=True) as live:
    # Fixed seed for the stochastic steps (SMOTE and the random forest) so the
    # experiment that gets logged can be reproduced from run to run.
    seed = 0

    param_grid = {"classifier__n_estimators": [100]}

    # Oversampling is a pipeline step, so it is applied when the pipeline is
    # fitted (i.e. on the training folds) and skipped at prediction time.
    oversampler = SMOTE(random_state=seed)

    # Object-dtype columns are treated as categorical, everything else as
    # numeric.
    column_dtypes = list(X.dtypes.items())
    categorical_cols = [col for col, dtype in column_dtypes if dtype == "object"]
    num_cols = [col for col, dtype in column_dtypes if dtype != "object"]

    # Numeric columns pass through unchanged; categorical columns are one-hot
    # encoded, with categories unseen during fitting encoded as all zeros.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", "passthrough", num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ]
    )

    # Full pipeline: preprocessing -> SMOTE oversampling -> random forest.
    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("smotesampling", oversampler),
            ("classifier", RandomForestClassifier(random_state=seed)),
        ]
    )

    # 5-fold cross-validated grid search. Three metrics are computed, and the
    # final model is refitted on all training data using the best ROC AUC.
    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=5,
        scoring=["roc_auc", "precision", "recall"],
        verbose=2,
        refit="roc_auc",
    )
    grid.fit(X_train, y_train)

    # best_score_ is the mean cross-validated score of the refit metric.
    live.log_metric("best_score", grid.best_score_)
    live.log_params(grid.best_params_)

In [None]:
# ROC curve of the refitted best model on the held-out test set.
y_pred = grid.best_estimator_.predict_proba(X_test)

# Column 1 is used as the score of the positive class (no_show == 1).
positive_scores = y_pred[:, 1]
auc_score = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

fig, ax = plt.subplots()
# Model curve first, then the diagonal of a random classifier for reference.
ax.plot(fpr, tpr, label=f"Random Forest (AUC={round(auc_score, 2)})")
ax.plot([0, 1], [0, 1], label="Random (AUC=0.5)", linestyle="dotted")
ax.legend()
plt.show()

In [None]:
# Test the performance on the test data.
# GridSearchCV fits clones of `pipeline`, so `pipeline` itself is never fitted;
# the model refitted on the full training set lives in `grid.best_estimator_`.
y_pred = grid.best_estimator_.predict_proba(X_test)

# Column 1 is the predicted probability of the positive class (no_show == 1).
auc_score = roc_auc_score(y_test, y_pred[:, 1])
auc_score


In [None]:
# Turn the predicted probabilities into string labels using a 0.5 threshold.
# The target is encoded as no_show=1 / show=0, so column 1 of `y_pred` is the
# no-show probability (column 0 is the probability of showing up).
y_pred_cat = np.where(y_pred[:, 1] > 0.5, "no_show", "show").astype(object)
y_pred_cat

In [None]:
# y_test is encoded as 1 (no_show) / 0 (show) while y_pred_cat holds string
# labels, so encode the predictions the same way before scoring.
precision_score(y_test, (y_pred_cat == "no_show").astype(int), pos_label=1)

In [None]:
# y_test is encoded as 1 (no_show) / 0 (show) while y_pred_cat holds string
# labels, so encode the predictions the same way before scoring.
recall_score(y_test, (y_pred_cat == "no_show").astype(int), pos_label=1)

In [None]:
# ROC curve on the test set. y_test is encoded as 1 (no_show) / 0 (show), so
# the positive-class score is column 1 of `y_pred` and the positive label is 1
# (the string "no_show" no longer occurs in y_test).
fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1], pos_label=1)
plt.plot(fpr, tpr, label=f"Random Forest (AUC={round(auc_score, 2)})")
# Diagonal of a random classifier for reference.
plt.plot([0, 1], [0, 1], label="Random (AUC=0.5)", linestyle="dotted")
plt.legend()
plt.show()

In [None]:
# Plot the impurity-based (MDI) feature importances of the fitted forest.
# GridSearchCV only fits clones of `pipeline`, so the fitted steps have to be
# read from `grid.best_estimator_` rather than from the unfitted `pipeline`.
best_pipeline = grid.best_estimator_
forest_importances = pd.Series(
    best_pipeline.named_steps["classifier"].feature_importances_,
    # Names of the columns produced by the preprocessor, in output order.
    index=best_pipeline.named_steps["preprocessor"].get_feature_names_out(),
).sort_values()

fig, ax = plt.subplots()
ax.barh(
    forest_importances.index,
    forest_importances,
)
ax.set_title("Feature importances using MDI")
# The bars are horizontal, so the importance values run along the x-axis.
ax.set_xlabel("Mean decrease in impurity")
fig.tight_layout()
plt.show()