# First model experiments

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE

In [None]:
# Read the feature table stored under data/processed; later cells split it
# into the feature matrix and the "no_show" target.
FEATURETABLE_PATH = (
    "/mapr/nicu_ew/nicu_ew_onderzoeker/rpeters7/No_Show/data/processed/featuretable.parquet"
)
featuretable = pd.read_parquet(FEATURETABLE_PATH)

# Last expression of the cell: displays the dtype of every column.
featuretable.dtypes

In [None]:
# Separate the "no_show" target from the remaining feature columns.
X, y = featuretable.drop(columns="no_show"), featuretable["no_show"]

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# class ratio identical in the train and test parts (the data is presumably
# imbalanced, given that SMOTE is applied below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Seeded so the synthetic minority samples are the same on every run.
oversampler = SMOTE(random_state=0)

# Object-typed columns are treated as categorical and one-hot encoded.
categorical_cols = [col for col in X.columns if X[col].dtype == "object"]

# Numeric and boolean columns are forwarded to the model unchanged.
numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

# ColumnTransformer drops every column that is not listed in `transformers`
# (remainder="drop" is the default), so the numeric columns must be named
# explicitly; otherwise the classifier only ever sees the one-hot features.
# NOTE(review): SMOTE does not accept missing values; if the numeric columns
# can contain NaN, add an imputation step before the sampler.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

# imblearn's Pipeline applies the sampler only while fitting, so the test data
# is never resampled. The classifier is seeded for reproducible results.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smotesampling", oversampler),
        ("classifier", RandomForestClassifier(random_state=0)),
    ]
)

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)


In [None]:
# Score the held-out rows: one row per sample, one probability column per class.
y_pred = pipeline.predict_proba(X_test)

# ROC AUC computed from the second probability column (presumably the "show"
# class, given how the later cells use column 0 for "no_show" — verify with
# pipeline.classes_).
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])


In [None]:
import numpy as np

# Convert probabilities into hard labels with a 0.5 cut-off on column 0:
# above the cut-off the row is labelled "no_show", otherwise "show".
# Column 0 is assumed to hold the "no_show" probability — verify with
# pipeline.classes_.
y_pred_cat = np.where(y_pred[:, 0] > 0.5, "no_show", "show").astype(object)
y_pred_cat

In [None]:
# Precision of the thresholded labels, with "no_show" as the positive class.
precision_score(y_true=y_test, y_pred=y_pred_cat, pos_label="no_show")

In [None]:
# Recall of the thresholded labels, with "no_show" as the positive class.
recall_score(y_true=y_test, y_pred=y_pred_cat, pos_label="no_show")

In [None]:
# ROC curve for the "no_show" class, scored with probability column 0.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test,
    y_score=y_pred[:, 0],
    pos_label="no_show",
)

# False-positive rate on the x-axis against true-positive rate on the y-axis.
plt.plot(fpr, tpr)
plt.show()