In [None]:
# Classifier imports
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import classification_report
from sklearn.metrics import RocCurveDisplay


import pylab as plt
import pandas as pd
import numpy as np

from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import svm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Locations of the input-feature file and the ground-truth outcome file.
# NOTE(review): the read_csv cells visible in this notebook load from a
# `datasets` directory rather than from these two variables - confirm which
# location is the intended one.
path_train, path_outcome = (
    "./LISS_example_input_data.csv",
    "./LISS_example_groundtruth_data.csv",
)

In [None]:
# Read the input features (file is cp1252-encoded).
# The path uses a forward slash so it resolves on every operating system; a
# backslash followed by "L" is an invalid string escape sequence and only
# behaves as a path separator on Windows.
original_data = pd.read_csv("datasets/LISS_example_input_data.csv", encoding='cp1252')

In [None]:
# Read the ground-truth outcome (file is cp1252-encoded).
# The path uses a forward slash so it resolves on every operating system; a
# backslash followed by "L" is an invalid string escape sequence and only
# behaves as a path separator on Windows.
outcome = pd.read_csv("datasets/LISS_example_groundtruth_data.csv", encoding="cp1252")

In [None]:
# Keep only the observations whose `new_child` outcome is known. The same
# boolean mask filters both the feature frame and the outcome frame.
# NOTE(review): assumes both frames share the same row index - TODO confirm.
y_isna = outcome['new_child'].isna()
has_outcome = ~y_isna
data, outcome = original_data.loc[has_outcome], outcome.loc[has_outcome]

In [None]:
# Predictor columns kept for modelling (among others: education, year of
# birth, gender and number of children in the household).
# Automatic feature selection is an alternative (not necessarily better):
# https://scikit-learn.org/stable/modules/feature_selection.html
keepcols = [
    'gebjaar', 'geslacht', 'leeftijd2019', 'positie2019',
    'aantalhh2019', 'partner2019', 'sted2019', 'belbezig2019',
    'nettohh_f2019', 'oplmet2019', 'herkomstgroep2019',
    'burgstat2019', 'woonvorm2019', 'aantalki2019', 'cf19l128',
    'cf19l131', 'cf19l132', 'cf19l133', 'cf19l134', 'woning2019',
    'cf19l456', 'cw19l522', 'cr19l143', 'cr19l090',
    'cf19l483', 'cf19l484', 'cf19l485', 'cf19l486', 'cf19l487', 'cf19l488',
    'cf19l130', 'cf19l457', 'cf19l458', 'cf19l459',
]
# Restrict the feature frame to exactly these columns, in this order.
data = data[keepcols]

In [None]:
# Hold out 25% of the rows for evaluation (fixed seed so the split is
# repeatable), then reduce both outcome frames to the `new_child` target.
X_train, X_test, y_train, y_test = train_test_split(
    data, outcome, test_size=0.25, random_state=2023
)
y_train, y_test = y_train["new_child"], y_test["new_child"]

In [None]:
from sklearn.utils import resample

# Balance the training set: draw rows of the positive class (target == 1)
# with replacement until there are as many as rows of the negative class
# (target == 0), then stack the resampled positives on top of the untouched
# negatives.
is_positive = y_train == 1
is_negative = y_train == 0
target_upsample, data_upsample = resample(
    y_train[is_positive],
    X_train[is_positive],
    replace=True,
    n_samples=len(y_train[is_negative]),
    random_state=42,
)
target_upsampled = pd.concat([target_upsample, y_train[is_negative]])
data_upsampled = pd.concat([data_upsample, X_train[is_negative]])

In [None]:
# Lookup from the textual "number of children" categories to numeric counts.
dict_kids = {'None': 0, 'One child': 1, 'Two children': 2, 'Three children': 3, 'Four children': 4, 'Five children': 5, 'Six children': 6}
# Convert the column only while it is still non-numeric: `.map` turns every
# value that is not a key of `dict_kids` into NaN, so re-running this cell on
# the already converted column would otherwise wipe it out.
if not pd.api.types.is_numeric_dtype(data_upsampled["aantalki2019"]):
    data_upsampled["aantalki2019"] = data_upsampled["aantalki2019"].map(dict_kids)

In [None]:
# Preprocessing, defined separately from the model pipeline.
# (Imputers are sometimes not necessary.)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode with min_frequency=50 and handle_unknown='infrequent_if_exist'.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=50)),
    ]
)

# Numerical columns: iterative imputation (up to 100 rounds), then scaling.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', IterativeImputer(max_iter=100)),
        ('scaler', StandardScaler()),
    ]
)

# Resolve the column lists against the upsampled training frame: object-dtype
# columns are treated as categorical, everything else as numerical.
numeric_columns = selector(dtype_exclude=object)(data_upsampled)
categorical_columns = selector(dtype_include=object)(data_upsampled)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

In [None]:
# Full estimator: preprocessing followed by a gradient-boosting classifier.
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("classifier", HistGradientBoostingClassifier()),
    ]
)

# Hyperparameter grid. Several classifiers can be listed here, at the cost of
# a slower search. Overview of available classifiers:
# https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
parameters = [
    {'classifier': [HistGradientBoostingClassifier(learning_rate=0.8, max_depth=3)]},
]

# Cross-validated search over the grid.
# Optimizers: https://scikit-learn.org/stable/modules/classes.html#hyper-parameter-optimizers
# Scoring metrics: https://scikit-learn.org/stable/modules/model_evaluation.html
# "f1" is the F1 score of the class labeled 1 (i.e. kids).
# NOTE(review): the folds are drawn from the upsampled frame, which was built
# by sampling with replacement - copies of the same row may end up in both the
# training and validation part of a fold, so the CV score may be optimistic.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring="f1",
    cv=5,
    n_jobs=-1,  # -1 allows for multiprocessing
    verbose=9,
)
grid_search.fit(data_upsampled, target_upsampled)

# Keep the best pipeline found by the search (it could also be rebuilt from
# scratch with the best parameters).
best_model = grid_search.best_estimator_

best_model

In [None]:
# Names of the columns produced by the fitted preprocessing step.
best_model.named_steps["preprocess"].get_feature_names_out()

In [None]:
# Apply the children-count encoding to the held-out features as well.
# `.assign` builds a new frame instead of writing into the object returned by
# train_test_split (in-place writes there can trigger pandas'
# SettingWithCopyWarning). The dtype guard keeps the cell re-runnable: mapping
# an already numeric column through `dict_kids` would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(X_test["aantalki2019"]):
    X_test = X_test.assign(aantalki2019=X_test["aantalki2019"].map(dict_kids))

In [None]:
# ROC curve on the held-out set: shows how well true positives can be traded
# off against false positives. Scores are column 1 of predict_proba.
positive_scores = best_model.predict_proba(X_test)[:, 1]
roc_display = RocCurveDisplay.from_predictions(
    y_test,
    positive_scores,
    color="cornflowerblue",
)

# Add the diagonal reference line and label the axes the curve was drawn on.
roc_ax = roc_display.ax_
roc_ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
plt.show()

In [None]:
# Predicted class labels for the held-out rows.
y_pred = best_model.predict(X_test)

# Build the classification report for those predictions and print it.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import os

# Make sure the output directory exists before writing into it.
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted pipeline (don't change the file name).
dump(best_model, "../models/model v2.joblib")