In [None]:
import pandas as pd
import numpy as np
import sklearn
import imblearn

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# create indices for each fold, train and test
from sklearn.model_selection import StratifiedKFold

# Single seed shared by the splitter and by every `random_state=seed` in the
# pipeline cells below. It has to be defined before those cells run, otherwise
# a fresh-kernel "Run All" stops with NameError: name 'seed' is not defined.
seed = 42

# 5 stratified, shuffled folds; with seed == 42 the fold assignment is the same
# as with the previously hard-coded random_state=42.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [None]:
# create pipelines for all algorithms

In [None]:
# pipeline Dummy
from sklearn.dummy import DummyClassifier

# Baseline pipeline: KNN imputation -> standardisation -> SMOTE on the
# minority class -> classifier that always predicts the most frequent label.
pipeline_dumdum = ImbPipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('sampler', SMOTE(sampling_strategy='minority', random_state=seed)),
        ('model', DummyClassifier(random_state=seed, strategy='most_frequent')),
    ]
)


In [None]:
# pipeline Logistic
from sklearn.linear_model import LogisticRegression

# Same preprocessing chain as the other pipelines (impute -> scale -> SMOTE),
# followed by an unweighted logistic regression.
pipeline_log = ImbPipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('sampler', SMOTE(sampling_strategy='minority', random_state=seed)),
        ('model', LogisticRegression(random_state=seed, class_weight=None)),
    ]
)

In [None]:
# pipeline CART
from sklearn.tree import DecisionTreeClassifier

# Impute -> scale -> SMOTE, then a single decision tree limited to depth 10
# that needs at least 10 samples in a node before it may split.
pipeline_cart = ImbPipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('sampler', SMOTE(sampling_strategy='minority', random_state=seed)),
        (
            'model',
            DecisionTreeClassifier(
                max_depth=10,
                min_samples_split=10,
                class_weight=None,
                random_state=seed,
            ),
        ),
    ]
)

In [None]:
# pipeline RF
from sklearn.ensemble import RandomForestClassifier

# Impute -> scale -> SMOTE, then a random forest of 300 trees, each limited
# to depth 20, without class weighting.
pipeline_rf = ImbPipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('sampler', SMOTE(sampling_strategy='minority', random_state=seed)),
        (
            'model',
            RandomForestClassifier(
                n_estimators=300,
                max_depth=20,
                class_weight=None,
                random_state=seed,
            ),
        ),
    ]
)

In [None]:
# pipeline booster
from sklearn.ensemble import HistGradientBoostingClassifier

# Impute -> scale -> SMOTE, then histogram-based gradient boosting with up to
# 300 boosting iterations and trees limited to depth 20.
pipeline_boost = ImbPipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('sampler', SMOTE(sampling_strategy='minority', random_state=seed)),
        (
            'model',
            HistGradientBoostingClassifier(
                max_iter=300,
                max_depth=20,
                class_weight=None,
                random_state=seed,
            ),
        ),
    ]
)

In [None]:
# make pipeline lists for training
# Order matters to anything that indexes this list positionally:
# dummy baseline, logistic regression, CART, random forest, boosting.
pipes_train = [
    pipeline_dumdum,
    pipeline_log,
    pipeline_cart,
    pipeline_rf,
    pipeline_boost,
]

In [None]:
# loop through all models
import copy

for idx, train_item in enumerate(pipes_train):

    # collect validation pipelines: one fitted, sampler-free pipeline per fold
    # for the current model
    val_pipeline = []

    # cross validation sets
    for i, (train_index, test_index) in enumerate(kfold.split(X, Y)):

        # positional slices for this fold; targets are flattened to 1-D arrays
        X_train_fold = X.iloc[train_index]
        X_test_fold = X.iloc[test_index]
        y_train_fold = np.ravel(Y.iloc[train_index])
        y_test_fold = np.ravel(Y.iloc[test_index])

        # fit (the pipeline is fitted in place; `model` is the same object as
        # `train_item`, so after the loop it holds the last fold's fit)
        model = train_item.fit(X_train_fold, y_train_fold)

        # store validation pipeline
        # The sampler step is left out: resampling is only wanted while training.
        # The fitted steps are deep-copied because `train_item` is re-fitted in
        # place on the next fold; storing bare references would make every entry
        # of `val_pipeline` silently end up as the last fold's estimators.
        pipeline = ImbPipeline([
            ('imputer', copy.deepcopy(model.named_steps['imputer'])),
            ('scaler', copy.deepcopy(model.named_steps['scaler'])),
            ('model', copy.deepcopy(model.named_steps['model']))
        ])

        val_pipeline.append(pipeline)

        # predict on the held-out part of the fold
        y_pred = pipeline.predict(X_test_fold)

        # test
        # NOTE(review): these per-fold values are overwritten on every iteration
        # in the part of the cell visible here - confirm they are collected
        # further down before relying on them.
        accuracy = accuracy_score(y_test_fold, y_pred)
        f1 = f1_score(y_test_fold, y_pred, zero_division=0)
        precision = precision_score(y_test_fold, y_pred, zero_division=0)
        # zero_division=0 matches f1/precision: same 0.0 result, no warning
        recall = recall_score(y_test_fold, y_pred, zero_division=0)
        mse = mean_squared_error(y_test_fold, y_pred)

        # confusion matrix (rows = true label, columns = predicted label)
        conf_matrix = confusion_matrix(y_test_fold, y_pred)
        # FPR = FP / (TN + FP), taken from the first row
        false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 0] + conf_matrix[0, 1])
        # FNR = FN / (FN + TP), taken from the second row
        false_negative_rate = conf_matrix[1, 0] / (conf_matrix[1, 0] + conf_matrix[1, 1])
