In [31]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomTreesEmbedding, StackingClassifier
from sklearn.model_selection import cross_val_predict

from constants import *
import numpy as np
from shutil import make_archive, unpack_archive
import time
from time import perf_counter
import warnings

from src.calc_scores import get_X_train_X_test_y_train_y_test
from pycaret.classification import *
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [32]:

dataset_id = "1489"
# Other dataset ids that were tried during experimentation: "40923", "3".

# Load the train/test split of the selected dataset.
X_train, X_test, y_train, y_test = get_X_train_X_test_y_train_y_test(
    dataset_folder=DATASETS_FOLDER_PATH.joinpath(dataset_id),
    random_state=RANDOM_STATE,
    X_file_name=X_FILTERED_FILE_NAME,
    y_file_name=Y_FILE_NAME,
)

# Short feedback on the data and the class balance.
class_counts = y_train.value_counts()
print(f"X_train shape: {X_train.shape}")
print(f"target classes: \n{class_counts}")
print(f"total {len(class_counts)} classes\n")

# Remember the pure feature columns before the target column is attached.
feature_names = list(X_train.columns)

# Down-sample the training data when it has more rows than `sample_size`.
sample_size = 1_000

if len(X_train) > sample_size:
    print(f"Sample is used of {sample_size}")
    X_train_sample, _, y_train_sample, _ = train_test_split(
        X_train, y_train, train_size=sample_size, random_state=RANDOM_STATE
    )
else:
    X_train_sample = X_train
    y_train_sample = y_train

# pycaret wants the target in a dataframe column.
# `assign` builds new frames instead of writing into the existing ones: in the
# `else` branch above X_train_sample is the very same object as X_train, so an
# in-place column assignment would silently add "y" to X_train as well (and
# writing into the frame returned by train_test_split may raise a
# SettingWithCopyWarning).
X_train_sample = X_train_sample.assign(y=y_train_sample)
X_test = X_test.assign(y=y_test)

X_train shape: (3566, 5)
target classes: 
0    2501
1    1065
Name: y, dtype: int64
total 2 classes

Sample is used of 1000


In [33]:
experiment = setup(
    # --- data -------------------------------------------------------------
    data=X_train_sample,
    test_data=X_test,
    target="y",
    # Force all features to be regarded as numeric, not categorical; this avoids
    # problems with xgboost and lightgbm.
    numeric_features=feature_names,
    preprocess=False,
    # --- cross-validation -------------------------------------------------
    fold=5,
    # fold_shuffle has to be True whenever a session_id is given, otherwise this
    # pycaret version fails with "ValueError: Setting a random_state has no effect
    # since shuffle is False. You should leave random_state to its default (None),
    # or set shuffle=True." See
    # https://stackoverflow.com/questions/67728802/valueerror-setting-a-random-state-has-no-effect-since-shuffle-is-false-you-sho
    fold_shuffle=True,
    # --- reproducibility / output -----------------------------------------
    session_id=42,
    html=False,  # must be set to False when run outside of a notebook (ipython)
)


Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
V1,Numeric
V2,Numeric
V3,Numeric
V4,Numeric
V5,Numeric
y,Label


Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target,y
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(1000, 6)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,0
8,Transformed Train Set,"(1000, 5)"
9,Transformed Test Set,"(1838, 5)"


In [34]:
# Display pycaret's table of available estimators; the "ID" column holds the
# short names that are passed to `compare_models(include=...)`.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,pycaret.internal.tunable.TunableMLPClassifier,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


## Train Models

In [35]:
# pycaret estimator IDs that take part in the comparison.
# Currently left out: "mlp" (too slow), "xgboost", "lightgbm", "catboost".
model_types_to_use = ["rf", "lr", "knn", "dt"]

# Cross-validate every listed estimator and keep all of them (not only the best
# one); any training error is raised instead of being skipped.
selected_models = compare_models(
    include=model_types_to_use,
    n_select=len(model_types_to_use),
    errors="raise",
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.848,0.9145,0.7029,0.771,0.734,0.6279,0.6303,0.06


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.848,0.9145,0.7029,0.771,0.734,0.6279,0.6303,0.06
lr,Logistic Regression,0.732,0.8046,0.4084,0.5703,0.4724,0.3,0.3094,0.004


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.848,0.9145,0.7029,0.771,0.734,0.6279,0.6303,0.06
knn,K Neighbors Classifier,0.826,0.8722,0.6658,0.7285,0.6951,0.5737,0.5753,0.008
lr,Logistic Regression,0.732,0.8046,0.4084,0.5703,0.4724,0.3,0.3094,0.004


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.848,0.9145,0.7029,0.771,0.734,0.6279,0.6303,0.06
knn,K Neighbors Classifier,0.826,0.8722,0.6658,0.7285,0.6951,0.5737,0.5753,0.008
dt,Decision Tree Classifier,0.804,0.7658,0.6722,0.6726,0.6714,0.5319,0.5327,0.004
lr,Logistic Regression,0.732,0.8046,0.4084,0.5703,0.4724,0.3,0.3094,0.004


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.848,0.9145,0.7029,0.771,0.734,0.6279,0.6303,0.06
knn,K Neighbors Classifier,0.826,0.8722,0.6658,0.7285,0.6951,0.5737,0.5753,0.008
dt,Decision Tree Classifier,0.804,0.7658,0.6722,0.6726,0.6714,0.5319,0.5327,0.004
lr,Logistic Regression,0.732,0.8046,0.4084,0.5703,0.4724,0.3,0.3094,0.004


In [36]:
# Score every untuned model and collect one accuracy value per model.
model_names = []
accuracy_scores = []

for model in selected_models:
    # Called without `data`, predict_model scores pycaret's hold-out set
    # (per the pycaret docs) -- here the `test_data` handed to setup().
    prediction_df = predict_model(model)
    model_names.append(type(model).__name__)
    # accuracy_score expects (y_true, y_pred): "y" is the ground truth and
    # "Label" the prediction column added by predict_model.
    accuracy_scores.append(accuracy_score(prediction_df["y"], prediction_df["Label"]))

# Summary table, best model first.
predict_df = (
    pd.DataFrame(data={"model": model_names, "accuracy": accuracy_scores})
    .sort_values(by="accuracy", ascending=False)
    .reset_index(drop=True)
)
predict_df

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8629,0.9265,0.7524,0.7612,0.7568,0.6613,0.6613


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.8455,0.8971,0.6948,0.7433,0.7183,0.612,0.6126


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.8128,0.7673,0.6622,0.6725,0.6673,0.5371,0.5371


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7459,0.8126,0.4242,0.5696,0.4862,0.3222,0.3284


Unnamed: 0,model,accuracy
0,RandomForestClassifier,0.862894
1,KNeighborsClassifier,0.845484
2,DecisionTreeClassifier,0.81284
3,LogisticRegression,0.745919


## Tune Models

In [37]:
# Hyper-parameter search for each compared model (25 candidates per model).
# With choose_better=True pycaret hands back the original model whenever the
# tuned one does not beat it (per the pycaret docs).
tuned_models = []
for model in selected_models:
    tuned_model = tune_model(model, n_iter=25, choose_better=True)
    print(tuned_model)
    tuned_models.append(tuned_model)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8,0.8682,0.65,0.6724,0.661,0.5192,0.5194
1,0.88,0.9461,0.7627,0.8182,0.7895,0.7057,0.7066
2,0.81,0.8961,0.6441,0.6909,0.6667,0.534,0.5347
3,0.87,0.8953,0.7119,0.8235,0.7636,0.6746,0.678
4,0.89,0.9298,0.7288,0.8776,0.7963,0.7218,0.7277
Mean,0.85,0.9071,0.6995,0.7765,0.7354,0.6311,0.6333
SD,0.0374,0.0276,0.0459,0.0804,0.0595,0.0868,0.0883


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                       criterion='gini', max_depth=9, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0, min_samples_leaf=6,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       n_estimators=120, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
Fitting 5 folds for each of 25 candidates, totalling 125 fits


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.82,0.7902,0.5833,0.7609,0.6604,0.5408,0.5497
1,0.86,0.8687,0.5593,0.9429,0.7021,0.6183,0.6543
2,0.815,0.7923,0.5254,0.775,0.6263,0.5093,0.5263
3,0.815,0.8122,0.5085,0.7895,0.6186,0.5039,0.5251
4,0.835,0.8469,0.5254,0.8611,0.6526,0.5526,0.5816
Mean,0.829,0.8221,0.5404,0.8259,0.652,0.545,0.5674
SD,0.0171,0.0309,0.0271,0.0679,0.0295,0.041,0.048


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=-1, n_neighbors=2, p=2,
                     weights='uniform')
Fitting 5 folds for each of 25 candidates, totalling 125 fits


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.815,0.8348,0.7,0.6885,0.6942,0.5616,0.5617
1,0.825,0.866,0.661,0.7222,0.6903,0.5686,0.5697
2,0.74,0.7684,0.5932,0.5556,0.5738,0.387,0.3874
3,0.81,0.8212,0.6949,0.6721,0.6833,0.5477,0.5478
4,0.855,0.841,0.661,0.8125,0.729,0.6314,0.6377
Mean,0.809,0.8263,0.662,0.6902,0.6741,0.5393,0.5409
SD,0.0379,0.0324,0.0381,0.083,0.0526,0.0814,0.0827


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=11, max_features=1.0, max_leaf_nodes=None,
                       min_impurity_decrease=0.0001, min_samples_leaf=3,
                       min_samples_split=10, min_weight_fraction_leaf=0.0,
                       random_state=42, splitter='best')
Fitting 5 folds for each of 25 candidates, totalling 125 fits


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.735,0.7798,0.5333,0.5614,0.547,0.3599,0.3601
1,0.765,0.868,0.3898,0.6765,0.4946,0.3556,0.3786
2,0.735,0.8002,0.3898,0.575,0.4646,0.2971,0.307
3,0.705,0.7578,0.3729,0.5,0.4272,0.2342,0.2387
4,0.72,0.8167,0.3559,0.5385,0.4286,0.2532,0.2628
Mean,0.732,0.8045,0.4084,0.5703,0.4724,0.3,0.3094
SD,0.0199,0.0374,0.0637,0.0589,0.0449,0.0514,0.0539


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [38]:
# Score every tuned model and collect one accuracy value per model.
model_names = []
accuracy_scores = []

for model in tuned_models:
    # Per-model predictions get their own name so that `predict_df_tuned` only
    # ever refers to the summary table built below.
    tuned_prediction_df = predict_model(model)
    model_names.append(f"{type(model).__name__} tuned")
    # accuracy_score expects (y_true, y_pred): "y" is the ground truth and
    # "Label" the prediction column added by predict_model.
    accuracy_scores.append(
        accuracy_score(tuned_prediction_df["y"], tuned_prediction_df["Label"])
    )

# Summary table, best model first.
predict_df_tuned = (
    pd.DataFrame(data={"model": model_names, "accuracy": accuracy_scores})
    .sort_values(by="accuracy", ascending=False)
    .reset_index(drop=True)
)
predict_df_tuned

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.852,0.9154,0.7063,0.7556,0.7302,0.6284,0.629


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.8232,0.8439,0.4856,0.8161,0.6089,0.504,0.5324


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.8134,0.8121,0.6411,0.6816,0.6607,0.5322,0.5327


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7459,0.8126,0.4242,0.5696,0.4862,0.3222,0.3284


Unnamed: 0,model,accuracy
0,RandomForestClassifier tuned,0.852013
1,KNeighborsClassifier tuned,0.823177
2,DecisionTreeClassifier tuned,0.813384
3,LogisticRegression tuned,0.745919


## Stack the tuned models


In [39]:
# Combine the tuned base models into one stacked ensemble.
stacked_model = stack_models(
    estimator_list=tuned_models,
    # restack=False: the final estimator is trained on the layer-0 predictions
    # only; restack=True would feed it the base features as well.
    restack=False,
    choose_better=False,
    round=6,  # number of decimals in the score grid
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.795,0.890238,0.633333,0.666667,0.649573,0.504831,0.505163
1,0.87,0.954442,0.694915,0.836735,0.759259,0.671261,0.676691
2,0.82,0.891574,0.644068,0.716981,0.678571,0.554069,0.555606
3,0.845,0.892295,0.711864,0.75,0.730435,0.621767,0.622184
4,0.865,0.928237,0.694915,0.82,0.752294,0.660377,0.66465
Mean,0.839,0.911357,0.675819,0.758076,0.714026,0.602461,0.604859
SD,0.028178,0.025853,0.031118,0.063453,0.042874,0.063771,0.065422


In [40]:
# Score the stacked ensemble; the raw predictions get their own name so that
# `predict_df_stacked` only ever refers to the one-row summary table.
stacked_prediction_df = predict_model(stacked_model, round=6)

# accuracy_score expects (y_true, y_pred): "y" is the ground truth and "Label"
# the prediction column added by predict_model. Computed once and reused.
stacked_accuracy = accuracy_score(stacked_prediction_df["y"], stacked_prediction_df["Label"])
print(stacked_accuracy)

predict_df_stacked = pd.DataFrame(
    data={
        "model": [f"{type(stacked_model).__name__} tuned"],
        "accuracy": [stacked_accuracy],
    }
)
print(predict_df_stacked)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.862894,0.920604,0.710173,0.785563,0.745968,0.652404,0.653962


0.8628944504896626
                      model  accuracy
0  StackingClassifier tuned  0.862894


In [41]:
## Make results dataframe

In [42]:
# One leaderboard over the untuned, tuned and stacked models, best accuracy first.
caret_results_df = (
    pd.concat([predict_df, predict_df_tuned, predict_df_stacked])
    .sort_values(by="accuracy", ascending=False)
    .reset_index(drop=True)
)
caret_results_df

Unnamed: 0,model,accuracy
0,RandomForestClassifier,0.862894
1,StackingClassifier tuned,0.862894
2,RandomForestClassifier tuned,0.852013
3,KNeighborsClassifier,0.845484
4,KNeighborsClassifier tuned,0.823177
5,DecisionTreeClassifier tuned,0.813384
6,DecisionTreeClassifier,0.81284
7,LogisticRegression,0.745919
8,LogisticRegression tuned,0.745919
