In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomTreesEmbedding, StackingClassifier
from sklearn.model_selection import cross_val_predict

from constants import *
import numpy as np
from shutil import make_archive, unpack_archive
import time
from time import perf_counter
import warnings

from src.calc_scores import get_X_train_X_test_y_train_y_test
from pycaret.classification import *
from sklearn.metrics import accuracy_score


In [18]:
dataset_id = "3"  # other dataset ids that were tried here: "40923", "1489"

# Load the train/test split for the chosen dataset folder.
X_train, X_test, y_train, y_test = get_X_train_X_test_y_train_y_test(
    dataset_folder=DATASETS_FOLDER_PATH.joinpath(dataset_id),
    random_state=RANDOM_STATE,
    X_file_name=X_FILTERED_FILE_NAME,
    y_file_name=Y_FILE_NAME,
)

# Short feedback about the data and the class balance.
print(f"X_train shape: {X_train.shape}")
print(f"target classes: \n{y_train.value_counts()}")
print(f"total {len(y_train.value_counts())} classes\n")

# Cap the number of training rows so the experiment stays fast on big datasets.
sample_size = 10_000

if len(X_train) > sample_size:
    warnings.warn(
        f"Sample is used of {sample_size}")
    X_train_sample = X_train.sample(n=sample_size, random_state=RANDOM_STATE)
else:
    X_train_sample = X_train

# Captured before the target column is attached, so it lists features only.
feature_names = list(X_train.columns)

# pycaret wants the target in a dataframe column.
# Fix: the training frame is now built from X_train_sample. Previously the
# sample was computed (and announced via the warning) but never used.
# `.assign` aligns the target on the row index, so a sampled frame picks up
# exactly the labels of its own rows, and it returns a new frame instead of
# mutating the loaded one in place.
# NOTE: y_train itself is left untouched and still holds all training labels.
X_train = X_train_sample.assign(y=y_train)
X_test = X_test.assign(y=y_test)

X_train shape: (2109, 60)
target classes: 
1    1097
0    1012
Name: y, dtype: int64
total 2 classes



In [19]:
# Display pycaret's table of available classifiers; the "ID" column holds the
# short names (e.g. "lr", "rf") that are used to select models by name.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,pycaret.internal.tunable.TunableMLPClassifier,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [20]:
# Initialise the pycaret experiment on the already-split train and test frames.
# Options that were tried earlier and are currently not set:
# data_split_shuffle=False, n_jobs=-1.
experiment = setup(
    # --- data ---
    data=X_train,
    test_data=X_test,
    target="y",
    # Force all features to be regarded as numeric, not categorical; this
    # avoids problems with xgboost and lightgbm.
    numeric_features=feature_names,
    preprocess=False,
    # --- cross-validation ---
    fold=5,
    # fold_shuffle must be enabled. Without it the run fails with
    # "ValueError: Setting a random_state has no effect since shuffle is False.
    # You should leave random_state to its default (None), or set shuffle=True."
    # (possibly because pycaret is not adjusted to sklearn version 1). See
    # https://stackoverflow.com/questions/67728802/valueerror-setting-a-random-state-has-no-effect-since-shuffle-is-false-you-sho
    fold_shuffle=True,
    # --- reproducibility ---
    # NOTE(review): an earlier attempt with session_id=RANDOM_STATE was noted
    # as "maybe a bug ... ERROR"; the literal 42 is used instead. Confirm
    # whether RANDOM_STATE can be used here again.
    session_id=42,
)


Unnamed: 0,Description,Value
0,session_id,42
1,Target,y
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(2109, 61)"
5,Missing Values,False
6,Numeric Features,60
7,Categorical Features,0
8,Transformed Train Set,"(2109, 60)"
9,Transformed Test Set,"(1087, 60)"


## Train Models

In [21]:
# pycaret model IDs to benchmark ("mlp" is currently disabled).
model_types_to_use = ["rf", "lr", "knn", "dt", "xgboost", "lightgbm", "catboost"]

# Train and rank all listed models. n_select equals the number of candidates,
# so every trained model is returned rather than only the best one.
# errors="raise" is passed so that a failing model is not silently skipped.
selected_models = compare_models(
    include=model_types_to_use,
    n_select=len(model_types_to_use),
    errors="raise",
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9839,0.9989,0.9845,0.9846,0.9845,0.9677,0.9678,0.03
catboost,CatBoost Classifier,0.9834,0.9988,0.9827,0.9855,0.984,0.9668,0.9669,0.74
rf,Random Forest Classifier,0.982,0.9961,0.9818,0.9837,0.9827,0.9639,0.9641,0.736
xgboost,Extreme Gradient Boosting,0.982,0.9989,0.9818,0.9837,0.9827,0.9639,0.964,0.102
dt,Decision Tree Classifier,0.9806,0.9857,0.9763,0.9863,0.9812,0.9611,0.9612,0.328
lr,Logistic Regression,0.9592,0.9927,0.959,0.9628,0.9607,0.9183,0.9186,0.396
knn,K Neighbors Classifier,0.9298,0.972,0.9353,0.9302,0.9327,0.8593,0.8594,0.41


In [28]:
# Score every untuned model and collect one accuracy value per model.
model_names = []
accuracy_scores = []

for model in selected_models:
    model_names.append(model.__class__.__name__)
    # predict_model() is called without explicit data; the returned frame
    # carries the predicted class in "Label" next to the true target in "y".
    prediction_df = predict_model(model)
    # accuracy_score's signature is (y_true, y_pred): ground truth first.
    # The arguments used to be swapped, which is harmless for accuracy but
    # would silently give wrong results for any non-symmetric metric.
    accuracy_scores.append(accuracy_score(prediction_df["y"], prediction_df["Label"]))

# Summary table, best model first.
predict_df = (
    pd.DataFrame(data={"model": model_names, "accuracy": accuracy_scores})
    .sort_values(by="accuracy", ascending=False)
    .reset_index(drop=True)
)
predict_df

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9788,0.9989,0.9843,0.9757,0.98,0.9575,0.9576


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.9779,0.9989,0.9843,0.974,0.9791,0.9557,0.9558


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9761,0.9984,0.9825,0.9723,0.9774,0.952,0.9521


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9788,0.9989,0.9843,0.9757,0.98,0.9575,0.9576


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.9779,0.98,0.9843,0.974,0.9791,0.9557,0.9558


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9678,0.995,0.9755,0.9637,0.9696,0.9354,0.9355


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.9393,0.9863,0.9493,0.9362,0.9427,0.8781,0.8782


Unnamed: 0,model,accuracy
0,LGBMClassifier,0.978841
1,XGBClassifier,0.978841
2,CatBoostClassifier,0.977921
3,DecisionTreeClassifier,0.977921
4,RandomForestClassifier,0.976081
5,LogisticRegression,0.967801
6,KNeighborsClassifier,0.939282


## Tune Models

In [23]:
# Hyper-parameter search for each selected model (25 iterations per model; an
# earlier variant used n_iter=100). choose_better=True is passed so that a
# tuned model which does not beat its untuned original is not kept.
tuned_models = []
for model in selected_models:
    tuned_model = tune_model(model, n_iter=25, choose_better=True)
    # Show the resulting estimator with its final hyper-parameters.
    print(tuned_model)
    tuned_models.append(tuned_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9645,0.9913,0.9635,0.9679,0.9657,0.9288,0.9288
1,0.9336,0.9766,0.9498,0.9244,0.9369,0.867,0.8673
2,0.9526,0.9805,0.9727,0.9386,0.9554,0.9049,0.9055
3,0.9479,0.9844,0.95,0.95,0.95,0.8955,0.8955
4,0.9264,0.9794,0.9224,0.9352,0.9287,0.8526,0.8527
Mean,0.945,0.9825,0.9517,0.9432,0.9473,0.8898,0.89
SD,0.0136,0.0051,0.017,0.0148,0.0131,0.0272,0.0272


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=-1, n_neighbors=18, p=2,
                     weights='distance')


In [29]:
# Score every tuned model and collect one accuracy value per model.
model_names = []
accuracy_scores = []

for model in tuned_models:
    model_names.append(f"{model.__class__.__name__} tuned")
    # Per-model predictions get their own name. Previously `predict_df_tuned`
    # was used both for this frame and for the final summary table.
    tuned_prediction_df = predict_model(model)
    # accuracy_score's signature is (y_true, y_pred): ground truth "y" first,
    # predicted "Label" second.
    accuracy_scores.append(
        accuracy_score(tuned_prediction_df["y"], tuned_prediction_df["Label"])
    )

# Summary table, best model first.
predict_df_tuned = (
    pd.DataFrame(data={"model": model_names, "accuracy": accuracy_scores})
    .sort_values(by="accuracy", ascending=False)
    .reset_index(drop=True)
)
predict_df_tuned

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9788,0.9989,0.9843,0.9757,0.98,0.9575,0.9576


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.9788,0.9989,0.9843,0.9757,0.98,0.9575,0.9576


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9761,0.9984,0.9825,0.9723,0.9774,0.952,0.9521


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9807,0.9985,0.9983,0.9662,0.9819,0.9612,0.9618


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.9798,0.993,0.986,0.9758,0.9809,0.9594,0.9594


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9678,0.9953,0.972,0.967,0.9695,0.9354,0.9354


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.9595,0.977,0.9738,0.9505,0.962,0.9187,0.919


Unnamed: 0,model,accuracy
0,XGBClassifier tuned,0.980681
1,DecisionTreeClassifier tuned,0.979761
2,LGBMClassifier tuned,0.978841
3,CatBoostClassifier tuned,0.978841
4,RandomForestClassifier tuned,0.976081
5,LogisticRegression tuned,0.967801
6,KNeighborsClassifier tuned,0.959522


In [31]:
# Merge the untuned and tuned accuracy tables into one ranking, best first.
caret_results_df = (
    pd.concat([predict_df, predict_df_tuned])
    .sort_values(by="accuracy", ascending=False)
    .reset_index(drop=True)
)
caret_results_df

Unnamed: 0,model,accuracy
0,XGBClassifier tuned,0.980681
1,DecisionTreeClassifier tuned,0.979761
2,LGBMClassifier,0.978841
3,XGBClassifier,0.978841
4,LGBMClassifier tuned,0.978841
5,CatBoostClassifier tuned,0.978841
6,CatBoostClassifier,0.977921
7,DecisionTreeClassifier,0.977921
8,RandomForestClassifier,0.976081
9,RandomForestClassifier tuned,0.976081
