In [1]:
import pickle
import numpy as np
import pandas as pd
from joblib import dump
from functools import partial
from sklearn.experimental import enable_halving_search_cv
from sklearn.base import clone
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    cross_val_predict,
    RandomizedSearchCV,
    HalvingRandomSearchCV,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    fbeta_score,
    roc_auc_score,
    average_precision_score,
    log_loss,
    confusion_matrix,
    classification_report,
    precision_recall_curve,
    roc_curve,
)
from xgboost import XGBClassifier
import category_encoders as ce
import optuna
from pycaret.classification import load_model, ClassificationExperiment

In [2]:
# Directory prefix (trailing slash included) for the saved pipelines loaded below.
MODEL_REPO: str = 'models/'
# Passed as session_id to every PyCaret setup() call so runs are repeatable.
SEED: int = 1990

### Read Data

In [3]:
# NOTE(security): pickle.load can execute arbitrary code stored in the file.
# Only open classification_data.pkl when it comes from a trusted source.
with open('classification_data.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# List the objects stored in the bundle.
data.keys()

dict_keys(['dataset', 'X_train', 'X_test', 'y_train', 'y_test', 'train', 'test'])

In [4]:
# Train / test frames from the bundle; these are what setup() receives below.
train = data['train']
test = data['test']

In [5]:
# Class balance of the hold-out set, expressed as fractions.
test['target'].value_counts(normalize=True)

0    0.7
1    0.3
Name: target, dtype: float64

### Load ready-to-serve pipelines

In [7]:
# Location of the saved pipeline inside MODEL_REPO (no file extension in the path).
fitted_model_path = f'{MODEL_REPO}modelfit'
# Module-level PyCaret loader: no experiment object is required here.
fitted_model = load_model(fitted_model_path)
fitted_model

Transformation Pipeline and Model Successfully Loaded


In [9]:
# First experiment object; it is configured by the setup() call in the next cell
# and used to re-tune the previously saved pipeline.
e1 = ClassificationExperiment()

In [11]:
# Configure the first experiment on the raw frames.
# - preprocess=False: PyCaret's own preprocessing is switched off here.
# - No target is passed; the recorded run resolved it to the 'target' column.
# - 10-fold stratified CV, seeded with SEED.
e1.setup(
    data=train,
    test_data=test,
    preprocess=False,
    fold_strategy='stratifiedkfold',
    fold=10,
    session_id=SEED,
    experiment_name="custom_model",
)

Unnamed: 0,Description,Value
0,Session id,1990
1,Target,target
2,Target type,Binary
3,Original data shape,"(1000, 8)"
4,Transformed data shape,"(1000, 8)"
5,Transformed train set shape,"(750, 8)"
6,Transformed test set shape,"(250, 8)"
7,Numeric features,5
8,Categorical features,2


<pycaret.classification.oop.ClassificationExperiment at 0x1643a6100>

In [15]:
# Candidate values per hyperparameter, keyed by the bare parameter name.
_clf_search_values = {
    "n_estimators": [100, 150, 200],
    "learning_rate": [0.01, 0.1, 0.5],
    "max_depth": [3, 5, 7],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0.1, 0.3, 0.5],
    "reg_alpha": [0.1, 0.25, 0.75],
    # "reg_lambda": [0.1, 0.25, 0.75],  # currently disabled
}

# The search targets the pipeline step named 'clf', hence the "clf__" prefix
# (scikit-learn's <step>__<param> addressing).
params_grid = {
    f"clf__{param_name}": candidates
    for param_name, candidates in _clf_search_values.items()
}

In [16]:
# Same saved pipeline as fitted_model, this time loaded through the experiment
# object so it can be handed to e1.tune_model().
pretrained_path = f'{MODEL_REPO}modelfit'
pretrained = e1.load_model(pretrained_path)

Transformation Pipeline and Model Successfully Loaded


In [17]:
# Random search with the scikit-learn backend: 20 candidates drawn from params_grid,
# with training-split scores reported alongside.
random_search_options = dict(
    custom_grid=params_grid,
    n_iter=20,
    search_library='scikit-learn',
    search_algorithm='random',
    return_train_score=True,
)
# In the recorded run PyCaret reported that the original model scored better than
# the tuned one, so the original model is what was returned.
tuned1 = e1.tune_model(pretrained, **random_search_options)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.9007,0.9644,0.7723,0.8814,0.8232,0.7546,0.7578
CV-Train,1,0.8993,0.9639,0.7723,0.8764,0.8211,0.7513,0.7542
CV-Train,2,0.9007,0.9679,0.7772,0.8771,0.8241,0.7554,0.758
CV-Train,3,0.9007,0.9682,0.7772,0.8771,0.8241,0.7554,0.758
CV-Train,4,0.8948,0.9656,0.7624,0.8701,0.8127,0.74,0.7431
CV-Train,5,0.9007,0.9658,0.7833,0.8736,0.826,0.7568,0.759
CV-Train,6,0.8919,0.9676,0.7635,0.8611,0.8094,0.7343,0.7369
CV-Train,7,0.9037,0.9676,0.7931,0.875,0.832,0.7648,0.7666
CV-Train,8,0.9037,0.9671,0.7734,0.892,0.8285,0.762,0.7658
CV-Train,9,0.9007,0.9644,0.7783,0.8778,0.8251,0.7561,0.7588


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [18]:
# Same search space and budget as the random search, but driven by Optuna's
# TPE sampler.
optuna_search_options = dict(
    custom_grid=params_grid,
    n_iter=20,
    search_library='optuna',
    search_algorithm='tpe',
    return_train_score=True,
)
# In the recorded run PyCaret again reported that the original model scored
# better and returned it instead of the tuned candidate.
tuned2 = e1.tune_model(pretrained, **optuna_search_options)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.96,0.9945,0.8762,0.9888,0.9291,0.9014,0.9046
CV-Train,1,0.9467,0.9921,0.8416,0.977,0.9043,0.8676,0.8722
CV-Train,2,0.9659,0.9951,0.9109,0.9735,0.9412,0.9172,0.9182
CV-Train,3,0.9733,0.9958,0.9307,0.9792,0.9543,0.9355,0.9361
CV-Train,4,0.9422,0.9892,0.8416,0.9605,0.8971,0.8572,0.8608
CV-Train,5,0.957,0.9934,0.8719,0.9833,0.9243,0.8944,0.8976
CV-Train,6,0.9526,0.9914,0.8768,0.9622,0.9175,0.8844,0.8862
CV-Train,7,0.9556,0.9922,0.8867,0.9626,0.9231,0.8919,0.8934
CV-Train,8,0.9496,0.9936,0.8621,0.9669,0.9115,0.8764,0.8792
CV-Train,9,0.9704,0.9957,0.9163,0.9841,0.949,0.9281,0.9293


[32m[I 2023-04-23 06:27:55,462][0m Searching the best hyperparameters using 750 samples...[0m
[32m[I 2023-04-23 06:28:54,262][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [51]:
# Inspect the hyperparameters of the 'clf' step of the pipeline returned by the
# Optuna run (this is the object tune_model handed back, see the message above).
tuned2.named_steps['clf'].get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.6,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.01,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 3,
 'max_leaves': None,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 200,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': 0.1,
 'reg_lambda': 0.1,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.6,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [24]:
# Display the estimator stored in the 'clf' step of the returned pipeline.
tuned2.named_steps['clf']

### Load configured preprocessors

In [21]:
# Take the 'prp' (preprocessing) step out of the loaded pipeline so it can be
# reused as a custom pipeline step in the second experiment.
# NOTE(review): presumably this step is already fitted, since it comes from a
# saved pipeline — confirm before relying on it.
fitted_prepro = fitted_model.named_steps['prp']
fitted_prepro

In [22]:
# Second experiment object, used to compare and ensemble models on top of the
# reused preprocessor.
# Fix: the class name had been mangled to `Classificatione2eriment` (a stray
# "exp" -> "e2" rename), which raises NameError; the imported class is
# ClassificationExperiment.
e2 = ClassificationExperiment()

In [25]:
# Configure the second experiment: built-in preprocessing stays off and the
# preprocessor taken from the saved pipeline is plugged in as a custom step.
# In the recorded run this changed the data from 8 to 6 columns.
# Fix: the keyword was mangled to `e2eriment_name` (a stray "exp" -> "e2" rename);
# the parameter is `experiment_name`, as in the first experiment's setup.
e2.setup(
    data=train,
    test_data=test,
    preprocess=False,
    custom_pipeline=[fitted_prepro],
    fold_strategy='stratifiedkfold',
    fold=10,
    session_id=SEED,
    experiment_name="custom_prepro",
)

Unnamed: 0,Description,Value
0,Session id,1990
1,Target,target
2,Target type,Binary
3,Original data shape,"(1000, 8)"
4,Transformed data shape,"(1000, 6)"
5,Transformed train set shape,"(750, 6)"
6,Transformed test set shape,"(250, 6)"
7,Numeric features,5
8,Categorical features,2


<pycaret.classification.oop.ClassificationExperiment at 0x28586c1c0>

In [26]:
# Fetch the model catalogue once and reuse it for both the count and the
# display (the original called e2.models() twice).
available_models = e2.models()
# One slot per available model, so compare_models() can return all of them.
n_select = available_models.shape[0]
print(f'{n_select=}')
available_models

n_select=18


Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [27]:
# Cross-validate the available models and keep up to n_select of them (one slot
# per catalogue entry). The result is a list of estimators in leaderboard order,
# as the recorded display of `modelboard` shows.
modelboard = e2.compare_models(n_select=n_select)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.88,0.9399,0.7779,0.8208,0.7961,0.7113,0.7141,0.08
et,Extra Trees Classifier,0.88,0.9448,0.7686,0.825,0.7936,0.7093,0.712,0.079
knn,K Neighbors Classifier,0.868,0.9381,0.7419,0.8079,0.772,0.6793,0.6818,0.076
gbc,Gradient Boosting Classifier,0.8667,0.9411,0.733,0.8092,0.7675,0.6744,0.6774,0.073
lightgbm,Light Gradient Boosting Machine,0.8667,0.9396,0.7597,0.7952,0.7738,0.6795,0.6827,0.075
xgboost,Extreme Gradient Boosting,0.8627,0.9375,0.7595,0.7875,0.7699,0.6723,0.6754,0.072
ada,Ada Boost Classifier,0.852,0.9294,0.7065,0.785,0.7416,0.6384,0.6418,0.076
dt,Decision Tree Classifier,0.8333,0.8024,0.7247,0.727,0.7229,0.6041,0.6066,0.068
lr,Logistic Regression,0.8227,0.9077,0.6441,0.7382,0.6854,0.5629,0.5674,0.229
lda,Linear Discriminant Analysis,0.8227,0.9068,0.6658,0.7214,0.6915,0.5674,0.5691,0.07


In [56]:
# Show the list of estimators returned by compare_models().
modelboard

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='sqrt',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        n_estimators=100, n_jobs=-1, oob_score=False,
                        random_state=1990, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='sqrt',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=1990, verbose=0, warm_start=False),
 KN

In [57]:
# Grab the leaderboard as a DataFrame.
# NOTE(review): pull() appears to return the most recently displayed score grid,
# so this must run right after compare_models() with no other PyCaret call in
# between — confirm when re-running the notebook top to bottom.
modelboarddf = e2.pull()
modelboarddf

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.88,0.9399,0.7779,0.8208,0.7961,0.7113,0.7141,0.058
et,Extra Trees Classifier,0.88,0.9448,0.7686,0.825,0.7936,0.7093,0.712,0.059
knn,K Neighbors Classifier,0.868,0.9381,0.7419,0.8079,0.772,0.6793,0.6818,0.045
gbc,Gradient Boosting Classifier,0.8667,0.9411,0.733,0.8092,0.7675,0.6744,0.6774,0.051
lightgbm,Light Gradient Boosting Machine,0.8667,0.9396,0.7597,0.7952,0.7738,0.6795,0.6827,0.053
xgboost,Extreme Gradient Boosting,0.8627,0.9375,0.7595,0.7875,0.7699,0.6723,0.6754,0.049
ada,Ada Boost Classifier,0.852,0.9294,0.7065,0.785,0.7416,0.6384,0.6418,0.055
dt,Decision Tree Classifier,0.8333,0.8024,0.7247,0.727,0.7229,0.6041,0.6066,0.046
lr,Logistic Regression,0.8227,0.9077,0.6441,0.7382,0.6854,0.5629,0.5674,0.183
lda,Linear Discriminant Analysis,0.8227,0.9068,0.6658,0.7214,0.6915,0.5674,0.5691,0.044


In [58]:
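# TODO (optional): ensemble each tuned model. `tuned_models` is only defined in the
# following cell, so move this line below that cell before re-enabling it.
# ensembled_models = [e2.ensemble_model(m, choose_better=True) for m in tuned_models]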

In [61]:
# Tuner with the search settings fixed: Optuna backend with the TPE sampler.
tune_with_optuna = partial(
    e2.tune_model,
    search_library='optuna',
    search_algorithm='tpe',
)

# Tune the first three leaderboard entries. In the recorded run PyCaret fell
# back to the original estimator for some of them when tuning did not help.
top_candidates = modelboard[:3]
tuned_models = [tune_with_optuna(candidate) for candidate in top_candidates]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9067,0.9849,0.7391,0.9444,0.8293,0.7664,0.7773
1,0.9067,0.9691,0.8261,0.8636,0.8444,0.7778,0.7782
2,0.84,0.9256,0.6957,0.7619,0.7273,0.6144,0.6157
3,0.88,0.9331,0.7391,0.85,0.7907,0.7072,0.7106
4,0.8533,0.9172,0.7391,0.7727,0.7556,0.6509,0.6512
5,0.8933,0.9494,0.6818,0.9375,0.7895,0.7204,0.7368
6,0.8667,0.9485,0.8182,0.75,0.7826,0.6867,0.6881
7,0.8667,0.934,0.7273,0.8,0.7619,0.6696,0.6711
8,0.84,0.9211,0.6364,0.7778,0.7,0.5924,0.5979
9,0.8933,0.9708,0.7727,0.85,0.8095,0.7357,0.7373


[32m[I 2023-04-22 03:38:21,678][0m Searching the best hyperparameters using 750 samples...[0m
[32m[I 2023-04-22 03:38:46,912][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8933,0.964,0.6957,0.9412,0.8,0.7295,0.745
1,0.8933,0.954,0.7826,0.8571,0.8182,0.7429,0.7445
2,0.88,0.9289,0.7391,0.85,0.7907,0.7072,0.7106
3,0.8667,0.944,0.5652,1.0,0.7222,0.6432,0.6885
4,0.84,0.9189,0.6087,0.8235,0.7,0.5942,0.6069
5,0.8533,0.9434,0.5,1.0,0.6667,0.5856,0.6435
6,0.8267,0.9383,0.5455,0.8,0.6486,0.539,0.5564
7,0.84,0.9288,0.5455,0.8571,0.6667,0.5681,0.5933
8,0.8133,0.8791,0.5909,0.7222,0.65,0.5245,0.5294
9,0.88,0.9528,0.6818,0.8824,0.7692,0.6899,0.7004


[32m[I 2023-04-22 03:38:49,475][0m Searching the best hyperparameters using 750 samples...[0m
[32m[I 2023-04-22 03:39:21,889][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.92,0.9875,0.7826,0.9474,0.8571,0.8023,0.8093
1,0.8933,0.9653,0.7826,0.8571,0.8182,0.7429,0.7445
2,0.8533,0.9339,0.6957,0.8,0.7442,0.6421,0.6452
3,0.92,0.9607,0.7391,1.0,0.85,0.7971,0.814
4,0.8667,0.9235,0.6087,0.9333,0.7368,0.6528,0.6795
5,0.8933,0.9563,0.6818,0.9375,0.7895,0.7204,0.7368
6,0.8667,0.94,0.5909,0.9286,0.7222,0.6401,0.6684
7,0.88,0.9348,0.6364,0.9333,0.7568,0.6809,0.7028
8,0.8533,0.9117,0.6818,0.7895,0.7317,0.6315,0.6347
9,0.9067,0.9708,0.7727,0.8947,0.8293,0.7655,0.7694


[32m[I 2023-04-22 03:39:24,723][0m Searching the best hyperparameters using 750 samples...[0m
[32m[I 2023-04-22 03:39:35,248][0m Finished hyperparemeter search![0m


In [62]:
# Blend the tuned models into a single ensemble and also report scores on the
# training split of each fold.
# NOTE(review): the recorded CV-Train metrics are all 1.0, which may indicate
# overfitting — compare them against the validation-fold scores before trusting
# this model. `choose_better` is left disabled below.
blender = e2.blend_models(tuned_models,
#                            choose_better=True,
                           return_train_score=True
                          )

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [63]:
# Display the blended ensemble.
blender

In [64]:
# Stack the tuned models and also report scores on the training split of each
# fold.
# NOTE(review): recorded CV-Train metrics are close to 1.0 — compare them
# against the validation-fold scores to check for overfitting. `choose_better`
# is left disabled below.
stacker = e2.stack_models(tuned_models,
#                            choose_better=True,
                           return_train_score=True
                          )

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.9837,0.9993,0.9604,0.9848,0.9724,0.9609,0.961
CV-Train,1,0.9852,0.9993,0.9703,0.98,0.9751,0.9646,0.9646
CV-Train,2,0.9822,0.9985,0.9604,0.9798,0.97,0.9574,0.9575
CV-Train,3,0.997,1.0,0.9901,1.0,0.995,0.9929,0.9929
CV-Train,4,0.9941,0.9998,0.9851,0.995,0.99,0.9858,0.9859
CV-Train,5,0.9911,0.9998,0.9803,0.99,0.9851,0.9788,0.9788
CV-Train,6,0.9941,0.9998,0.9852,0.995,0.9901,0.9859,0.9859
CV-Train,7,0.9926,0.9998,0.9803,0.995,0.9876,0.9823,0.9824
CV-Train,8,0.9941,0.9998,0.9852,0.995,0.9901,0.9859,0.9859
CV-Train,9,0.9881,0.9995,0.9754,0.9851,0.9802,0.9717,0.9718


In [65]:
# Display the stacked ensemble.
stacker

In [66]:
# pd.DataFrame(fitted_prepro.transform(X_train))
# e2.pipeline.transform(X_train)
# fitted_prepro.output_indices_
# e2.pipeline.named_steps['custom_step_0'].transformer.output_indices_

In [67]:
# End-to-end inference pipeline: the experiment's preprocessing pipeline
# followed by the stacked ensemble as the final estimator.
inference_steps = [
    ('prepro', e2.pipeline),
    ('clf', stacker),
]
pycaret_model = Pipeline(inference_steps)

In [68]:
# Display the assembled preprocessing + stacker pipeline.
pycaret_model

In [69]:
# Smoke-test the assembled pipeline on the training features.
# Fix: `X_train` was never bound to a variable in this notebook (only `train`
# and `test` are unpacked), so a fresh kernel raised NameError. The features are
# read from the loaded bundle instead, which has an 'X_train' entry.
pycaret_model.predict(data['X_train'])

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,

### Deploy with PyCaret

In [None]:
# Generate a serving script for the pipeline under the name
# 'classification_service'; the next cell runs classification_service.py.
e2.create_api(pycaret_model, 'classification_service')

In [None]:
# Run the generated service script in a shell.
# NOTE(review): this presumably starts a server and blocks the kernel until it
# is interrupted — confirm before including it in a Run All.
!python classification_service.py

In [None]:
# Write the Docker artefacts for the API created above.
# Fix: create_docker() takes the API name (the one given to create_api) as its
# first argument, not the estimator. The original call passed the pipeline as
# the API name and shifted 'classification_service' into the base-image slot.
e2.create_docker('classification_service')