#### Packages 

In [1]:
import pandas as pd
from pycaret.classification import *
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import mlflow
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import average_precision_score
from sklearn.utils import shuffle
from model_utils import *

In [7]:
mlflow.set_tracking_uri("http://localhost:5000")

#### Install Optuna and Scikit optmize

In [2]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
     ---------------------------------------- 0.0/365.3 kB ? eta -:--:--
     ------ -------------------------------- 61.4/365.3 kB 1.1 MB/s eta 0:00:01
     ---------------------- --------------- 215.0/365.3 kB 1.9 MB/s eta 0:00:01
     ----------------------------- -------- 286.7/365.3 kB 2.0 MB/s eta 0:00:01
     ---------------------------------- --- 327.7/365.3 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 365.3/365.3 kB 1.6 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Installing collected packages: colorlog, cmaes, optuna
Successfully installed cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0
Note: you may need to restart the kernel to use updated packages.




In [3]:
%pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
     ---------------------------------------- 0.0/100.3 kB ? eta -:--:--
     ----------- --------------------------- 30.7/100.3 kB 1.3 MB/s eta 0:00:01
     ----------------------------------- --- 92.2/100.3 kB 1.1 MB/s eta 0:00:01
     ------------------------------------ 100.3/100.3 kB 957.7 kB/s eta 0:00:00
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
df_train, df_test = get_raw_data()

In [8]:
clf1 = setup(data = df_train, 
             target = 'target',
             numeric_features=df_train.columns[0:-1].to_list(),
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'selected_model',
             fix_imbalance = True, 
             transformation = True, 
             polynomial_features = True,
             feature_selection = True, feature_selection_threshold = 0.5,
             remove_multicollinearity = True, multicollinearity_threshold = 0.6,
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC", 
                      include=["lightgbm", "et", "rf", "lr", "gbc"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
et,Extra Trees Classifier,0.9815,0.9823,0.885,0.9868,0.9319,0.9212,0.9238,0.9546,0.37
lr,Logistic Regression,0.963,0.9789,0.916,0.8465,0.8778,0.8561,0.8584,0.9536,0.053
rf,Random Forest Classifier,0.9783,0.9809,0.8893,0.9595,0.9213,0.9088,0.9108,0.9529,0.587
lightgbm,Light Gradient Boosting Machine,0.9757,0.9813,0.8939,0.937,0.9139,0.8998,0.9008,0.9526,1.724
gbc,Gradient Boosting Classifier,0.9681,0.9777,0.898,0.8862,0.8907,0.872,0.873,0.95,1.314


In [9]:
clf1

(False,
 235289    0
 176588    0
 42590     1
 264339    0
 133853    0
          ..
 129869    0
 253807    0
 38520     0
 162271    0
 217349    0
 Name: target, Length: 672, dtype: int64,
 'target',
 True,
 [LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                 importance_type='split', learning_rate=0.1, max_depth=-1,
                 min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                 n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                 random_state=3341, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
                 subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min

In [10]:
best

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=3341, verbose=0,
                     warm_start=False)

In [11]:
results = predict_model(best, raw_score=True,data = df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
0,Extra Trees Classifier,0.9755,0.983,0.8772,0.9615,0.9174,0.9031,0.9044,0.9554


#### Hyperparameter tuning
Define search space for hyper parameter tuning

In [12]:
params = {"max_depth": np.random.randint(1, 40, 10),
          "n_estimators": np.random.randint(2, 1000, 10)}
          
# tune model
tuned_dt = tune_model(best, custom_grid = params)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9809,0.9925,0.913,0.9545,0.9333,0.9222,0.9225,0.9769
1,0.9682,0.9815,0.7826,1.0,0.878,0.86,0.8686,0.9405
2,0.9745,0.9731,0.8261,1.0,0.9048,0.8902,0.8956,0.9223
3,0.9809,0.9942,0.913,0.9545,0.9333,0.9222,0.9225,0.9769
4,0.9809,0.9609,0.8696,1.0,0.9302,0.9192,0.9222,0.9361
5,0.9936,0.978,0.9545,1.0,0.9767,0.973,0.9734,0.9659
6,0.9872,0.9746,0.9091,1.0,0.9524,0.945,0.9464,0.9422
7,0.9808,0.9863,0.8636,1.0,0.9268,0.9158,0.9191,0.9554
8,0.9936,0.9986,0.9545,1.0,0.9767,0.973,0.9734,0.993
9,0.9808,0.9937,0.8636,1.0,0.9268,0.9158,0.9191,0.9684


In [13]:
tuned_dt

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=22, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=343, n_jobs=-1,
                     oob_score=False, random_state=3341, verbose=0,
                     warm_start=False)

In [14]:
results = predict_model(tuned_dt, data=df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
0,Extra Trees Classifier,0.9755,0.9805,0.8772,0.9615,0.9174,0.9031,0.9044,0.9565


#### Voting classifier

In [15]:
tops= compare_models(n_select = 3, sort="APC")
tops

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
et,Extra Trees Classifier,0.9815,0.9823,0.885,0.9868,0.9319,0.9212,0.9238,0.9546,0.327
lr,Logistic Regression,0.963,0.9789,0.916,0.8465,0.8778,0.8561,0.8584,0.9536,0.053
rf,Random Forest Classifier,0.9783,0.9809,0.8893,0.9595,0.9213,0.9088,0.9108,0.9529,0.52
lightgbm,Light Gradient Boosting Machine,0.9757,0.9813,0.8939,0.937,0.9139,0.8998,0.9008,0.9526,0.249
lda,Linear Discriminant Analysis,0.9713,0.9783,0.9028,0.9028,0.9014,0.8847,0.8855,0.9521,0.041
gbc,Gradient Boosting Classifier,0.9681,0.9777,0.898,0.8862,0.8907,0.872,0.873,0.95,1.429
ada,Ada Boost Classifier,0.9611,0.9659,0.8937,0.8492,0.8696,0.8467,0.848,0.9395,0.426
qda,Quadratic Discriminant Analysis,0.9725,0.9533,0.8759,0.9333,0.9014,0.8855,0.8876,0.9209,0.032
knn,K Neighbors Classifier,0.9546,0.9648,0.9028,0.8121,0.8524,0.8258,0.8293,0.8996,0.082
nb,Naive Bayes,0.9642,0.9766,0.8672,0.8891,0.8751,0.8543,0.8563,0.8846,0.03


[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=3341, verbose=0,
                      warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=3341, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=No

In [16]:
tops[0]

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=3341, verbose=0,
                     warm_start=False)

In [17]:
blender_weighted = blend_models([tops[0],tops[1],tops[2]], weights = [0.5,0.3,0.2])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9873,0.9922,0.9565,0.9565,0.9565,0.9491,0.9491,0.9764
1,0.9682,0.9809,0.8261,0.95,0.8837,0.8654,0.8682,0.9355
2,0.9745,0.9718,0.8261,1.0,0.9048,0.8902,0.8956,0.9213
3,0.9618,0.9938,0.913,0.84,0.875,0.8525,0.8535,0.9743
4,0.9745,0.9513,0.8696,0.9524,0.9091,0.8943,0.8956,0.9232
5,0.9936,0.981,0.9545,1.0,0.9767,0.973,0.9734,0.9674
6,0.9872,0.98,0.9091,1.0,0.9524,0.945,0.9464,0.9519
7,0.9808,0.9881,0.9091,0.9524,0.9302,0.9191,0.9194,0.9632
8,0.9872,0.9986,0.9545,0.9545,0.9545,0.9471,0.9471,0.993
9,0.9679,0.9922,0.8636,0.9048,0.8837,0.8651,0.8655,0.9652


In [18]:
tops= compare_models(n_select = 2)
blender = blend_models(tops)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9873,0.9929,0.9565,0.9565,0.9565,0.9491,0.9491,0.9769
1,0.9618,0.975,0.7826,0.9474,0.8571,0.8353,0.8404,0.9187
2,0.9745,0.9726,0.8261,1.0,0.9048,0.8902,0.8956,0.9231
3,0.9745,0.994,0.913,0.913,0.913,0.8981,0.8981,0.9743
4,0.9809,0.9533,0.8696,1.0,0.9302,0.9192,0.9222,0.9301
5,0.9872,0.9749,0.9545,0.9545,0.9545,0.9471,0.9471,0.9643
6,0.9872,0.9769,0.9091,1.0,0.9524,0.945,0.9464,0.9442
7,0.9808,0.9881,0.8636,1.0,0.9268,0.9158,0.9191,0.9586
8,0.9936,0.9973,0.9545,1.0,0.9767,0.973,0.9734,0.9879
9,0.9808,0.9922,0.8636,1.0,0.9268,0.9158,0.9191,0.9645


#### Hyperparameter tuning with scikit-optimize on voting model