In [43]:
import json
import pickle
import numpy as np
import pandas as pd
from joblib import dump
from functools import partial

from sklearn.experimental import enable_halving_search_cv
from sklearn.base import clone
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    cross_val_predict,
    RandomizedSearchCV,
    HalvingRandomSearchCV,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    fbeta_score,
    roc_auc_score,
    average_precision_score,
    log_loss,
    confusion_matrix,
    classification_report,
    precision_recall_curve,
    roc_curve,
)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import category_encoders as ce

import plotly.graph_objects as go
import plotly
plotly.offline.init_notebook_mode(connected=True)

from mlutils.classification import (
    classifier_confusion_matrix,
    classifier_param_grid,
    classifier_metrics,
    classifier_report,
    classifier_benchmark,
    classifier_plots,
    classifier_plots_int,
    classifier_gain_lift,
    classifier_gain_lift_plot,
    classifier_thresholds
)

import mlflow
import mlflow.sklearn

In [2]:
SEED = 1990
MODEL_REPO = 'models/'

### Read Data

In [3]:
with open('classification_data.pkl', 'rb') as f:
    data = pickle.load(f)
data.keys()

dict_keys(['dataset', 'X_train', 'X_test', 'y_train', 'y_test', 'train', 'test'])

In [4]:
X_train, X_test, y_train, y_test = data['X_train'], data['X_test'], data['y_train'], data['y_test']

In [5]:
y_train.value_counts(normalize=True)

0    0.7
1    0.3
Name: target, dtype: float64

### Define Pipeline

In [6]:
pipen = Pipeline([("numeric", StandardScaler())])

In [7]:
pipec = Pipeline([("categorical", ce.CatBoostEncoder())])

In [8]:
prepro = ColumnTransformer(
    [
        ("num", pipen, make_column_selector(dtype_include=np.number)),
        ("cat", pipec, make_column_selector(dtype_include=object)),
    ]
)

### Add model to the Pipeline

In [9]:
model = Pipeline([("prp", prepro), ("clf", XGBClassifier())])

### Prepare mlflow logging

In [11]:
mlflow.set_tracking_uri("http://localhost:5001")
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))

Current tracking uri: http://localhost:5001


In [12]:
experiment_name = "custom-logging"

In [13]:
mlflow.set_experiment(experiment_name)

2023/04/25 01:37:15 INFO mlflow.tracking.fluent: Experiment with name 'custom-logging' does not exist. Creating a new experiment.


<Experiment: artifact_location='http://minioaccess:miniosecret@minio:9000/artifacts/1', creation_time=1682375836005, experiment_id='1', last_update_time=1682375836005, lifecycle_stage='active', name='custom-logging', tags={}>

### cross_val_score

In [15]:
cvscore1 = cross_val_score(model, X_train, y_train, scoring="f1", cv=3)
cvscore1

array([0.8       , 0.72611465, 0.78082192])

In [17]:
cvscore1.mean()

0.768978855829916

In [18]:
with mlflow.start_run(run_name="cross_val_score"):
    mlflow.log_metric("f1", cvscore1.mean())

In [10]:
models = [
    LogisticRegression()
    ,RandomForestClassifier()
    ,KNeighborsClassifier()
    ,AdaBoostClassifier()
    ,GradientBoostingClassifier()
    ,XGBClassifier()
    ,LGBMClassifier()
]

In [21]:
cross_val_results = {}

for model in models:

    tempname = model.__class__.__name__
    print(f'{tempname}')
    
    temppipe = Pipeline([("prp", prepro), (tempname, model)])
    tempresults = cross_val_score(temppipe, X_train, y_train, scoring="f1", cv=3)
    crossvalresults[tempname] = tempresults
    print(f'{np.mean(tempresults):.2f} | {np.std(tempresults):.3f} | {tempresults}')
    
    dump(temppipe,f'{MODEL_REPO}pipe_{tempname}.pkl')
    print(f'Saved model {MODEL_REPO}pipe_{tempname}.pkl')

LogisticRegression
0.69 | 0.034 | [0.73103448 0.67647059 0.64788732]
Saved model models/pipe_LogisticRegression.pkl
RandomForestClassifier
0.78 | 0.027 | [0.8115942  0.74666667 0.77241379]
Saved model models/pipe_RandomForestClassifier.pkl
KNeighborsClassifier
0.78 | 0.026 | [0.80555556 0.74324324 0.7862069 ]
Saved model models/pipe_KNeighborsClassifier.pkl
AdaBoostClassifier
0.74 | 0.017 | [0.73202614 0.72258065 0.76315789]
Saved model models/pipe_AdaBoostClassifier.pkl
GradientBoostingClassifier
0.75 | 0.018 | [0.77142857 0.72727273 0.74820144]
Saved model models/pipe_GradientBoostingClassifier.pkl
XGBClassifier
0.77 | 0.031 | [0.8        0.72611465 0.78082192]
Saved model models/pipe_XGBClassifier.pkl
LGBMClassifier
0.77 | 0.027 | [0.79738562 0.73076923 0.77027027]
Saved model models/pipe_LGBMClassifier.pkl


### cross_validate

In [19]:
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
    "f2": make_scorer(fbeta_score, beta=2),
    "f0.5": make_scorer(fbeta_score, beta=0.5),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "roc_auc": make_scorer(roc_auc_score),
    "pr_auc": make_scorer(average_precision_score),
    "log_loss": make_scorer(log_loss),
}

In [20]:
cvscore2 = cross_validate(
    model,
    X_train,
    y_train,
    scoring=scoring,
    cv=3,
    return_train_score=True,
)
cvscore2

{'fit_time': array([0.12457299, 0.03701997, 0.10071611]),
 'score_time': array([0.0114069 , 0.00710702, 0.00751281]),
 'test_accuracy': array([0.88 , 0.828, 0.872]),
 'train_accuracy': array([0.946, 0.99 , 0.976]),
 'test_f1': array([0.8       , 0.72611465, 0.78082192]),
 'train_f1': array([0.91262136, 0.98305085, 0.95918367]),
 'test_f2': array([0.8       , 0.7460733 , 0.76819407]),
 'train_f2': array([0.92885375, 0.97315436, 0.94758065]),
 'test_f0.5': array([0.8       , 0.70719603, 0.79387187]),
 'train_f0.5': array([0.89694656, 0.99315068, 0.97107438]),
 'test_precision': array([0.8       , 0.69512195, 0.8028169 ]),
 'train_precision': array([0.88679245, 1.        , 0.97916667]),
 'test_recall': array([0.8 , 0.76, 0.76]),
 'train_recall': array([0.94      , 0.96666667, 0.94      ]),
 'test_roc_auc': array([0.85714286, 0.80857143, 0.84      ]),
 'train_roc_auc': array([0.94428571, 0.98333333, 0.96571429]),
 'test_pr_auc': array([0.7       , 0.60029268, 0.68214085]),
 'train_pr_auc':

In [23]:
cvscore2log = {k:v.mean() for k,v in cvscore2.items()}
cvscore2log

{'fit_time': 0.08743635813395183,
 'score_time': 0.008675575256347656,
 'test_accuracy': 0.86,
 'train_accuracy': 0.9706666666666667,
 'test_f1': 0.768978855829916,
 'train_f1': 0.951618626716772,
 'test_f2': 0.7714224561700607,
 'train_f2': 0.9498629208393696,
 'test_f0.5': 0.7670226320239798,
 'train_f0.5': 0.9537238766607641,
 'test_precision': 0.7659796175426544,
 'train_precision': 0.9553197064989517,
 'test_recall': 0.7733333333333334,
 'train_recall': 0.9488888888888889,
 'test_roc_auc': 0.8352380952380952,
 'train_roc_auc': 0.9644444444444445,
 'test_pr_auc': 0.6608111759990839,
 'train_pr_auc': 0.9222227463312369,
 'test_log_loss': 5.046111474476401,
 'train_log_loss': 1.0572804994141034}

In [24]:
with mlflow.start_run(run_name="cross_validate"):
    mlflow.log_metrics(cvscore2log)

In [13]:
pd.DataFrame(cvscore2)

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_f1,train_f1,test_f2,train_f2,test_f0.5,train_f0.5,test_precision,train_precision,test_recall,train_recall,test_roc_auc,train_roc_auc,test_pr_auc,train_pr_auc,test_log_loss,train_log_loss
0,0.076205,0.008246,0.896,0.964,0.821918,0.939189,0.808625,0.931635,0.835655,0.946866,0.84507,0.952055,0.8,0.926667,0.868571,0.953333,0.736056,0.904237,3.74854,1.297572
1,0.044151,0.007046,0.828,0.994,0.732919,0.989967,0.764249,0.987984,0.704057,0.991957,0.686047,0.993289,0.786667,0.986667,0.81619,0.991905,0.60369,0.984045,6.199508,0.216262
2,0.077811,0.007005,0.852,0.982,0.751678,0.970492,0.748663,0.980132,0.754717,0.961039,0.756757,0.954839,0.746667,0.986667,0.821905,0.983333,0.641045,0.946108,5.334461,0.648786


### RandomizedSearchCV

* RandomizedSearchCV [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
* this will sample **n_iter** from the params_grid; each grid will be evaluated with cross-validation on **cv** folds
* the output will return the results **cv** results, corresponding to the evaluation on the different folds
* output will contain the test fold(s) results and potentially the train folds - if **return_train_score**
* **refit** : Refit an estimator using the best found parameters on the whole dataset.
* if **refit** is used, the best params_grid examined will be selected based on the **refit** value to train the estimator on the **entire dataset**
* **cv** : For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used. These splitters are instantiated with shuffle=False so the splits will be the same across calls.

In [25]:
params_grid = {
    "clf__n_estimators": [100, 150, 200],
    "clf__learning_rate": [0.01, 0.1, 0.5],
    "clf__max_depth": [3, 5, 7],
    "clf__min_child_weight": [1, 3, 5],
    "clf__subsample": [0.6, 0.8, 1.0],
    "clf__colsample_bytree": [0.6, 0.8, 1.0],
    "clf__gamma": [0, 0.3, 0.5],
    "clf__reg_alpha": [0.1, 0.25, 0.75],
    "clf__reg_lambda": [0.1, 0.25, 0.75],
}

In [26]:
search1 = RandomizedSearchCV(
    estimator=model,
    param_distributions=params_grid,
    n_iter=20,
    scoring=scoring,
    refit="accuracy",
    cv=3,
    verbose=1,
    return_train_score=True,
    random_state=SEED,
)

In [27]:
search1.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [51]:
searchdf = pd.DataFrame(search1.cv_results_).filter(regex='params|mean_|rank_',axis=1)
searchdf

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,rank_test_accuracy,mean_train_accuracy,mean_test_f1,rank_test_f1,mean_train_f1,mean_test_f2,...,mean_train_recall,mean_test_roc_auc,rank_test_roc_auc,mean_train_roc_auc,mean_test_pr_auc,rank_test_pr_auc,mean_train_pr_auc,mean_test_log_loss,rank_test_log_loss,mean_train_log_loss
0,0.07461,0.007938,"{'clf__subsample': 1.0, 'clf__reg_lambda': 0.7...",0.870667,5,0.898667,0.759961,16,0.816828,0.712605,...,0.753333,0.81746,20,0.857143,0.680538,5,0.74618,4.661646,16,3.652424
1,0.039875,0.007022,"{'clf__subsample': 0.6, 'clf__reg_lambda': 0.1...",0.857333,16,0.888,0.756357,19,0.806381,0.744976,...,0.777778,0.823175,19,0.856508,0.652558,18,0.718003,5.142228,3,4.036889
2,0.042785,0.007115,"{'clf__subsample': 0.6, 'clf__reg_lambda': 0.2...",0.858667,13,0.953333,0.767337,9,0.924411,0.775641,...,0.933333,0.836825,6,0.947619,0.658272,13,0.87711,5.09417,6,1.682037
3,0.052357,0.007539,"{'clf__subsample': 0.8, 'clf__reg_lambda': 0.7...",0.865333,6,0.964,0.779808,4,0.940883,0.786316,...,0.946667,0.844127,2,0.959048,0.672582,6,0.902275,4.853879,13,1.297572
4,0.070092,0.006963,"{'clf__subsample': 0.8, 'clf__reg_lambda': 0.2...",0.876,3,0.899333,0.780952,3,0.821253,0.754412,...,0.771111,0.836508,7,0.862698,0.69116,3,0.74612,4.469413,18,3.628394
5,0.054544,0.007197,"{'clf__subsample': 0.8, 'clf__reg_lambda': 0.1...",0.874667,4,0.895333,0.770333,8,0.809637,0.727805,...,0.742222,0.825397,16,0.851587,0.690026,4,0.738499,4.517471,17,3.772569
6,0.095955,0.007284,"{'clf__subsample': 0.6, 'clf__reg_lambda': 0.2...",0.857333,16,0.944667,0.770608,7,0.909731,0.787668,...,0.928889,0.840952,4,0.940159,0.656875,14,0.849719,5.142228,3,1.994415
7,0.075554,0.007246,"{'clf__subsample': 0.6, 'clf__reg_lambda': 0.2...",0.861333,11,0.918,0.762609,13,0.856437,0.750129,...,0.815556,0.827302,14,0.88873,0.660627,12,0.791393,4.998053,10,2.95558
8,0.043522,0.007437,"{'clf__subsample': 0.6, 'clf__reg_lambda': 0.2...",0.858667,13,0.909333,0.758274,18,0.842745,0.745673,...,0.808889,0.824127,18,0.880635,0.655121,15,0.769283,5.09417,6,3.267958
9,0.122573,0.007605,"{'clf__subsample': 0.8, 'clf__reg_lambda': 0.7...",0.864,9,0.906,0.765473,11,0.835791,0.748311,...,0.8,0.827937,13,0.875714,0.666891,9,0.760787,4.901937,12,3.388103


In [None]:
with mlflow.start_run(run_name="randomized_search_cv"):
    mlflow.log_params(search1.best_params_)