In [1]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, make_scorer
from xgboost import *

from lohrasb import logger


2023-02-21 21:10:13,570 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-02-21 21:10:13,575 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-02-21 21:10:13,578 :: matplotlib :: interactive is False
2023-02-21 21:10:13,579 :: matplotlib :: platform is darwin
2023-02-21 21:10:14,530 :: matplotlib :: CACHEDIR=/Users/hjavedani/.matplotlib
2023-02-21 21:10:14,533 :: matplotlib.font_manager :: Using fontManager instance from /Users/hjavedani/.matplotlib/fontlist-v330.json


#### Example: Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Part 1: Use BaseModel in a scikit-learn pipeline


In [2]:
# UCI Adult training split. The raw file has no header row, so the column
# names are supplied explicitly.
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# column names
col_names=[
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "label",
]
# read data
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=',',
    # Fields are written as ", "-separated values; without this every string
    # value (including the label) keeps a leading blank.
    skipinitialspace=True,
    # Unknown entries are stored as "?"; map them to NaN so the imputers in
    # the pipeline actually see them as missing.
    na_values='?',
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
# Encode the income bracket as a binary target: 0 for "<=50K", 1 for ">50K".
# Each spelling is listed both with and without a leading blank.
label_codes = {'<=50K': 0, ' <=50K': 0, '>50K': 1, ' >50K': 1}
for raw_label, label_code in label_codes.items():
    data.loc[data['label'] == raw_label, 'label'] = label_code

# The column is still object-typed after the replacements; make it integer.
data['label'] = data['label'].astype(int)

#### Train test split

In [4]:
# Separate predictors from the target; y stays a one-column DataFrame.
X = data.drop(columns="label")
y = data[["label"]]

# Hold out 33% of the rows, stratified on the label so both parts keep the
# same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y["label"],
    random_state=42,
)


#### Find feature types for later use

In [5]:
# Column names grouped by dtype. int_cols and cat_cols configure the imputers
# in the pipelines; float_cols is not used in the cells visible here.
int_cols = list(X_train.select_dtypes(include='int').columns)
float_cols = list(X_train.select_dtypes(include='float').columns)
cat_cols = list(X_train.select_dtypes(include='object').columns)


#### Define estimator and set its arguments 


In [6]:
# Base learner with default settings; the search varies only the keys below.
estimator = XGBClassifier()

# Candidate values per hyperparameter
# (2 boosters x 2 depths x 2 gammas = 8 combinations).
estimator_params = dict(
    booster=["gbtree", "dart"],
    eval_metric=["auc"],
    max_depth=[4, 5],
    gamma=[0.1, 1.2],
    subsample=[0.8],
)
    

In [7]:

# Options handed to the Tune-based grid search, collected in one place so the
# call itself stays short. Candidates are scored with F1 over 3 unshuffled folds.
tune_search_options = dict(
    fit_params=None,
    measure_of_accuracy=make_scorer(f1_score, greater_is_better=True),
    verbose=3,
    n_jobs=None,
    cv=KFold(3),
    early_stopping=None,
    scoring=None,
    refit=True,
    error_score='raise',
    return_train_score=False,
    local_dir='~/ray_results',
    name=None,
    max_iters=1,
    use_gpu=False,
    loggers=None,
    pipeline_auto_early_stop=True,
    stopper=None,
    time_budget_s=None,
    mode=None,
)

# Build the (not yet fitted) optimizer around the estimator and its grid.
obj = BaseModel().optimize_by_tunegridsearchcv(
    estimator=estimator,
    estimator_params=estimator_params,
    **tune_search_options,
)

#### Build sklearn pipeline

In [8]:


# Preprocessing stages, named individually so each can be inspected on its own.
int_imputer = MeanMedianImputer(imputation_method='median', variables=int_cols)
cat_imputer = CategoricalImputer(variables=cat_cols)
cat_encoder = OrdinalEncoder()

# Impute integer columns, impute categorical columns, encode categoricals,
# then hand the result to the tuned model as the final step.
pipeline = Pipeline(steps=[
    ('intimputer', int_imputer),
    ('catimputer', cat_imputer),
    ('catencoder', cat_encoder),
    ('obj', obj),
])


#### Run Pipeline

In [9]:
# Fit every stage on the training split (fit returns the pipeline itself),
# then predict labels for the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


2023-02-21 21:10:18,658 :: ray.tune.tune :: Initializing Ray automatically.For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run`.


2023-02-21 21:10:20,764	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Current time:,2023-02-21 21:10:48
Running for:,00:00:26.22
Memory:,17.2/32.0 GiB

Trial name,status,loc,booster,eval_metric,gamma,max_depth,subsample,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_356e4_00000,TERMINATED,127.0.0.1:40691,gbtree,auc,0.1,4,0.8,1,5.12559,0.862074,0.871975,0.866731
_Trainable_356e4_00001,TERMINATED,127.0.0.1:40696,dart,auc,0.1,4,0.8,1,14.419,0.863861,0.872525,0.864805
_Trainable_356e4_00002,TERMINATED,127.0.0.1:40697,gbtree,auc,1.2,4,0.8,1,4.40365,0.861936,0.871425,0.868381
_Trainable_356e4_00003,TERMINATED,127.0.0.1:40698,dart,auc,1.2,4,0.8,1,14.432,0.862349,0.872525,0.866181
_Trainable_356e4_00004,TERMINATED,127.0.0.1:40699,gbtree,auc,0.1,5,0.8,1,5.35964,0.863311,0.870462,0.86343
_Trainable_356e4_00005,TERMINATED,127.0.0.1:40700,dart,auc,0.1,5,0.8,1,15.6321,0.861111,0.866474,0.861917
_Trainable_356e4_00006,TERMINATED,127.0.0.1:40701,gbtree,auc,1.2,5,0.8,1,5.30515,0.862486,0.8706,0.86343
_Trainable_356e4_00007,TERMINATED,127.0.0.1:40702,dart,auc,1.2,5,0.8,1,15.8458,0.861386,0.869774,0.864118


Trial name,average_test_score,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,objective,pid,split0_test_score,split1_test_score,split2_test_score,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
_Trainable_356e4_00000,0.866926,2023-02-21_21-10-32,True,,0d4381515722427fbeef94c7d3bc69f2,hjavedani-MB,1,127.0.0.1,0.866926,40691,0.862074,0.871975,0.866731,5.12559,5.12559,5.12559,1677042632,0,,1,356e4_00000,0.032202
_Trainable_356e4_00001,0.867064,2023-02-21_21-10-47,True,,9bbd472a46f24d729eb15ad8d5f1fa49,hjavedani-MB,1,127.0.0.1,0.867064,40696,0.863861,0.872525,0.864805,14.419,14.419,14.419,1677042647,0,,1,356e4_00001,0.0552273
_Trainable_356e4_00002,0.867247,2023-02-21_21-10-37,True,,34bad8a265db41ce960d56c9d5ffa098,hjavedani-MB,1,127.0.0.1,0.867247,40697,0.861936,0.871425,0.868381,4.40365,4.40365,4.40365,1677042637,0,,1,356e4_00002,0.0407097
_Trainable_356e4_00003,0.867018,2023-02-21_21-10-47,True,,47a48d7d72f043a5a8148086b4206ae6,hjavedani-MB,1,127.0.0.1,0.867018,40698,0.862349,0.872525,0.866181,14.432,14.432,14.432,1677042647,0,,1,356e4_00003,0.0558712
_Trainable_356e4_00004,0.865734,2023-02-21_21-10-38,True,,ac50bc087876427d8e47b8174705cc9e,hjavedani-MB,1,127.0.0.1,0.865734,40699,0.863311,0.870462,0.86343,5.35964,5.35964,5.35964,1677042638,0,,1,356e4_00004,0.0509591
_Trainable_356e4_00005,0.863167,2023-02-21_21-10-48,True,,a4a7b442318449979be19a9801dd539b,hjavedani-MB,1,127.0.0.1,0.863167,40700,0.861111,0.866474,0.861917,15.6321,15.6321,15.6321,1677042648,0,,1,356e4_00005,0.0574269
_Trainable_356e4_00006,0.865505,2023-02-21_21-10-38,True,,bb51cf3bb679459eaa27e7cec6a6ff27,hjavedani-MB,1,127.0.0.1,0.865505,40701,0.862486,0.8706,0.86343,5.30515,5.30515,5.30515,1677042638,0,,1,356e4_00006,0.0476091
_Trainable_356e4_00007,0.865093,2023-02-21_21-10-48,True,,4178fd25a36e4c0e875f13d7c5d8fa3f,hjavedani-MB,1,127.0.0.1,0.865093,40702,0.861386,0.869774,0.864118,15.8458,15.8458,15.8458,1677042648,0,,1,356e4_00007,0.05111


2023-02-21 21:10:48,950	INFO tune.py:762 -- Total run time: 26.36 seconds (26.19 seconds for the tuning loop).


2023-02-21 21:10:49,537 :: dev :: The optimization will be based on make_scorer(f1_score) metric!
2023-02-21 21:10:49,537 :: dev :: The optimization will be based on make_scorer(f1_score) metric!


#### Check performance of the pipeline

In [10]:
# Print each held-out metric under its own heading, in a fixed order.
for heading, metric in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


F1 score : 
0.7138089278264498
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8158
           1       0.78      0.66      0.71      2588

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.82     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7663  495]
 [ 877 1711]]


#### Part 2: Use BaseModel as a standalone estimator

In [11]:
# Re-create the split with the same test_size, stratification and seed as in
# Part 1, so Part 2 starts again from untransformed frames.
split_frames = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y['label'],
    random_state=42,
)
X_train, X_test, y_train, y_test = split_frames

#### Transform features to make them ready for model input

In [12]:
# Preprocessing-only pipeline: impute integer columns with the median, impute
# categorical columns, then ordinal-encode. No model is attached here.
transform_steps = [
    ('intimputer', MeanMedianImputer(imputation_method='median', variables=int_cols)),
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    ('catencoder', OrdinalEncoder()),
]
transform_pipeline = Pipeline(steps=transform_steps)

#### Transform X_train and X_test

In [13]:
# Learn the transforms on the training split only, then apply them to both
# splits. The tuple is evaluated left to right, so fitting happens first.
X_train, X_test = (
    transform_pipeline.fit_transform(X_train, y_train),
    transform_pipeline.transform(X_test),
)


#### Train model and predict

In [14]:
# Fit the optimizer directly on the already-transformed training frame; the
# trial table in this cell's output shows the hyperparameter search running again.
obj.fit(X_train,y_train)
# Predict labels for the transformed held-out rows.
y_pred = obj.predict(X_test)

0,1
Current time:,2023-02-21 21:11:16
Running for:,00:00:26.50
Memory:,17.3/32.0 GiB

Trial name,status,loc,booster,eval_metric,gamma,max_depth,subsample,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_45c44_00000,TERMINATED,127.0.0.1:40720,gbtree,auc,0.1,4,0.8,1,4.99948,0.862074,0.871975,0.866731
_Trainable_45c44_00001,TERMINATED,127.0.0.1:40722,dart,auc,0.1,4,0.8,1,14.4355,0.863861,0.872525,0.864805
_Trainable_45c44_00002,TERMINATED,127.0.0.1:40723,gbtree,auc,1.2,4,0.8,1,4.38486,0.861936,0.871425,0.868381
_Trainable_45c44_00003,TERMINATED,127.0.0.1:40724,dart,auc,1.2,4,0.8,1,14.4241,0.862349,0.872525,0.866181
_Trainable_45c44_00004,TERMINATED,127.0.0.1:40725,gbtree,auc,0.1,5,0.8,1,5.30335,0.863311,0.870462,0.86343
_Trainable_45c44_00005,TERMINATED,127.0.0.1:40726,dart,auc,0.1,5,0.8,1,15.5748,0.861111,0.866474,0.861917
_Trainable_45c44_00006,TERMINATED,127.0.0.1:40727,gbtree,auc,1.2,5,0.8,1,5.32495,0.862486,0.8706,0.86343
_Trainable_45c44_00007,TERMINATED,127.0.0.1:40728,dart,auc,1.2,5,0.8,1,15.7379,0.861386,0.869774,0.864118


Trial name,average_test_score,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,objective,pid,split0_test_score,split1_test_score,split2_test_score,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
_Trainable_45c44_00000,0.866926,2023-02-21_21-10-59,True,,8274225a3b1b454d9ffd235edd8d8852,hjavedani-MB,1,127.0.0.1,0.866926,40720,0.862074,0.871975,0.866731,4.99948,4.99948,4.99948,1677042659,0,,1,45c44_00000,0.027179
_Trainable_45c44_00001,0.867064,2023-02-21_21-11-15,True,,21042bf62a8042fabf021d4eb30bcaf9,hjavedani-MB,1,127.0.0.1,0.867064,40722,0.863861,0.872525,0.864805,14.4355,14.4355,14.4355,1677042675,0,,1,45c44_00001,0.0524931
_Trainable_45c44_00002,0.867247,2023-02-21_21-11-05,True,,b3d32857b7ac468d9f450368eae204d8,hjavedani-MB,1,127.0.0.1,0.867247,40723,0.861936,0.871425,0.868381,4.38486,4.38486,4.38486,1677042665,0,,1,45c44_00002,0.0476198
_Trainable_45c44_00003,0.867018,2023-02-21_21-11-15,True,,1576b582e0d94fb58a61c90f815756c8,hjavedani-MB,1,127.0.0.1,0.867018,40724,0.862349,0.872525,0.866181,14.4241,14.4241,14.4241,1677042675,0,,1,45c44_00003,0.0466008
_Trainable_45c44_00004,0.865734,2023-02-21_21-11-06,True,,153337592d194ef8a47f8404210f356b,hjavedani-MB,1,127.0.0.1,0.865734,40725,0.863311,0.870462,0.86343,5.30335,5.30335,5.30335,1677042666,0,,1,45c44_00004,0.0496831
_Trainable_45c44_00005,0.863167,2023-02-21_21-11-16,True,,8b0587fb4d934b05b77a71b710690e63,hjavedani-MB,1,127.0.0.1,0.863167,40726,0.861111,0.866474,0.861917,15.5748,15.5748,15.5748,1677042676,0,,1,45c44_00005,0.0537972
_Trainable_45c44_00006,0.865505,2023-02-21_21-11-06,True,,4d25f61c808247b2a93ec3b69e1621be,hjavedani-MB,1,127.0.0.1,0.865505,40727,0.862486,0.8706,0.86343,5.32495,5.32495,5.32495,1677042666,0,,1,45c44_00006,0.0560899
_Trainable_45c44_00007,0.865093,2023-02-21_21-11-16,True,,e314ffa60c4a41279dbe3c2124338e8e,hjavedani-MB,1,127.0.0.1,0.865093,40728,0.861386,0.869774,0.864118,15.7379,15.7379,15.7379,1677042676,0,,1,45c44_00007,0.0514209


2023-02-21 21:11:16,612	INFO tune.py:762 -- Total run time: 26.62 seconds (26.49 seconds for the tuning loop).


2023-02-21 21:11:17,197 :: dev :: The optimization will be based on make_scorer(f1_score) metric!
2023-02-21 21:11:17,197 :: dev :: The optimization will be based on make_scorer(f1_score) metric!


#### Check performance of the standalone estimator

In [15]:
# Print each held-out metric for the standalone model under its own heading.
for heading, metric in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

F1 score : 
0.7138089278264498
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8158
           1       0.78      0.66      0.71      2588

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.82     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7663  495]
 [ 877 1711]]


In [16]:
# Display the estimator selected by the search, via the accessor method
# (presumably the refit XGBClassifier; no output was captured for this cell).
obj.get_best_estimator()

In [17]:
# Display the selected estimator via the attribute.
# NOTE(review): looks equivalent to obj.get_best_estimator() -- confirm both are needed.
obj.best_estimator

#### Get fitted grid search object and its attributes

In [18]:
# Retrieve the fitted search object held by the optimizer.
GridSearchObj = obj.get_optimized_object()
# Per-candidate cross-validation results (parameter sets and per-split test scores).
GridSearchObj.cv_results_

{'params': [{'booster': 'gbtree',
   'eval_metric': 'auc',
   'max_depth': 4,
   'gamma': 0.1,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'max_depth': 4,
   'gamma': 0.1,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'max_depth': 4,
   'gamma': 1.2,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'max_depth': 4,
   'gamma': 1.2,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'max_depth': 5,
   'gamma': 0.1,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'max_depth': 5,
   'gamma': 0.1,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'max_depth': 5,
   'gamma': 1.2,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'max_depth': 5,
   'gamma': 1.2,
   'subsample': 0.8}],
 'split0_test_score': array([0.86207371, 0.86386139, 0.86193619, 0.86234873, 0.86331133,
        0.86111111, 0.86248625, 0.86138614]),
 'spli