In [1]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, make_scorer
from xgboost import *
from lohrasb import logger

# Announce which example this notebook runs.
# NOTE(review): the captured output below shows this message twice with the same
# timestamp, which suggests the `dev` logger has two handlers attached - confirm.
logger.info("Using Scikit-Learn API (tune-sklearn) for an example of classification")


2023-02-21 20:40:10,046 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-02-21 20:40:10,053 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-02-21 20:40:10,055 :: matplotlib :: interactive is False
2023-02-21 20:40:10,056 :: matplotlib :: platform is darwin
2023-02-21 20:40:11,206 :: matplotlib :: CACHEDIR=/Users/hjavedani/.matplotlib
2023-02-21 20:40:11,208 :: matplotlib.font_manager :: Using fontManager instance from /Users/hjavedani/.matplotlib/fontlist-v330.json
2023-02-21 20:40:11,748 :: dev :: Using Scikit-Learn API (tune-sklearn) for an example of classification
2023-02-21 20:40:11,748 :: dev :: Using Scikit-Learn API (tune-sklearn) for an example of classification


#### Example: Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Part 1: Use BaseModel as the final step of an sklearn pipeline


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Column names: the file is read with header=None, so names are supplied here.
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]
# Read data.
# skipinitialspace=True drops the blank that follows each comma in the raw file;
# without it every string value (including the label) keeps a leading space,
# e.g. ' <=50K' instead of '<=50K'.
# NOTE(review): the UCI description says unknown values are written as '?'.
# They are NOT converted to NaN here, so the imputers further down have nothing
# to impute. Consider na_values='?' - confirm first, since it changes the results.
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=",",
    skipinitialspace=True,
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
# Map each raw label spelling (with and without the leading blank) to its
# binary class, in the same order as before, then cast the column to int.
raw_label_to_class = {
    '<=50K': 0,
    ' <=50K': 0,
    '>50K': 1,
    ' >50K': 1,
}
for raw_label, class_id in raw_label_to_class.items():
    data.loc[data['label'] == raw_label, 'label'] = class_id

data['label'] = data['label'].astype(int)

#### Train test split

In [4]:
# Features are every column except the target; y is kept as a one-column
# DataFrame so that y['label'] can be used for stratification.
X = data.drop(columns="label")
y = data[["label"]]


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y["label"],
    random_state=42,
)


#### Find feature types for later use

In [5]:
# Group the feature columns by dtype so that each preprocessing step can be
# pointed at the right subset.
# 'integer' / 'floating' match every integer / float width. The bare 'int'
# alias resolves to a platform-dependent width (32-bit on some Windows/NumPy
# builds) and can therefore miss the int64 columns that read_csv produces,
# leaving int_cols empty.
int_cols = X_train.select_dtypes(include=['integer']).columns.tolist()
float_cols = X_train.select_dtypes(include=['floating']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()


#### Define estimator and set its arguments 


In [6]:
# Base learner with default settings; the search below supplies the
# hyper-parameter values.
estimator = XGBClassifier()
# Candidate values for each hyper-parameter handed to the search.
estimator_params = dict(
    booster=["gbtree", "dart"],
    eval_metric=["auc"],
    max_depth=[4, 5],
    gamma=[0.1, 1.2],
    subsample=[0.8],
)
    

In [7]:

# Scorer used as the optimisation target (F1, higher is better).
f1_scorer = make_scorer(f1_score, greater_is_better=True)
# Plain 3-fold cross-validation (KFold defaults, i.e. no shuffling).
cv_splitter = KFold(3)

# Every option is spelled out explicitly, in the same order as the
# optimize_by_tunesearchcv keyword list, so the configuration is visible
# in one place.
search_settings = dict(
    estimator=estimator,
    estimator_params=estimator_params,
    fit_params=None,
    measure_of_accuracy=f1_scorer,
    verbose=3,
    random_state=44,
    n_jobs=None,
    cv=cv_splitter,
    early_stopping=None,
    n_trials=10,
    scoring=None,
    refit=True,
    error_score='raise',
    return_train_score=False,
    local_dir='~/ray_results',
    name=None,
    max_iters=1,
    search_optimization='hyperopt',
    use_gpu=False,
    loggers=None,
    pipeline_auto_early_stop=True,
    stopper=None,
    time_budget_s=None,
    mode=None,
    search_kwargs=None,
)

obj = BaseModel().optimize_by_tunesearchcv(**search_settings)

#### Build sklearn pipeline

In [8]:


# Preprocessing steps followed by the tunable model as the final step.
pipeline_steps = [
    # Fill missing values in integer columns with the column median.
    (
        'intimputer',
        MeanMedianImputer(imputation_method='median', variables=int_cols),
    ),
    # Fill missing values in categorical columns.
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # Encode categorical columns as ordinal integers.
    ('catencoder', OrdinalEncoder()),
    # Classification model (the search object built above).
    ('obj', obj),
]
pipeline = Pipeline(steps=pipeline_steps)


#### Run Pipeline

In [9]:
# Fit the full pipeline on the training split, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


2023-02-21 20:40:12,712 :: dev :: The optimization will be based on make_scorer(f1_score) metric!
2023-02-21 20:40:12,712 :: dev :: The optimization will be based on make_scorer(f1_score) metric!
2023-02-21 20:40:12,717 :: ray.tune.tune :: Initializing Ray automatically.For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run`.


2023-02-21 20:40:14,798	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Current time:,2023-02-21 20:40:50
Running for:,00:00:34.78
Memory:,16.2/32.0 GiB

Trial name,status,loc,booster,eval_metric,gamma,max_depth,subsample,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_de80cc66,TERMINATED,127.0.0.1:39677,dart,auc,1.2,4,0.8,1,21.5517,0.690763,0.710764,0.703444
_Trainable_ff7a5643,TERMINATED,127.0.0.1:39684,dart,auc,1.2,4,0.8,1,20.4548,0.690763,0.710764,0.703444
_Trainable_ad2069fe,TERMINATED,127.0.0.1:39685,dart,auc,1.2,5,0.8,1,21.9464,0.690608,0.708166,0.699513
_Trainable_2e697a67,TERMINATED,127.0.0.1:39686,gbtree,auc,0.1,4,0.8,1,4.88987,0.688992,0.709335,0.703579
_Trainable_56854210,TERMINATED,127.0.0.1:39687,gbtree,auc,0.1,5,0.8,1,6.42787,0.69092,0.70872,0.698085
_Trainable_42df8ab4,TERMINATED,127.0.0.1:39688,dart,auc,1.2,5,0.8,1,21.8084,0.690608,0.708166,0.699513
_Trainable_6c680d50,TERMINATED,127.0.0.1:39689,gbtree,auc,1.2,5,0.8,1,6.41556,0.691739,0.711261,0.695492
_Trainable_301723da,TERMINATED,127.0.0.1:39690,dart,auc,1.2,4,0.8,1,20.5216,0.690763,0.710764,0.703444
_Trainable_5d52e37b,TERMINATED,127.0.0.1:39686,gbtree,auc,1.2,4,0.8,1,5.39843,0.688971,0.712573,0.708321
_Trainable_f23ee3be,TERMINATED,127.0.0.1:39689,dart,auc,0.1,5,0.8,1,18.6943,0.690374,0.702056,0.694275


Trial name,average_test_score,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,objective,pid,split0_test_score,split1_test_score,split2_test_score,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
_Trainable_2e697a67,0.700635,2023-02-21_20-40-30,True,,5f64110662404ee49c98c46d5e4b9950,hjavedani-MB,1,127.0.0.1,0.700635,39686,0.688992,0.709335,0.703579,4.88987,4.88987,4.88987,1677040830,0,,1,2e697a67,0.0622981
_Trainable_301723da,0.701657,2023-02-21_20-40-46,True,,77b6cf91e3474c6bbc7a0dafe0f2b72a,hjavedani-MB,1,127.0.0.1,0.701657,39690,0.690763,0.710764,0.703444,20.5216,20.5216,20.5216,1677040846,0,,1,301723da,0.0554409
_Trainable_42df8ab4,0.699429,2023-02-21_20-40-47,True,,f2ef9d3aee8841768e47e8a76191959f,hjavedani-MB,1,127.0.0.1,0.699429,39688,0.690608,0.708166,0.699513,21.8084,21.8084,21.8084,1677040847,0,,1,42df8ab4,0.058208
_Trainable_56854210,0.699242,2023-02-21_20-40-32,True,,8d60d6c955e4442aa5f0808f10caf8e9,hjavedani-MB,1,127.0.0.1,0.699242,39687,0.69092,0.70872,0.698085,6.42787,6.42787,6.42787,1677040832,0,,1,56854210,0.049335
_Trainable_5d52e37b,0.703288,2023-02-21_20-40-36,True,,5f64110662404ee49c98c46d5e4b9950,hjavedani-MB,1,127.0.0.1,0.703288,39686,0.688971,0.712573,0.708321,5.39843,5.39843,5.39843,1677040836,0,,1,5d52e37b,0.0622981
_Trainable_6c680d50,0.699497,2023-02-21_20-40-32,True,,3524ce49e0534244a5942cdfc69f8196,hjavedani-MB,1,127.0.0.1,0.699497,39689,0.691739,0.711261,0.695492,6.41556,6.41556,6.41556,1677040832,0,,1,6c680d50,0.0582979
_Trainable_ad2069fe,0.699429,2023-02-21_20-40-47,True,,523931c2c7e44c06bc28f83c670e5057,hjavedani-MB,1,127.0.0.1,0.699429,39685,0.690608,0.708166,0.699513,21.9464,21.9464,21.9464,1677040847,0,,1,ad2069fe,0.0525839
_Trainable_de80cc66,0.701657,2023-02-21_20-40-41,True,,c86463da80b64b38b7ed20bd815508e8,hjavedani-MB,1,127.0.0.1,0.701657,39677,0.690763,0.710764,0.703444,21.5517,21.5517,21.5517,1677040841,0,,1,de80cc66,0.0299792
_Trainable_f23ee3be,0.695568,2023-02-21_20-40-50,True,,3524ce49e0534244a5942cdfc69f8196,hjavedani-MB,1,127.0.0.1,0.695568,39689,0.690374,0.702056,0.694275,18.6943,18.6943,18.6943,1677040850,0,,1,f23ee3be,0.0582979
_Trainable_ff7a5643,0.701657,2023-02-21_20-40-46,True,,b1c0877d500e45b7a934e2d81581e288,hjavedani-MB,1,127.0.0.1,0.701657,39684,0.690763,0.710764,0.703444,20.4548,20.4548,20.4548,1677040846,0,,1,ff7a5643,0.0482981


2023-02-21 20:40:51,193	INFO tune.py:762 -- Total run time: 35.03 seconds (34.77 seconds for the tuning loop).


#### Check performance of the pipeline

In [10]:
# Print each metric under its own heading, in a fixed order.
for title, metric in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(title)
    print(metric(y_test, y_pred))


F1 score : 
0.7138089278264498
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8158
           1       0.78      0.66      0.71      2588

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.82     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7663  495]
 [ 877 1711]]


#### Part 2: Use BaseModel as a standalone estimator

In [11]:
# Re-create the same split as in Part 1 (same test_size, stratification and
# seed) so Part 2 starts from the same train/test frames.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y['label'],
    random_state=42,
)

#### Transform features to make them ready for model input

In [12]:
# Preprocessing only: same three steps as the Part 1 pipeline, without a model.
transform_pipeline = Pipeline(
    steps=[
        # Fill missing values in integer columns with the column median.
        (
            'intimputer',
            MeanMedianImputer(variables=int_cols, imputation_method='median'),
        ),
        # Fill missing values in categorical columns.
        ('catimputer', CategoricalImputer(variables=cat_cols)),
        # Encode categorical columns as ordinal integers.
        ('catencoder', OrdinalEncoder()),
    ]
)

#### Transform X_train and X_test

In [13]:
# Learn the preprocessing on the training split only...
X_train = transform_pipeline.fit_transform(X=X_train, y=y_train)
# ...and apply the already-fitted steps to the test split.
X_test = transform_pipeline.transform(X=X_test)


#### Train model and predict

In [14]:
# Fit the search object directly on the pre-transformed training features.
# `obj` is the same instance that was used as the last step of the Part 1
# pipeline; the log output below shows a new tuning run starting here.
obj.fit(X_train,y_train)
# Predict labels for the transformed test features.
y_pred = obj.predict(X_test)

2023-02-21 20:40:52,203 :: dev :: The optimization will be based on make_scorer(f1_score) metric!
2023-02-21 20:40:52,203 :: dev :: The optimization will be based on make_scorer(f1_score) metric!


0,1
Current time:,2023-02-21 20:41:56
Running for:,00:01:04.11
Memory:,16.3/32.0 GiB

Trial name,status,loc,booster,eval_metric,gamma,max_depth,subsample,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_ce330c5c,TERMINATED,127.0.0.1:39701,dart,auc,1.2,4,0.8,1,46.5532,0.690763,0.710764,0.703444
_Trainable_caa2eb1f,TERMINATED,127.0.0.1:39702,dart,auc,1.2,4,0.8,1,46.2493,0.690763,0.710764,0.703444
_Trainable_5e99056f,TERMINATED,127.0.0.1:39703,dart,auc,1.2,5,0.8,1,48.0512,0.690608,0.708166,0.699513
_Trainable_4eb979ff,TERMINATED,127.0.0.1:39704,gbtree,auc,0.1,4,0.8,1,6.63711,0.688992,0.709335,0.703579
_Trainable_49da35cf,TERMINATED,127.0.0.1:39705,gbtree,auc,0.1,5,0.8,1,8.25524,0.69092,0.70872,0.698085
_Trainable_3a7a7eaf,TERMINATED,127.0.0.1:39706,dart,auc,1.2,5,0.8,1,48.0103,0.690608,0.708166,0.699513
_Trainable_f1708c91,TERMINATED,127.0.0.1:39707,gbtree,auc,1.2,5,0.8,1,8.21885,0.691739,0.711261,0.695492
_Trainable_07cc4156,TERMINATED,127.0.0.1:39708,dart,auc,1.2,4,0.8,1,46.202,0.690763,0.710764,0.703444
_Trainable_0141c9e2,TERMINATED,127.0.0.1:39704,gbtree,auc,1.2,4,0.8,1,12.3708,0.688971,0.712573,0.708321
_Trainable_7b7e064a,TERMINATED,127.0.0.1:39705,dart,auc,0.1,5,0.8,1,44.048,0.690374,0.702056,0.694275


Trial name,average_test_score,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,objective,pid,split0_test_score,split1_test_score,split2_test_score,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
_Trainable_0141c9e2,0.703288,2023-02-21_20-41-22,True,,ed5041f84790447191bb7afa911dcbf2,hjavedani-MB,1,127.0.0.1,0.703288,39704,0.688971,0.712573,0.708321,12.3708,12.3708,12.3708,1677040882,0,,1,0141c9e2,0.109459
_Trainable_07cc4156,0.701657,2023-02-21_20-41-50,True,,2b8b033f65e94fb7bfbb30fbc4a32115,hjavedani-MB,1,127.0.0.1,0.701657,39708,0.690763,0.710764,0.703444,46.202,46.202,46.202,1677040910,0,,1,07cc4156,0.0706971
_Trainable_3a7a7eaf,0.699429,2023-02-21_20-41-51,True,,9ba5847b30564bdc96571946574be320,hjavedani-MB,1,127.0.0.1,0.699429,39706,0.690608,0.708166,0.699513,48.0103,48.0103,48.0103,1677040911,0,,1,3a7a7eaf,0.0964878
_Trainable_49da35cf,0.699242,2023-02-21_20-41-12,True,,4714254bd8ed406fa26656c26253574a,hjavedani-MB,1,127.0.0.1,0.699242,39705,0.69092,0.70872,0.698085,8.25524,8.25524,8.25524,1677040872,0,,1,49da35cf,0.091363
_Trainable_4eb979ff,0.700635,2023-02-21_20-41-10,True,,ed5041f84790447191bb7afa911dcbf2,hjavedani-MB,1,127.0.0.1,0.700635,39704,0.688992,0.709335,0.703579,6.63711,6.63711,6.63711,1677040870,0,,1,4eb979ff,0.109459
_Trainable_5e99056f,0.699429,2023-02-21_20-41-51,True,,546e81e2322b41d7974065879aabdec5,hjavedani-MB,1,127.0.0.1,0.699429,39703,0.690608,0.708166,0.699513,48.0512,48.0512,48.0512,1677040911,0,,1,5e99056f,0.0796759
_Trainable_7b7e064a,0.695568,2023-02-21_20-41-56,True,,4714254bd8ed406fa26656c26253574a,hjavedani-MB,1,127.0.0.1,0.695568,39705,0.690374,0.702056,0.694275,44.048,44.048,44.048,1677040916,0,,1,7b7e064a,0.091363
_Trainable_caa2eb1f,0.701657,2023-02-21_20-41-50,True,,12811884c94e4249a4a640e371e0bd9a,hjavedani-MB,1,127.0.0.1,0.701657,39702,0.690763,0.710764,0.703444,46.2493,46.2493,46.2493,1677040910,0,,1,caa2eb1f,0.0988958
_Trainable_ce330c5c,0.701657,2023-02-21_20-41-43,True,,24963d7026c1459fb8394b21322fc9e0,hjavedani-MB,1,127.0.0.1,0.701657,39701,0.690763,0.710764,0.703444,46.5532,46.5532,46.5532,1677040903,0,,1,ce330c5c,0.0317891
_Trainable_f1708c91,0.699497,2023-02-21_20-41-12,True,,d0f3194581854d6494e46a8289f75e36,hjavedani-MB,1,127.0.0.1,0.699497,39707,0.691739,0.711261,0.695492,8.21885,8.21885,8.21885,1677040872,0,,1,f1708c91,0.1222


2023-02-21 20:41:56,553	INFO tune.py:762 -- Total run time: 64.34 seconds (64.10 seconds for the tuning loop).


#### Check performance of the standalone model

In [15]:
# Same three metrics as in Part 1, printed heading first, then value.
metric_by_title = {
    'F1 score : ': f1_score,
    'Classification report : ': classification_report,
    'Confusion matrix : ': confusion_matrix,
}
for title, metric in metric_by_title.items():
    print(title)
    print(metric(y_test, y_pred))

F1 score : 
0.7138089278264498
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8158
           1       0.78      0.66      0.71      2588

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.82     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7663  495]
 [ 877 1711]]


In [16]:
# Display whatever get_best_estimator() returns.
# NOTE(review): presumably the best estimator found by the search - confirm
# against the lohrasb documentation.
obj.get_best_estimator()

In [17]:
# Attribute access counterpart of the get_best_estimator() call.
# NOTE(review): presumably holds the same object - confirm.
obj.best_estimator

#### Get the fitted search object and its attributes

In [18]:
# Retrieve the fitted search object from the wrapper.
# NOTE(review): despite the variable name, the search was configured through
# optimize_by_tunesearchcv with search_optimization='hyperopt', so this is
# presumably not a grid search - confirm the returned type.
GridSearchObj = obj.get_optimized_object()
# Per-trial results; the output below starts with the 'params' list.
GridSearchObj.cv_results_

{'params': [{'booster': 'dart',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 4,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 4,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 5,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'gamma': 0.1,
   'max_depth': 4,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'gamma': 0.1,
   'max_depth': 5,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 5,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 5,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 4,
   'subsample': 0.8},
  {'booster': 'gbtree',
   'eval_metric': 'auc',
   'gamma': 1.2,
   'max_depth': 4,
   'subsample': 0.8},
  {'booster': 'dart',
   'eval_metric