In [1]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, make_scorer
from xgboost import *
import optuna
from lohrasb import logger


2023-07-03 19:20:42,967 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-07-03 19:20:42,972 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-07-03 19:20:42,976 :: matplotlib :: interactive is False
2023-07-03 19:20:42,977 :: matplotlib :: platform is darwin
2023-07-03 19:20:42,993 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 19:20:42,994 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 19:20:42,997 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
2023-07-03 19:20:42,998 :: graphviz._tools :: deprecate positional args: graphviz.backend.viewing.view(['quiet'])
2023-07-03 19:20:43,002 :: graphviz._tools :: deprecate positio

#### Example: Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Part 1: Use BaseModel in a sklearn pipeline


In [2]:
# Location of the raw UCI Adult data file.
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Column names, in file order; they are supplied here because the file is read
# with header=None.
col_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "label",
]
# Read the data. skipinitialspace=True drops the blank that follows each comma
# in the raw file, so string fields (including the label) do not carry a
# leading space into the later cells.
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=',',
    skipinitialspace=True,
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
# Raw label spellings (with and without a leading blank) and their integer codes.
label_codes = {
    '<=50K': 0,
    ' <=50K': 0,
    '>50K': 1,
    ' >50K': 1,
}
for raw_label, code in label_codes.items():
    data.loc[data['label'] == raw_label, 'label'] = code

# Every label is now 0/1, so the column can be cast to an integer dtype.
data['label'] = data['label'].astype(int)

#### Train test split

In [4]:
# Features: every column except the target.
X = data.drop(columns="label")
# Target, kept as a single-column DataFrame.
y = data[["label"]]

# Hold out 33% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y["label"],
)


#### Find feature types for later use

In [5]:
# Column names grouped by dtype family, in the order: integer, float, object.
int_cols, float_cols, cat_cols = (
    X_train.select_dtypes(include=[dtype_kind]).columns.tolist()
    for dtype_kind in ('int', 'float', 'object')
)


#### Define the estimator and its hyperparameter search space


In [6]:
# Base classifier whose hyperparameters are tuned by the search.
estimator = XGBClassifier()
# Search space: one Optuna distribution per tuned hyperparameter.
estimator_params = {
    "booster": optuna.distributions.CategoricalDistribution(
        choices=("gbtree", "dart"),
    ),
    "max_depth": optuna.distributions.IntDistribution(low=10, high=15),
    # sampled on a log scale
    "gamma": optuna.distributions.FloatDistribution(low=0.5, high=1.2, log=True),
    "subsample": optuna.distributions.FloatDistribution(low=0.8, high=1.0),
}

    

In [7]:
# Keyword bundle handed to BaseModel.optimize_by_optunasearchcv.
kwargs = dict(
    # extra params of the model, if any
    main_newoptuna_kwargs=dict(),
    # params for the fit method (fit_params)
    fit_newoptuna_kwargs=dict(
        sample_weight=None,
    ),
    # params for OptunaSearchCV
    newoptuna_search_kwargs=dict(
        estimator=estimator,
        param_distributions=estimator_params,
        scoring='f1',
        verbose=3,
        n_jobs=-1,
        cv=KFold(2),
    ),
)


In [8]:

# Build the search-backed model from the keyword bundle defined above.
obj = BaseModel().optimize_by_optunasearchcv(kwargs=kwargs)

#### Build sklearn pipeline

In [9]:


# End-to-end pipeline: impute, encode, then fit the search-backed classifier.
pipeline = Pipeline(steps=[
    # median imputation for the integer columns
    ('intimputer', MeanMedianImputer(variables=int_cols, imputation_method='median')),
    # imputation for the categorical (object) columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # ordinal encoding of the categorical columns
    ('catencoder', OrdinalEncoder()),
    # classification model wrapped in the hyperparameter search
    ('obj', obj),
])


#### Run Pipeline

In [10]:
# Fit the whole pipeline on the training split (Pipeline.fit returns the
# pipeline itself), then predict labels for the hold-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


  self.__newoptuna_search = OptunaSearchCV(**self.newoptuna_search_kwargs)
[I 2023-07-03 19:20:44,979] A new study created in memory with name: no-name-ff04865d-2916-4d31-91de-cc372d769b65
[I 2023-07-03 19:20:44,980] Searching the best hyperparameters using 21815 samples...
[I 2023-07-03 19:20:57,528] Trial 2 finished with value: 0.681324870051653 and parameters: {'booster': 'gbtree', 'max_depth': 10, 'gamma': 0.8596849141768608, 'subsample': 0.9486473698012712}. Best is trial 2 with value: 0.681324870051653.
[I 2023-07-03 19:20:58,834] Trial 3 finished with value: 0.6756159868894858 and parameters: {'booster': 'gbtree', 'max_depth': 11, 'gamma': 0.5870780042930338, 'subsample': 0.9712847453855679}. Best is trial 2 with value: 0.681324870051653.
[I 2023-07-03 19:21:04,498] Trial 0 finished with value: 0.6693192743261537 and parameters: {'booster': 'gbtree', 'max_depth': 15, 'gamma': 0.750806781636867, 'subsample': 0.9493011831993914}. Best is trial 2 with value: 0.681324870051653.
[I 2

#### Check performance of the pipeline

In [11]:
# Print each heading followed by the corresponding metric on the hold-out split.
for heading, metric in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


F1 score : 
0.6920386751697182
Classification report : 
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8158
           1       0.74      0.65      0.69      2588

    accuracy                           0.86     10746
   macro avg       0.82      0.79      0.80     10746
weighted avg       0.86      0.86      0.86     10746

Confusion matrix : 
[[7567  591]
 [ 906 1682]]


#### Part 2: Another way of using it — transform the features first, then fit the model directly


In [12]:
# Fresh split of the raw features, using the same test size, stratification
# and seed as in Part 1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y['label'],
)

#### Transform features to make them ready for model input

In [13]:
# Preprocessing-only pipeline: imputation and encoding, with no model step.
transform_pipeline = Pipeline(steps=[
    # median imputation for the integer columns
    ('intimputer', MeanMedianImputer(variables=int_cols, imputation_method='median')),
    # imputation for the categorical (object) columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # ordinal encoding of the categorical columns
    ('catencoder', OrdinalEncoder()),
])

#### Transform X_train and X_test

In [14]:
# Fit the preprocessing steps on the training split only, then apply the
# fitted steps to the hold-out split.
# NOTE: X_train / X_test are overwritten here, so re-run the split cell before
# re-running this one; otherwise already-transformed frames are fed back in.
X_train = transform_pipeline.fit_transform(X_train, y_train)
X_test = transform_pipeline.transform(X_test)


#### Train model and predict

In [15]:
# Fit the search-backed model on the already-transformed training frame; the
# log printed below shows that this runs an Optuna hyperparameter search.
obj.fit(X_train,y_train)
# Predict labels for the transformed hold-out features.
y_pred = obj.predict(X_test)

  self.__newoptuna_search = OptunaSearchCV(**self.newoptuna_search_kwargs)
[I 2023-07-03 19:21:22,383] A new study created in memory with name: no-name-c2b13680-8c9a-43d1-9c25-cb44d7fd867b
[I 2023-07-03 19:21:22,384] Searching the best hyperparameters using 21815 samples...
[I 2023-07-03 19:21:39,855] Trial 2 finished with value: 0.6697876382386865 and parameters: {'booster': 'gbtree', 'max_depth': 13, 'gamma': 0.6775355284721216, 'subsample': 0.8610329265730301}. Best is trial 2 with value: 0.6697876382386865.
[I 2023-07-03 19:21:41,107] Trial 7 finished with value: 0.6675482398809633 and parameters: {'booster': 'gbtree', 'max_depth': 14, 'gamma': 0.5664690093539809, 'subsample': 0.8533594799775146}. Best is trial 2 with value: 0.6697876382386865.
[I 2023-07-03 19:21:58,467] Trial 9 finished with value: 0.6754430115573766 and parameters: {'booster': 'gbtree', 'max_depth': 10, 'gamma': 0.5798334910474501, 'subsample': 0.941106938685875}. Best is trial 9 with value: 0.6754430115573766.


#### Check performance of the model

In [16]:
def _show(heading, metric_fn):
    """Print a heading, then the given metric evaluated on the hold-out split."""
    print(heading)
    print(metric_fn(y_test, y_pred))


_show('F1 score : ', f1_score)
_show('Classification report : ', classification_report)
_show('Confusion matrix : ', confusion_matrix)

F1 score : 
0.6945521322179147
Classification report : 
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8158
           1       0.74      0.66      0.69      2588

    accuracy                           0.86     10746
   macro avg       0.82      0.79      0.80     10746
weighted avg       0.86      0.86      0.86     10746

Confusion matrix : 
[[7547  611]
 [ 886 1702]]


In [17]:
# Ask the fitted model for its best estimator; the return value is the cell output.
# NOTE(review): presumably the tuned XGBClassifier — confirm against the lohrasb docs.
obj.get_best_estimator()

In [18]:
# Inspect the best_estimator attribute of the fitted model (shown as the cell output).
obj.best_estimator

#### Get fitted search object and its attributes

In [19]:
# Keep a handle on the underlying fitted search object.
# NOTE(review): looks like an OptunaSearchCV instance, judging by the fit log — confirm.
NewSearchObj = obj.get_optimized_object()

In [20]:
# Display the search object's repr as the cell output.
NewSearchObj