#### Install required libraries

In [1]:
! pip install git+https://github.com/TorkamaniLab/lohrasb.git --force-reinstall
! pip install pandas  category_encoders  feature-engine optuna xgboost

Collecting git+https://github.com/TorkamaniLab/lohrasb.git
  Cloning https://github.com/TorkamaniLab/lohrasb.git to /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-req-build-ldi_4fi1
  Running command git clone --filter=blob:none --quiet https://github.com/TorkamaniLab/lohrasb.git /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-req-build-ldi_4fi1
  Resolved https://github.com/TorkamaniLab/lohrasb.git to commit 5216eb6563dc3152c5cc6d44d2488a8c614ccb80
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting aiosignal==1.3.1 (from lohrasb==4.1.0)
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting alembic==1.12.0 (from lohrasb==4.1.0)
  Obtaining dependency information for alembic==1.12.0 from https://files.pythonhosted.org/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl.metadata
  Using cached alembic-1.12.0-py3-none-any.whl.metadata (7.2 kB)
Collecting argcomplete==3.1.1 (from l

#### Import required libraries and print their versions

In [9]:
# Standard library imports
import sys  # For system-related utilities like getting Python version

# Third-party library imports
import numpy as np  # Aliased for better readability
import pandas as pd  # Aliased for better readability
import optuna  # For optimization
import sklearn  # Scikit-learn

# Scikit-learn specific imports
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline

# Optuna specific imports
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler

# Third-party library specific imports for feature engineering
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from category_encoders import OrdinalEncoder

# LightGBM specific imports
from lightgbm import *  # Ideally, list specific imports instead of '*'

# Local (or application-specific) imports
import lohrasb
from lohrasb.best_estimator import BaseModel
from lohrasb.utils.metrics import f1_plus_tn
import xgboost
import lightgbm
# Record the interpreter and key library versions so the run can be reproduced.
print(f'Python version : {sys.version}')
for library_name, library in (
    ('lohrasb', lohrasb),
    ('sklearn', sklearn),
    ('pandas', pd),
    ('numpy', np),
    ('xgboost', xgboost),
    ('lightgbm', lightgbm),
    ('optuna', optuna),
):
    # __version__ is looked up one library at a time, in the printed order.
    print(f'{library_name} version : {library.__version__}')


Python version : 3.7.8 (default, Feb 27 2023, 18:11:31) 
[Clang 14.0.0 (clang-1400.0.29.202)]
lohrasb version : 4.1.0
sklearn version : 1.0.2
pandas version : 1.3.5
numpy version : 1.21.6
xgboost version : 1.6.2
lightgbm version : 4.0.0
optuna version : 3.3.0


#### Example: Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Part 1: Use BaseModel in a sklearn pipeline


In [10]:
# Location of the UCI Adult training file.
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# The file is read with header=None, so the column names are supplied here.
col_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "label",
]

# skipinitialspace=True drops the blank that follows each delimiter, so string
# values are loaded as e.g. "<=50K" rather than " <=50K".
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=',',
    skipinitialspace=True,
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [11]:
# Encode the income bracket as a binary target: 0 for "<=50K", 1 for ">50K".
# Each class is matched both with and without a leading space.
label_spellings = (
    (0, ['<=50K', ' <=50K']),
    (1, ['>50K', ' >50K']),
)
for class_code, spellings in label_spellings:
    data.loc[data['label'].isin(spellings), 'label'] = class_code

# Values that matched none of the spellings are left as-is, so this cast
# raises if an unexpected label is present.
data['label'] = data['label'].astype(int)

#### Train test split

In [12]:
# Separate predictors from the target. y is kept as a one-column DataFrame
# (not a Series); the stratification below indexes it as y['label'].
is_target = data.columns == "label"
X = data.loc[:, ~is_target]
y = data.loc[:, is_target]

# Hold out 33% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y['label'],
    random_state=42,
)


#### Find feature types for later use

In [13]:
# Group the training columns by dtype; the imputers below are given
# int_cols and cat_cols as their `variables`.
int_cols, float_cols, cat_cols = (
    X_train.select_dtypes(include=[dtype_name]).columns.tolist()
    for dtype_name in ('int', 'float', 'object')
)


#### Define estimator and set its arguments 


In [14]:
# Base learner whose hyper-parameters will be tuned.
estimator = xgboost.XGBClassifier()

# Search space: one Optuna distribution per XGBClassifier parameter.
estimator_params = dict(
    booster=optuna.distributions.CategoricalDistribution(choices=("gbtree", "dart")),
    max_depth=optuna.distributions.IntDistribution(10, 15),
    gamma=optuna.distributions.FloatDistribution(0.5, 1.2, log=True),
    subsample=optuna.distributions.FloatDistribution(0.8, 1.0),
)

    

In [15]:
# Settings handed to BaseModel().optimize_by_optunasearchcv, in three groups.
kwargs = dict(
    # extra params of model if any
    main_newoptuna_kwargs={},
    # params for fit method or fit_params
    fit_newoptuna_kwargs=dict(sample_weight=None),
    # arguments for the search object (OptunaSearchCV)
    newoptuna_search_kwargs=dict(
        estimator=estimator,
        param_distributions=estimator_params,
        scoring='f1',
        verbose=3,
        n_jobs=-1,
        cv=KFold(2),
    ),
)


In [16]:

# Build the Optuna-backed model wrapper from the settings above; it is used
# below both as the last pipeline step and on its own.
obj = BaseModel().optimize_by_optunasearchcv(kwargs=kwargs)

#### Build sklearn pipeline

In [17]:


# Preprocessing steps followed by the tuned classifier, run in this order.
pipeline_steps = [
    # fill missing values in integer columns with the column median
    (
        'intimputer',
        MeanMedianImputer(imputation_method='median', variables=int_cols),
    ),
    # fill missing values in categorical columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # encode categories as ordinal integers
    ('catencoder', OrdinalEncoder()),
    # classification model (hyper-parameter search wrapper)
    ('obj', obj),
]
pipeline = Pipeline(steps=pipeline_steps)


#### Run Pipeline

In [18]:
# Fit preprocessing + search on the training split, then score the held-out
# rows. Pipeline.fit returns the pipeline itself, so the calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


  self.__newoptuna_search = OptunaSearchCV(**self.newoptuna_search_kwargs)
[I 2023-09-03 13:23:13,398] A new study created in memory with name: no-name-383407c5-a8a2-458a-9282-84532292259f
[I 2023-09-03 13:23:13,399] Searching the best hyperparameters using 21815 samples...
[I 2023-09-03 13:23:27,518] Trial 2 finished with value: 0.6678060224431778 and parameters: {'booster': 'gbtree', 'max_depth': 11, 'gamma': 0.8315260576584292, 'subsample': 0.8275404843286694}. Best is trial 2 with value: 0.6678060224431778.
[I 2023-09-03 13:23:28,359] Trial 0 finished with value: 0.67123141514338 and parameters: {'booster': 'gbtree', 'max_depth': 12, 'gamma': 0.6616134823447023, 'subsample': 0.8851403448808278}. Best is trial 0 with value: 0.67123141514338.
[I 2023-09-03 13:23:31,301] Trial 7 finished with value: 0.6768876451988424 and parameters: {'booster': 'gbtree', 'max_depth': 14, 'gamma': 1.0138452558021576, 'subsample': 0.9353238156930084}. Best is trial 7 with value: 0.6768876451988424.
[I 

#### Check performance of the pipeline

In [19]:
# Print each metric under its heading: F1, per-class report, confusion matrix.
for heading, metric in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


F1 score : 
0.6862061897929904
Classification report : 
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      8158
           1       0.73      0.65      0.69      2588

    accuracy                           0.86     10746
   macro avg       0.81      0.79      0.80     10746
weighted avg       0.85      0.86      0.85     10746

Confusion matrix : 
[[7541  617]
 [ 914 1674]]


#### Part 2: Another way of using it


In [20]:
# Re-create the split from the untransformed X and y, with the same
# arguments and seed as in Part 1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y['label'],
    random_state=42,
)

#### Transform features to make them ready for model input

In [21]:
# Preprocessing only; no model step. The classifier is fitted separately
# on the transformed features.
transform_steps = [
    # fill missing values in integer columns with the column median
    (
        'intimputer',
        MeanMedianImputer(imputation_method='median', variables=int_cols),
    ),
    # fill missing values in categorical columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # encode categories as ordinal integers
    ('catencoder', OrdinalEncoder()),
]
transform_pipeline = Pipeline(steps=transform_steps)

#### Transform X_train and X_test

In [22]:
# Fit the preprocessing steps on the training split and transform it, then
# apply the already-fitted steps to the test split.
# NOTE: X_train / X_test are overwritten with their transformed versions, so
# re-running this cell without re-running the split cell above would feed
# already-transformed data back into the pipeline.
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [23]:
# Fit the search wrapper directly on the transformed training features (this
# runs the Optuna trials), then predict labels for the transformed test set.
obj.fit(X_train,y_train)
y_pred = obj.predict(X_test)

  self.__newoptuna_search = OptunaSearchCV(**self.newoptuna_search_kwargs)
[I 2023-09-03 13:23:57,975] A new study created in memory with name: no-name-33ab41e5-1c67-4f61-bec8-02eb887f8083
[I 2023-09-03 13:23:57,976] Searching the best hyperparameters using 21815 samples...
[I 2023-09-03 13:24:10,188] Trial 1 finished with value: 0.6771463667986065 and parameters: {'booster': 'gbtree', 'max_depth': 11, 'gamma': 1.1766401468093488, 'subsample': 0.9165907231034633}. Best is trial 1 with value: 0.6771463667986065.
[I 2023-09-03 13:24:10,355] Trial 3 finished with value: 0.6707302472107444 and parameters: {'booster': 'gbtree', 'max_depth': 11, 'gamma': 0.5355235941711048, 'subsample': 0.876907233141069}. Best is trial 1 with value: 0.6771463667986065.
[I 2023-09-03 13:24:10,490] Trial 0 finished with value: 0.6709536731957079 and parameters: {'booster': 'gbtree', 'max_depth': 11, 'gamma': 0.6534510289356901, 'subsample': 0.8606253264562219}. Best is trial 1 with value: 0.6771463667986065.


#### Check performance of the model

In [24]:
# Print each metric under its heading: F1, per-class report, confusion matrix.
for heading, metric in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

F1 score : 
0.6872174270448006
Classification report : 
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8158
           1       0.73      0.65      0.69      2588

    accuracy                           0.86     10746
   macro avg       0.81      0.79      0.80     10746
weighted avg       0.85      0.86      0.86     10746

Confusion matrix : 
[[7552  606]
 [ 916 1672]]


In [25]:
# Show the estimator configured with the best hyper-parameters found by the search.
obj.get_best_estimator()

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=1.1766401468093488, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=11,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [26]:
# Attribute form; in this run it displays the same estimator as get_best_estimator().
obj.best_estimator

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=1.1766401468093488, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=11,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

#### Get fitted search object and its attributes

In [27]:
# Keep a handle on the fitted search object (an OptunaSearchCV in this run).
NewSearchObj = obj.get_optimized_object()

In [28]:
# Display the search object's repr (cv, estimator, search space, ...).
NewSearchObj

OptunaSearchCV(cv=KFold(n_splits=2, random_state=None, shuffle=False),
               estimator=XGBClassifier(base_score=None, booster=None,
                                       callbacks=None, colsample_bylevel=None,
                                       colsample_bynode=None,
                                       colsample_bytree=None,
                                       early_stopping_rounds=None,
                                       enable_categorical=False,
                                       eval_metric=None, gamma=None,
                                       gpu_id=None, grow_policy=None,
                                       importance_type=None,
                                       interaction_const...
                                       random_state=None, reg_alpha=None,
                                       reg_lambda=None, ...),
               n_jobs=-1,
               param_distributions={'booster': CategoricalDistribution(choices=('gbtree', 'dart')),
  