#### Install required libraries

In [1]:
! pip install git+https://github.com/TorkamaniLab/lohrasb.git --force-reinstall
! pip install pandas  category_encoders  feature-engine optuna

Collecting git+https://github.com/TorkamaniLab/lohrasb.git
  Cloning https://github.com/TorkamaniLab/lohrasb.git to /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-req-build-8ezxq23l
  Running command git clone --filter=blob:none --quiet https://github.com/TorkamaniLab/lohrasb.git /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-req-build-8ezxq23l
  Resolved https://github.com/TorkamaniLab/lohrasb.git to commit 5216eb6563dc3152c5cc6d44d2488a8c614ccb80
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting aiosignal==1.3.1 (from lohrasb==4.1.0)
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting alembic==1.12.0 (from lohrasb==4.1.0)
  Obtaining dependency information for alembic==1.12.0 from https://files.pythonhosted.org/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl.metadata
  Using cached alembic-1.12.0-py3-none-any.whl.metadata (7.2 kB)
Collecting argcomplete==3.1.1 (from l

#### Import libraries and print their versions

In [2]:
# Standard library imports
import sys  # For system-related utilities like getting Python version

# Third-party library imports
import numpy as np  # Aliased for better readability
import pandas as pd  # Aliased for better readability
import optuna  # For optimization
import sklearn  # Scikit-learn

# Scikit-learn specific imports
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Optuna specific imports
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler

# Third-party library specific imports for feature engineering
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from category_encoders import OrdinalEncoder

# LightGBM specific imports
from lightgbm import *  # Ideally, list specific imports instead of '*'

# Local (or application-specific) imports
import lohrasb
from lohrasb.best_estimator import BaseModel
from lohrasb.utils.metrics import f1_plus_tn

# Report the interpreter and key library versions so the environment behind
# the recorded outputs is visible; the lines are assembled first and written once.
version_report = [
    f'Python version : {sys.version}',
    f'lohrasb version : {lohrasb.__version__}',
    f'sklearn version : {sklearn.__version__}',
    f'pandas version : {pd.__version__}',  # pd is the pandas alias
    f'numpy version : {np.__version__}',
    f'optuna version : {optuna.__version__}',
]
print('\n'.join(version_report))


  from .autonotebook import tqdm as notebook_tqdm


[Errno 2] No such file or directory: '/Users/hjavedani/.pyenv/versions/3.7.8/lib/python3.7/site-packages/lohrasb/config.yaml'
In this module, the default logging will be applied. The error is [Errno 2] No such file or directory: '/Users/hjavedani/.pyenv/versions/3.7.8/lib/python3.7/site-packages/lohrasb/config.yaml' which will be skipped!
default logger setting is applied !


2023-09-03 13:02:01,079	INFO util.py:90 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-09-03 13:02:02,059	INFO util.py:90 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Python version : 3.7.8 (default, Feb 27 2023, 18:11:31) 
[Clang 14.0.0 (clang-1400.0.29.202)]
lohrasb version : 4.1.0
sklearn version : 1.0.2
pandas version : 1.3.5
numpy version : 1.21.6
optuna version : 3.3.0


#### Example 1 : Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Part 1: Use BaseModel in a sklearn pipeline


In [3]:
# Location of the raw Adult records (comma-separated, read without a header row)
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names are supplied here because the file is read with header=None;
# "label" is the column that is turned into the prediction target later on.
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]

# Download and parse the file, then preview the first rows
data = pd.read_csv(urldata, names=col_names, header=None, sep=',')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [4]:
# Encode the income bracket as a binary target: 0 for "<=50K", 1 for ">50K".
# Both the bare spelling and the one with a leading space are matched.
is_low_income = data['label'].isin(['<=50K', ' <=50K'])
is_high_income = data['label'].isin(['>50K', ' >50K'])

data.loc[is_low_income, 'label'] = 0
data.loc[is_high_income, 'label'] = 1

# The column still has object dtype after the replacement; make it integer
data['label'] = data['label'].astype(int)

#### Train test split

In [5]:
# Separate the predictors from the target column; y becomes a flat array
X = data.drop(columns="label")
y = data["label"].values.ravel()

# Hold out 33% of the rows for the final evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# for sample_weights: a uniform weight of 1 per training row
weights = np.ones(len(y_train))


#### Find feature types for later use

In [6]:
# Column names grouped by dtype family, used to configure the imputers
int_cols, float_cols, cat_cols = (
    X_train.select_dtypes(include=[dtype_family]).columns.tolist()
    for dtype_family in ('int', 'float', 'object')
)


#### Define the model and set its arguments

In [7]:
# Estimator to tune; its hyper-parameters come from the search space below.
estimator = LGBMClassifier()

# Search space. In the recorded trial logs the two-element numeric lists are
# sampled as [low, high] ranges, while one-element lists stay fixed.
estimator_params = {
    "boosting_type": ["gbdt"],
    "max_depth": [6, 15],
    "learning_rate": [0.01, 0.1],
    "random_state": [42],
}

# Everything BaseModel.optimize_by_optuna needs, grouped by the stage it configures.
kwargs = {
    # params for fit method
    'fit_optuna_kwargs': {
        'sample_weight': None,
    },
    # params for OptunaSearch
    'main_optuna_kwargs': {
        'estimator': estimator,
        'estimator_params': estimator_params,
        'refit': True,
        # metric given as an expression string; presumably evaluated by lohrasb
        # to score each trial — confirm against the lohrasb documentation
        'measure_of_accuracy': 'f1_score(y_true, y_pred,average="weighted")',
    },
    # inner train/validation split used while searching
    # NOTE(review): no random_state is set here, and the two recorded runs show
    # different class counts in the training part, so scores still vary between
    # runs. Presumably these kwargs are forwarded to sklearn's train_test_split —
    # confirm before adding 'random_state'.
    'train_test_split_kwargs': {
        'test_size': .3,
    },
    # params for creating the Optuna study
    'study_search_kwargs': {
        'storage': None,
        # seeded so the sampler's hyper-parameter proposals are repeatable
        # after Restart & Run All
        'sampler': TPESampler(seed=42),
        'pruner': HyperbandPruner(),
        'study_name': "example of optuna optimizer",
        'direction': "maximize",
        'load_if_exists': False,
    },
    'optimize_kwargs': {
        # optuna optimization params
        'n_trials': 20,
        'timeout': 600,
        'catch': (),
        'callbacks': None,
        'gc_after_trial': False,
        'show_progress_bar': False,
    },
}




In [8]:
# Wrap the estimator and search settings in an Optuna-driven model. The study
# log only appears under the later fit cells, so the search starts on fit.
model_factory = BaseModel()
obj = model_factory.optimize_by_optuna(kwargs=kwargs)

#### Build sklearn pipeline

In [9]:


# Ordered steps: impute -> encode -> tuned classifier
pipeline_steps = [
    # fill missing values in the integer columns with each column's median
    ('intimputer', MeanMedianImputer(imputation_method='median', variables=int_cols)),
    # fill missing values in the categorical (object-dtype) columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # encode the categorical columns for the model
    ('catencoder', OrdinalEncoder()),
    # Optuna-backed classifier built in the previous cell
    ('obj', obj),
]
pipeline = Pipeline(steps=pipeline_steps)
 



#### Run Pipeline

In [10]:
# Fit every step on the training rows (the final step runs the Optuna search
# logged below), then predict the held-out rows with the fitted pipeline.
y_preds = pipeline.fit(X_train, y_train).predict(X_test)

# Round each prediction to the nearest whole number to get class labels
pred_labels = np.rint(y_preds)




[I 2023-09-03 13:02:04,122] A new study created in memory with name: example of optuna optimizer
[I 2023-09-03 13:02:04,274] Trial 0 finished with value: 0.8640178613864075 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.09714552853914048, 'random_state': 42}. Best is trial 0 with value: 0.8640178613864075.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:04,425] Trial 1 finished with value: 0.8650304248596747 and parameters: {'boosting_type': 'gbdt', 'max_depth': 14, 'learning_rate': 0.07702256557671801, 'random_state': 42}. Best is trial 1 with value: 0.8650304248596747.
[I 2023-09-03 13:02:04,541] Trial 2 finished with value: 0.8676592089151163 and parameters: {'boosting_type': 'gbdt', 'max_depth': 11, 'learning_rate': 0.0627595129928241, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:04,685] Trial 3 finished with value: 0.8579455578833927 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.034872502772002704, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.
[I 2023-09-03 13:02:04,813] Trial 4 finished with value: 0.866122192465431 and parameters: {'boosting_type': 'gbdt', 'max_depth': 14, 'learning_rate': 0.06695344970218178, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:04,921] Trial 5 finished with value: 0.8671689506224992 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.07648022674297933, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.
[I 2023-09-03 13:02:05,045] Trial 6 finished with value: 0.8633860209140406 and parameters: {'boosting_type': 'gbdt', 'max_depth': 7, 'learning_rate': 0.05297887996959303, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:05,170] Trial 7 finished with value: 0.864975283057365 and parameters: {'boosting_type': 'gbdt', 'max_depth': 15, 'learning_rate': 0.033967815508004824, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.
[I 2023-09-03 13:02:05,296] Trial 8 finished with value: 0.866860300916702 and parameters: {'boosting_type': 'gbdt', 'max_depth': 14, 'learning_rate': 0.09643151134772379, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:05,400] Trial 9 finished with value: 0.8638036273505912 and parameters: {'boosting_type': 'gbdt', 'max_depth': 7, 'learning_rate': 0.05208442003457855, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.
[I 2023-09-03 13:02:05,517] Trial 10 finished with value: 0.840227068958231 and parameters: {'boosting_type': 'gbdt', 'max_depth': 11, 'learning_rate': 0.011795887564320387, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:05,627] Trial 11 finished with value: 0.8654473400293128 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.07611939069130716, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.
[I 2023-09-03 13:02:05,743] Trial 12 finished with value: 0.8660311805360866 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.06765122395000564, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:05,905] Trial 13 finished with value: 0.866304078182473 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.08470081390118173, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.
[I 2023-09-03 13:02:06,025] Trial 14 finished with value: 0.86652313533367 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.06168116738893913, 'random_state': 42}. Best is trial 2 with value: 0.8676592089151163.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:06,138] Trial 15 finished with value: 0.8682525880450834 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.08538835877736589, 'random_state': 42}. Best is trial 15 with value: 0.8682525880450834.
[I 2023-09-03 13:02:06,251] Trial 16 finished with value: 0.8659465639895738 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.09035009077058441, 'random_state': 42}. Best is trial 15 with value: 0.8682525880450834.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:06,369] Trial 17 finished with value: 0.8660925348681009 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.08480580778152427, 'random_state': 42}. Best is trial 15 with value: 0.8682525880450834.
[I 2023-09-03 13:02:06,480] Trial 18 finished with value: 0.8651860909479012 and parameters: {'boosting_type': 'gbdt', 'max_depth': 13, 'learning_rate': 0.09924888214624819, 'random_state': 42}. Best is trial 15 with value: 0.8682525880450834.


[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284
[LightGBM] [Info] Number of positive: 3736, number of negative: 11534
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244663 -> initscore=-1.127284
[LightGBM] [Info] Start training from score -1.127284


[I 2023-09-03 13:02:06,595] Trial 19 finished with value: 0.8669494329668511 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.08547094009924186, 'random_state': 42}. Best is trial 15 with value: 0.8682525880450834.


[LightGBM] [Info] Number of positive: 5291, number of negative: 16524
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 21815, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242540 -> initscore=-1.138807
[LightGBM] [Info] Start training from score -1.138807


#### Check performance of the pipeline

In [11]:
# Each section prints its heading first, then the metric computed on the
# held-out labels versus the rounded predictions.
evaluation_sections = (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
)
for heading, metric in evaluation_sections:
    print(heading)
    print(metric(y_test, pred_labels))


F1 score : 
0.7104033970276008
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8196
           1       0.77      0.66      0.71      2550

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.81     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7709  487]
 [ 877 1673]]


#### Some estimators have a `predict_proba` method as well

In [12]:
# Ask the fitted pipeline for probability estimates instead of hard labels.
# Note: this rebinds y_preds, which previously held the class predictions.
class_probabilities = pipeline.predict_proba(X_test)
y_preds = class_probabilities
print(y_preds)

[[0.99214398 0.00785602]
 [0.65613206 0.34386794]
 [0.38407584 0.61592416]
 ...
 [0.71796618 0.28203382]
 [0.68588175 0.31411825]
 [0.97986241 0.02013759]]


#### Part 2: Another way of using it


In [13]:
# Recreate the same split as in Part 1 (same test size and seed) so this part
# starts from untransformed training and test frames.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


#### Transform features to make them ready for model input

In [14]:
# Preprocessing only: the same three steps as the Part 1 pipeline, without a
# final model step.
transform_steps = [
    # fill missing values in the integer columns with each column's median
    ('intimputer', MeanMedianImputer(imputation_method='median', variables=int_cols)),
    # fill missing values in the categorical (object-dtype) columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # encode the categorical columns for the model
    ('catencoder', OrdinalEncoder()),
]
transform_pipeline = Pipeline(steps=transform_steps)

#### Transform X_train and X_test

In [15]:
# Fit the preprocessing steps on the training rows only, then apply the fitted
# steps to the test rows.
# Both names are overwritten with the transformed data, so re-run the split cell
# before re-running this one; otherwise already-transformed data is fed back in.
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [16]:
# Fit the Optuna-backed model directly on the transformed features; the log below
# shows a new study being created, so the hyper-parameter search runs again here.
obj.fit(X_train,y_train)
# Predictions for the transformed test rows (Part 1 stored its predictions in y_preds)
y_pred = obj.predict(X_test)

[I 2023-09-03 13:02:07,167] A new study created in memory with name: example of optuna optimizer
[I 2023-09-03 13:02:07,276] Trial 0 finished with value: 0.8685218550766183 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.09352039527574546, 'random_state': 42}. Best is trial 0 with value: 0.8685218550766183.
[I 2023-09-03 13:02:07,371] Trial 1 finished with value: 0.8680368355659926 and parameters: {'boosting_type': 'gbdt', 'max_depth': 7, 'learning_rate': 0.09420294565711353, 'random_state': 42}. Best is trial 0 with value: 0.8685218550766183.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:07,492] Trial 2 finished with value: 0.8698592506234561 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.04719097359520171, 'random_state': 42}. Best is trial 2 with value: 0.8698592506234561.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:07,601] Trial 3 finished with value: 0.8497426270277023 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.02202578908052913, 'random_state': 42}. Best is trial 2 with value: 0.8698592506234561.
[I 2023-09-03 13:02:07,699] Trial 4 finished with value: 0.8700923074326851 and parameters: {'boosting_type': 'gbdt', 'max_depth': 7, 'learning_rate': 0.0644922038154149, 'random_state': 42}. Best is trial 4 with value: 0.8700923074326851.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:07,807] Trial 5 finished with value: 0.868673305004951 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.037441786479100854, 'random_state': 42}. Best is trial 4 with value: 0.8700923074326851.
[I 2023-09-03 13:02:07,912] Trial 6 finished with value: 0.8698969605353196 and parameters: {'boosting_type': 'gbdt', 'max_depth': 15, 'learning_rate': 0.09058244776468585, 'random_state': 42}. Best is trial 4 with value: 0.8700923074326851.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:08,023] Trial 7 finished with value: 0.8683446154348955 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.0503503617069211, 'random_state': 42}. Best is trial 4 with value: 0.8700923074326851.
[I 2023-09-03 13:02:08,130] Trial 8 finished with value: 0.8706330040353975 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.08017426990579991, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:08,236] Trial 9 finished with value: 0.8693519269954058 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.04295471203154003, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.
[I 2023-09-03 13:02:08,353] Trial 10 finished with value: 0.8692919889796312 and parameters: {'boosting_type': 'gbdt', 'max_depth': 13, 'learning_rate': 0.06960945939084873, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:08,506] Trial 11 finished with value: 0.8677837442833785 and parameters: {'boosting_type': 'gbdt', 'max_depth': 8, 'learning_rate': 0.06814310189908579, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.
[I 2023-09-03 13:02:08,631] Trial 12 finished with value: 0.8685251331608812 and parameters: {'boosting_type': 'gbdt', 'max_depth': 11, 'learning_rate': 0.070382753992882, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:08,740] Trial 13 finished with value: 0.868755047114198 and parameters: {'boosting_type': 'gbdt', 'max_depth': 8, 'learning_rate': 0.07897711926583689, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.
[I 2023-09-03 13:02:08,858] Trial 14 finished with value: 0.8686888265589969 and parameters: {'boosting_type': 'gbdt', 'max_depth': 14, 'learning_rate': 0.05932193896335917, 'random_state': 42}. Best is trial 8 with value: 0.8706330040353975.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:08,979] Trial 15 finished with value: 0.8706981396720325 and parameters: {'boosting_type': 'gbdt', 'max_depth': 11, 'learning_rate': 0.0820590471373236, 'random_state': 42}. Best is trial 15 with value: 0.8706981396720325.
[I 2023-09-03 13:02:09,091] Trial 16 finished with value: 0.8695047382317322 and parameters: {'boosting_type': 'gbdt', 'max_depth': 11, 'learning_rate': 0.08065777441075318, 'random_state': 42}. Best is trial 15 with value: 0.8706981396720325.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:09,209] Trial 17 finished with value: 0.8690484089358681 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.0809112121607618, 'random_state': 42}. Best is trial 15 with value: 0.8706981396720325.
[I 2023-09-03 13:02:09,319] Trial 18 finished with value: 0.8709103699147239 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.09827309893283051, 'random_state': 42}. Best is trial 18 with value: 0.8709103699147239.


[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028
[LightGBM] [Info] Number of positive: 3717, number of negative: 11553
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 15270, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243418 -> initscore=-1.134028
[LightGBM] [Info] Start training from score -1.134028


[I 2023-09-03 13:02:09,445] Trial 19 finished with value: 0.8698639987336854 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.09760717421166447, 'random_state': 42}. Best is trial 18 with value: 0.8709103699147239.


[LightGBM] [Info] Number of positive: 5291, number of negative: 16524
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 21815, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242540 -> initscore=-1.138807
[LightGBM] [Info] Start training from score -1.138807


#### Check performance of the model

In [17]:
# Report how the tuned model performs on the held-out test predictions:
# the custom optimization metric first, then the standard scikit-learn views.
# NOTE(review): `f1_plus_tn` is defined outside this cell; judging by the
# recorded output it is the true-negative count plus the F1 score, so its
# value is not on a 0-1 scale -- confirm against its definition.
print('F1 score plus TN : ')
print(f1_plus_tn(y_test,pred_labels))
print('F1 score : ')
print(f1_score(y_test,pred_labels))
# Per-class precision/recall/F1 and support.
print('Classification report : ')
print(classification_report(y_test,pred_labels))
# Rows are true labels, columns are predicted labels (scikit-learn convention).
print('Confusion matrix : ')
print(confusion_matrix(y_test,pred_labels))

F1 score plus TN : 
7709.710403397027
F1 score : 
0.7104033970276008
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8196
           1       0.77      0.66      0.71      2550

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.81     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7709  487]
 [ 877 1673]]


In [18]:
# Display the best estimator found by the search, retrieved through the
# getter method; as the cell's last expression its repr is rendered below.
obj.get_best_estimator()

LGBMClassifier(learning_rate=0.09827309893283051, max_depth=10, random_state=42)

In [19]:
# Display the best estimator through the `best_estimator` attribute.
# NOTE(review): the recorded repr is identical to what `get_best_estimator()`
# printed, so the two access paths appear equivalent -- confirm in the library.
obj.best_estimator

LGBMClassifier(learning_rate=0.09827309893283051, max_depth=10, random_state=42)

#### Get fitted search object and its attributes

In [20]:
# Retrieve the object produced by the hyper-parameter search and print it.
# NOTE(review): in the recorded run this prints an Optuna `FrozenTrial`
# (the best trial) rather than a `Study` -- confirm that is the intended
# return value of `get_optimized_object()`.
OptunaObj = obj.get_optimized_object()
print(OptunaObj)

FrozenTrial(number=18, state=TrialState.COMPLETE, values=[0.8709103699147239], datetime_start=datetime.datetime(2023, 9, 3, 13, 2, 9, 210973), datetime_complete=datetime.datetime(2023, 9, 3, 13, 2, 9, 319547), params={'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.09827309893283051, 'random_state': 42}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'boosting_type': CategoricalDistribution(choices=('gbdt',)), 'max_depth': IntDistribution(high=15, log=False, low=6, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'random_state': IntDistribution(high=42, log=False, low=42, step=1)}, trial_id=18, value=None)
