In [1]:
from lohrasb.best_estimator import BaseModel
import xgboost
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
import numpy as np
import optuna
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, mean_absolute_error
from sklearn.linear_model import *
from sklearn.svm import *
from xgboost import *
from sklearn.linear_model import *
from lightgbm import *
from sklearn.neural_network import *
from imblearn.ensemble import *
from sklearn.ensemble import *



# Example 1 : Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

# Part 1: Use BaseModel in a scikit-learn pipeline


In [2]:
# Location of the raw UCI Adult census file.
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# The file is read with header=None, so the column names are supplied here.
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]

# Load the comma-separated file and preview the first rows.
data = pd.read_csv(urldata, header=None, names=col_names, sep=',')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
# Raw label strings mapped to their binary codes. Each label is matched both
# with and without a leading space.
label_codes = {
    '<=50K': 0,
    ' <=50K': 0,
    '>50K': 1,
    ' >50K': 1,
}
for raw_value, code in label_codes.items():
    data.loc[data['label'] == raw_value, 'label'] = code

# The column still has its original dtype after the replacements; make it integer.
data['label'] = data['label'].astype(int)

#### Train test split

In [4]:
# Features are every column except the target; the target becomes a flat 1-D array.
X = data.drop(columns="label")
y = data["label"].to_numpy()

# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


#### Find feature types for later use

In [5]:
# Column names of the training frame grouped by dtype family, in the order
# integer, float, object (string/categorical).
int_cols, float_cols, cat_cols = (
    X_train.select_dtypes(include=[dtype_name]).columns.tolist()
    for dtype_name in ('int', 'float', 'object')
)


#### Define the model and set its arguments

In [6]:
# Estimator whose hyper-parameters will be tuned.
estimator = LGBMClassifier()

# Candidate values per hyper-parameter.
# Each key appears exactly once: in a dict literal a repeated key silently
# overrides the earlier entry, so the former duplicate entries
# ("boosting_type": ["gbdt", "rf"] and "learning_rate": [0.1, 0.5]) never took
# effect. The values below are the ones that were actually in force.
estimator_params = {
    "boosting_type": ["gbdt"],
    "max_depth": [4, 5],
    "learning_rate": [0.1],
    "random_state": [42],
    "importance_type": ["split"],
}


In [7]:
# Build the Optuna-backed tuner around `estimator` and its candidate parameters.
# NOTE(review): the meaning of the lohrasb-specific arguments (with_stratified,
# test_size, add_extra_args_for_measure_of_accuracy, ...) is not visible in this
# notebook — confirm against the lohrasb documentation.
obj = BaseModel().optimize_by_optuna(
    estimator=estimator,
    estimator_params=estimator_params,
    measure_of_accuracy="f1_score",
    with_stratified=True,
    test_size=.3,
    add_extra_args_for_measure_of_accuracy=False,
    verbose=3,
    n_jobs=-1,
    random_state=42,
    # Optuna study: kept in memory (storage=None) and set to maximise the score.
    study=optuna.create_study(
        storage=None,
        # Seed the sampler like every other random component of the notebook so
        # the suggested parameters are repeatable. With parallel trials
        # (study_optimize_n_jobs=-1) the completion order can still vary.
        sampler=TPESampler(seed=42),
        pruner=HyperbandPruner(),
        study_name="example of optuna optimizer",
        direction="maximize",
        load_if_exists=False,
        directions=None,
    ),
    # Optuna optimisation settings: at most 20 trials or 600 seconds.
    study_optimize_objective=None,
    study_optimize_objective_n_trials=20,
    study_optimize_objective_timeout=600,
    study_optimize_n_jobs=-1,
    study_optimize_catch=(),
    study_optimize_callbacks=None,
    study_optimize_gc_after_trial=False,
    study_optimize_show_progress_bar=False,
)

[32m[I 2022-11-11 19:10:15,652][0m A new study created in memory with name: example of optuna optimizer[0m


#### Build sklearn pipeline

In [8]:


# End-to-end pipeline: impute, encode, then hand the features to the tuner.
pipeline = Pipeline(
    steps=[
        # fill missing values in the integer columns with the column median
        (
            'intimputer',
            MeanMedianImputer(imputation_method='median', variables=int_cols),
        ),
        # fill missing values in the categorical columns
        ('catimputer', CategoricalImputer(variables=cat_cols)),
        # turn categorical values into ordinal integer codes
        ('catencoder', OrdinalEncoder()),
        # final step: the Optuna-tuned classification model
        ('obj', obj),
    ]
)
 



#### Run Pipeline

In [9]:
# Fit every pipeline step on the training split, then predict the held-out rows.
y_preds = pipeline.fit(X_train, y_train).predict(X_test)

# Round the predictions to the nearest integer so they can be scored as labels.
pred_labels = np.rint(y_preds)




{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:17,475][0m Trial 1 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 1 with value: 0.6962384669978708.[0m
[32m[I 2022-11-11 19:10:17,492][0m Trial 5 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 1 with value: 0.6962384669978708.[0m
[32m[I 2022-11-11 19:10:17,498][0m Trial 3 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 1 with value: 0.6962384669978708.[0m


{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:18,580][0m Trial 6 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:18,603][0m Trial 2 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:18,607][0m Trial 7 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:18,621][0m Trial 0 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type'

{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:19,024][0m Trial 4 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:19,119][0m Trial 9 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m


{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:20,189][0m Trial 8 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:20,220][0m Trial 10 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:20,231][0m Trial 13 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m


{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:21,286][0m Trial 11 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:21,315][0m Trial 14 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:21,571][0m Trial 15 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:21,592][0m Trial 12 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_t

#### Check performance of the pipeline

In [10]:
# Print each metric for the pipeline's test-set predictions under its own heading.
for heading, metric_fn in (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, pred_labels))


F1 score : 
0.709161624891962
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      8196
           1       0.79      0.64      0.71      2550

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.81     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7759  437]
 [ 909 1641]]


#### Some estimators also provide a `predict_proba` method

In [11]:
# Class probabilities for the test rows. This rebinds `y_preds`, which held the
# hard predictions from the "Run Pipeline" cell; `pred_labels` is not affected.
y_preds = pipeline.predict_proba(X_test)
# The printed array has one row per test sample and one column per class.
print(y_preds)

[[0.9846395  0.0153605 ]
 [0.58312603 0.41687397]
 [0.36421549 0.63578451]
 ...
 [0.71121761 0.28878239]
 [0.67598324 0.32401676]
 [0.9773811  0.0226189 ]]


# Part 2: Use BaseModel as a standalone estimator

In [12]:
# Re-create the raw train/test split (same proportions and seed as Part 1) so
# this part starts from untransformed features even when it is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


#### Transform features to make them ready for model input

In [13]:
# Pre-processing only: the same imputers and encoder as in Part 1, without a
# model step at the end.
transform_pipeline = Pipeline(
    steps=[
        # fill missing values in the integer columns with the column median
        (
            'intimputer',
            MeanMedianImputer(imputation_method='median', variables=int_cols),
        ),
        # fill missing values in the categorical columns
        ('catimputer', CategoricalImputer(variables=cat_cols)),
        # turn categorical values into ordinal integer codes
        ('catencoder', OrdinalEncoder()),
    ]
)

#### Transform X_train and X_test

In [14]:
# Fit the transformers on the training split only, then apply them to both splits.
# NOTE: both names are overwritten with their transformed versions, so re-run the
# train/test split cell of this part before running this cell a second time.
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [15]:
# Fit the tuner directly on the already-transformed features. The log printed
# below this cell shows further Optuna trials being run on the same study.
obj.fit(X_train,y_train)
# Predictions of the standalone model for the transformed test split.
y_pred = obj.predict(X_test)

{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:24,277][0m Trial 21 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:24,301][0m Trial 26 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:24,305][0m Trial 24 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:24,327][0m Trial 25 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_t

{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:24,572][0m Trial 20 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:24,586][0m Trial 23 finished with value: 0.6962384669978708 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m


{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:27,008][0m Trial 28 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:27,041][0m Trial 30 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:27,078][0m Trial 33 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:27,094][0m Trial 32 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_t

{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}
{'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}


[32m[I 2022-11-11 19:10:27,315][0m Trial 34 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:27,331][0m Trial 35 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:27,452][0m Trial 29 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}. Best is trial 6 with value: 0.7026269702276707.[0m
[32m[I 2022-11-11 19:10:27,472][0m Trial 31 finished with value: 0.7026269702276707 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_t

#### Check performance of the model

In [16]:
# Score the standalone model on its own predictions (`y_pred` from the
# "Train model and predict" cell). The previous version evaluated `pred_labels`,
# which still held the Part 1 pipeline predictions, so it only repeated the
# Part 1 numbers.
standalone_pred_labels = np.rint(y_pred)  # same rounding as used in Part 1

print('F1 score : ')
print(f1_score(y_test, standalone_pred_labels))
print('Classification report : ')
print(classification_report(y_test, standalone_pred_labels))
print('Confusion matrix : ')
print(confusion_matrix(y_test, standalone_pred_labels))

F1 score : 
0.709161624891962
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      8196
           1       0.79      0.64      0.71      2550

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.81     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7759  437]
 [ 909 1641]]


In [17]:
# Display the estimator chosen by the search.
# NOTE(review): the exact return type is not visible in this notebook — confirm in lohrasb.
obj.get_best_estimator()

In [18]:
# Attribute access to the best estimator; presumably the same object that
# get_best_estimator() returns — TODO confirm.
obj.best_estimator

#### Get the fitted Optuna object and its trials

In [19]:
# Retrieve the optimisation object kept by the tuner. Its `.trials` output below
# is a list of optuna FrozenTrial records, so this is presumably the optuna
# Study passed in at construction — TODO confirm.
OptunaObj = obj.get_optimized_object()
# One record per trial: score, timestamps, sampled parameters and final state.
OptunaObj.trials

[FrozenTrial(number=0, values=[0.7026269702276707], datetime_start=datetime.datetime(2022, 11, 11, 19, 10, 15, 861794), datetime_complete=datetime.datetime(2022, 11, 11, 19, 10, 18, 621527), params={'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.1, 'random_state': 42, 'importance_type': 'split'}, distributions={'boosting_type': CategoricalDistribution(choices=('gbdt',)), 'max_depth': IntUniformDistribution(high=5, low=4, step=1), 'learning_rate': UniformDistribution(high=0.1, low=0.1), 'random_state': IntUniformDistribution(high=42, low=42, step=1), 'importance_type': CategoricalDistribution(choices=('split',))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=1, values=[0.6962384669978708], datetime_start=datetime.datetime(2022, 11, 11, 19, 10, 15, 863799), datetime_complete=datetime.datetime(2022, 11, 11, 19, 10, 17, 475381), params={'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0