In [1]:
from lohrasb.best_estimator import BaseModel
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import optuna
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score
from interpret.glassbox import LogisticRegression
from interpret import show
from lohrasb.utils.metrics import f1_plus_tn



2023-07-03 19:44:36,736 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-07-03 19:44:36,742 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-07-03 19:44:36,745 :: matplotlib :: interactive is False
2023-07-03 19:44:36,745 :: matplotlib :: platform is darwin
2023-07-03 19:44:36,762 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 19:44:36,764 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 19:44:36,766 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
2023-07-03 19:44:36,768 :: graphviz._tools :: deprecate positional args: graphviz.backend.viewing.view(['quiet'])
2023-07-03 19:44:36,772 :: graphviz._tools :: deprecate positio

#### Example 1 : Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Part 1: Use BestModel in sklearn pipeline


In [2]:
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# column names
col_names=["age", "workclass", "fnlwgt" , "education" ,"education-num",
"marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week",
"native-country","label"
]
# read data
data = pd.read_csv(urldata,header=None,names=col_names,sep=',')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
data.loc[data['label']=='<=50K','label']=0
data.loc[data['label']==' <=50K','label']=0

data.loc[data['label']=='>50K','label']=1
data.loc[data['label']==' >50K','label']=1

data['label']=data['label'].astype(int)

#### Train test split

In [4]:
X = data.loc[:, data.columns != "label"]
y = data.loc[:, data.columns == "label"]
y = y.values.ravel()


X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)

# for sample_weights
weights = np.ones(len(y_train))


#### Find feature types for later use

In [5]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


#### Define model and set it argumens 

In [6]:
estimator = LogisticRegression()
estimator_params = {
        "penalty":["l2"],
        "C":[0.1,1],
        "max_iter":[500,700],

    }
kwargs = {  # params for fit method  
            'fit_optuna_kwargs' :{
            'sample_weight':None,
            },
            # params for OptunaSearch
            'main_optuna_kwargs' : {
            'estimator':estimator,
            'estimator_params':estimator_params,
            'refit':True,
            'measure_of_accuracy' :'f1_score(y_true, y_pred,average="weighted")',

            },
            'train_test_split_kwargs':{
                'test_size':.3,
                            
            },
            'study_search_kwargs':{
                'storage':None,
                'sampler':TPESampler(),
                'pruner':HyperbandPruner(),
                'study_name':"example of optuna optimizer",
                'direction':"maximize",
                'load_if_exists':False,
            },
            'optimize_kwargs':{
                # optuna optimization params
                'n_trials':20,
                'timeout':600,
                'catch':(),
                'callbacks':None,
                'gc_after_trial':False,
                'show_progress_bar':False,
            }
}


In [7]:
obj = BaseModel().optimize_by_optuna(
        kwargs=kwargs
            
        )

#### Build sklearn pipeline

In [8]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # classification model
            ('obj', obj),


 ])
 



#### Run Pipeline

In [9]:
pipeline.fit(X_train,y_train)
y_preds = pipeline.predict(X_test)
pred_labels = np.rint(y_preds)




[I 2023-07-03 19:44:38,903] A new study created in memory with name: example of optuna optimizer
[I 2023-07-03 19:44:39,043] Trial 0 finished with value: 0.792925608717094 and parameters: {'penalty': 'l2', 'C': 0.7879676159004071, 'max_iter': 643}. Best is trial 0 with value: 0.792925608717094.
[I 2023-07-03 19:44:39,115] Trial 1 finished with value: 0.77853331583629 and parameters: {'penalty': 'l2', 'C': 0.9359064566651057, 'max_iter': 639}. Best is trial 0 with value: 0.792925608717094.
[I 2023-07-03 19:44:39,227] Trial 2 finished with value: 0.792925608717094 and parameters: {'penalty': 'l2', 'C': 0.6961289417765337, 'max_iter': 541}. Best is trial 0 with value: 0.792925608717094.
[I 2023-07-03 19:44:39,329] Trial 3 finished with value: 0.7930536503358144 and parameters: {'penalty': 'l2', 'C': 0.5291553342730184, 'max_iter': 509}. Best is trial 3 with value: 0.7930536503358144.
[I 2023-07-03 19:44:39,395] Trial 4 finished with value: 0.77853331583629 and parameters: {'penalty': 'l2'

2023-07-03 19:44:41,038 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!
2023-07-03 19:44:41,038 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!


#### Check performance of the pipeline

In [10]:
print('F1 score : ')
print(f1_score(y_test,pred_labels))
print('Classification report : ')
print(classification_report(y_test,pred_labels))
print('Confusion matrix : ')
print(confusion_matrix(y_test,pred_labels))


F1 score : 
0.4130901287553648
Classification report : 
              precision    recall  f1-score   support

           0       0.81      0.95      0.88      8196
           1       0.65      0.30      0.41      2550

    accuracy                           0.80     10746
   macro avg       0.73      0.63      0.64     10746
weighted avg       0.78      0.80      0.77     10746

Confusion matrix : 
[[7788  408]
 [1780  770]]


#### Some estimators have predict_proba method as well

In [11]:
y_preds = pipeline.predict_proba(X_test)
print(y_preds)

[[0.90648439 0.09351561]
 [0.57623669 0.42376331]
 [0.73076092 0.26923908]
 ...
 [0.43376818 0.56623182]
 [0.82501029 0.17498971]
 [0.95701578 0.04298422]]


#### Part 2:  Use BestModel as a standalone estimator 

In [12]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


#### Transform features to make them ready for model input

In [13]:
transform_pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # classification model

 ])

#### Transform X_train and X_test

In [14]:
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [15]:
obj.fit(X_train,y_train)
y_pred = obj.predict(X_test)

[I 2023-07-03 19:44:41,755] A new study created in memory with name: example of optuna optimizer
[I 2023-07-03 19:44:42,007] Trial 0 finished with value: 0.8130408821130254 and parameters: {'penalty': 'l2', 'C': 0.46989906511435764, 'max_iter': 540}. Best is trial 0 with value: 0.8130408821130254.
[I 2023-07-03 19:44:42,100] Trial 1 finished with value: 0.7897439777116065 and parameters: {'penalty': 'l2', 'C': 0.9473060946144338, 'max_iter': 628}. Best is trial 0 with value: 0.8130408821130254.
[I 2023-07-03 19:44:42,199] Trial 2 finished with value: 0.7898726711961568 and parameters: {'penalty': 'l2', 'C': 0.3319780241431791, 'max_iter': 653}. Best is trial 0 with value: 0.8130408821130254.
[I 2023-07-03 19:44:42,278] Trial 3 finished with value: 0.7898726711961568 and parameters: {'penalty': 'l2', 'C': 0.14491548991355002, 'max_iter': 547}. Best is trial 0 with value: 0.8130408821130254.
[I 2023-07-03 19:44:42,409] Trial 4 finished with value: 0.7897439777116065 and parameters: {'pen

2023-07-03 19:44:44,798 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!
2023-07-03 19:44:44,798 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!


#### Check performance of the model

In [16]:

print('F1 score plus TN : ')
print(f1_plus_tn(y_test,pred_labels))
print('F1 score : ')
print(f1_score(y_test,pred_labels))
print('Classification report : ')
print(classification_report(y_test,pred_labels))
print('Confusion matrix : ')
print(confusion_matrix(y_test,pred_labels))

F1 score plus TN : 
7788.413090128755
F1 score : 
0.4130901287553648
Classification report : 
              precision    recall  f1-score   support

           0       0.81      0.95      0.88      8196
           1       0.65      0.30      0.41      2550

    accuracy                           0.80     10746
   macro avg       0.73      0.63      0.64     10746
weighted avg       0.78      0.80      0.77     10746

Confusion matrix : 
[[7788  408]
 [1780  770]]


In [17]:
obj.get_best_estimator()

In [18]:
obj.best_estimator

#### Get fitted Optuna Search object and its attributes

In [19]:
OptunaObj = obj.get_optimized_object()
OptunaObj

FrozenTrial(number=11, state=TrialState.COMPLETE, values=[0.818306109115278], datetime_start=datetime.datetime(2023, 7, 3, 19, 44, 43, 367265), datetime_complete=datetime.datetime(2023, 7, 3, 19, 44, 43, 639354), params={'penalty': 'l2', 'C': 0.12433359958022766, 'max_iter': 575}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'penalty': CategoricalDistribution(choices=('l2',)), 'C': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'max_iter': IntDistribution(high=700, log=False, low=500, step=1)}, trial_id=11, value=None)

In [20]:
print(obj.get_best_estimator())

LogisticRegression(C=0.12433359958022766, max_iter=575)
