In [1]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from lightgbm import *


2023-05-02 21:48:34,990 :: dev :: Connected to Ray cluster!
2023-05-02 21:48:34,990 :: dev :: Connected to Ray cluster!
2023-05-02 21:48:35,409 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-05-02 21:48:35,415 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-05-02 21:48:35,419 :: matplotlib :: interactive is False
2023-05-02 21:48:35,420 :: matplotlib :: platform is darwin
2023-05-02 21:48:36,705 :: matplotlib :: CACHEDIR=/Users/hjavedani/.matplotlib
2023-05-02 21:48:36,708 :: matplotlib.font_manager :: Using fontManager instance from /Users/hjavedani/.matplotlib/fontlist-v330.json


# Example: Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

# Part 1: Use BaseModel in a sklearn pipeline


In [2]:
# Location of the raw UCI "Adult" training file (no header row).
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# column names
col_names=["age", "workclass", "fnlwgt" , "education" ,"education-num",
"marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week",
"native-country","label"
]
# read data
# Fields in the raw file are separated by ", " (comma + space). skipinitialspace=True
# drops that space so string values are stored as '<=50K' rather than ' <=50K'.
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=',',
    skipinitialspace=True,
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
# Raw income strings (with and without a leading space) and the integer class they map to.
label_codes = {
    '<=50K': 0,
    ' <=50K': 0,
    '>50K': 1,
    ' >50K': 1,
}

# Overwrite each raw string in place with its class code.
for raw_label, class_code in label_codes.items():
    data.loc[data['label'] == raw_label, 'label'] = class_code

# The column still has object dtype after the replacements; make it a true integer column.
data['label'] = data['label'].astype(int)

#### Train test split

In [4]:
# Features are every column except the target; the target is kept as a one-column DataFrame.
X = data.drop(columns="label")
y = data[["label"]]

# Hold out 33% of the rows for testing, preserving the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y["label"],
    random_state=42,
)


#### Find feature types for later use

In [5]:
# Group the feature columns by dtype family so each group can get its own pre-processing.
# 'integer' / 'floating' are the NumPy abstract families and match every integer / float
# width. The plain 'int' / 'float' aliases resolve to a single platform-dependent dtype
# (e.g. 'int' is int32 on Windows with NumPy < 2 and would then miss int64 columns).
int_cols = X_train.select_dtypes(include=['integer']).columns.tolist()
float_cols = X_train.select_dtypes(include=['floating']).columns.tolist()
# String-valued columns are selected through the object dtype.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()


#### Define estimator and set its arguments 


In [6]:
# Base learner whose hyper-parameters will be tuned.
estimator = LGBMClassifier()

# Candidate values per hyper-parameter (1 x 2 x 2 x 1 = 4 combinations in total).
estimator_params = dict(
    max_depth=[6],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200],
    boosting_type=["gbdt"],
)
    

In [7]:

# Create the lohrasb entry point first, then describe the random search it should run.
search_builder = BaseModel()

# NOTE(review): both `measure_of_accuracy` and `scoring` are supplied; the recorded run log
# reports that the optimization is based on the f1 metric - confirm which one takes precedence.
random_search_kwargs = dict(
    estimator=estimator,
    estimator_params=estimator_params,
    fit_params={'sample_weight': None},
    measure_of_accuracy=make_scorer(accuracy_score, normalize=False, greater_is_better=True),
    # or scoring='accuracy'
    scoring='f1',
    verbose=3,
    n_jobs=-1,
    random_state=42,
    # 2-fold CV over 4 sampled candidates -> 8 fits in total.
    cv=KFold(2),
    n_iter=4,
)

obj = search_builder.optimize_by_randomsearchcv(**random_search_kwargs)

#### Build sklearn pipeline

In [8]:


# Ordered (name, step) pairs: pre-processing first, the tuned model last.
pipeline_steps = [
    # Fill missing integer features with the column median.
    ('intimputer', MeanMedianImputer(imputation_method='median', variables=int_cols)),
    # Fill missing categorical features using the imputer's default settings.
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # Turn category strings into integer codes.
    ('catencoder', OrdinalEncoder()),
    # Hyper-parameter search wrapped around the classifier.
    ('obj', obj),
]

pipeline = Pipeline(pipeline_steps)


#### Run Pipeline

In [9]:
# The target is a one-column DataFrame; flatten it to a 1-D array before fitting.
y_train_1d = y_train.values.ravel()

# Fit pre-processing and the search on the training split, then predict the held-out rows.
pipeline.fit(X_train, y_train_1d)
y_pred = pipeline.predict(X_test)


[2m[36m(pid=9796)[0m 2023-05-02 21:48:40,518 :: dev :: Connected to Ray cluster!
[2m[36m(pid=9796)[0m 2023-05-02 21:48:40,518 :: dev :: Connected to Ray cluster!
[2m[36m(RandomSearch pid=9796)[0m 2023-05-02 21:48:41,945 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
[2m[36m(RandomSearch pid=9796)[0m 2023-05-02 21:48:41,949 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
[2m[36m(RandomSearch pid=9796)[0m 2023-05-02 21:48:41,951 :: matplotlib :: interactive is False
[2m[36m(RandomSearch pid=9796)[0m 2023-05-02 21:48:41,951 :: matplotlib :: platform is darwin
[2m[36m(RandomSearch pid=9796)[0m 2023-05-02 21:48:42,018 :: dev :: The optimization will be based on f1 metric!
[2m[36m(RandomSearch pid=9796)[0m 2023-05-02 21:48:42,018 :: dev :: The optimization will be based on f1 metric!
[2m[36m(RandomSearch pid=9796)[0m Fitting 2 folds for each of 4 candidates, totalling 8 fits


#### Check performance of the pipeline

In [10]:
# Heading -> metric function; every metric is evaluated on the held-out test split.
report_sections = (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
)

# Print each heading on its own line, followed by the metric value.
for heading, metric in report_sections:
    print(heading)
    print(metric(y_test, y_pred))


F1 score : 
0.7127749576988157
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8158
           1       0.79      0.65      0.71      2588

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.82     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7703  455]
 [ 903 1685]]


# Part 2: Use BaseModel as a standalone estimator

In [11]:
# Recreate the same stratified 67/33 split (same seed) from the untransformed features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y['label'],
    random_state=42,
)

#### Transform features to make them ready for model input

In [12]:
# Pre-processing only: this pipeline has no model step, the estimator is fitted separately.
transform_steps = [
    # Fill missing integer features with the column median.
    ('intimputer', MeanMedianImputer(imputation_method='median', variables=int_cols)),
    # Fill missing categorical features using the imputer's default settings.
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # Turn category strings into integer codes.
    ('catencoder', OrdinalEncoder()),
]

transform_pipeline = Pipeline(transform_steps)

#### Transform X_train and X_test

In [13]:
# Fit the pre-processing steps on the training split only, then apply the fitted steps
# to the test split so no test-set information is used during fitting.
# NOTE: X_train / X_test are rebound to their transformed versions here; re-run the
# train/test split cell before re-running this one.
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [14]:
# The target is a one-column DataFrame; flatten it to a 1-D array before fitting.
y_train_1d = y_train.values.ravel()

# Run the search directly on the already-transformed features, then predict the test rows.
obj.fit(X_train, y_train_1d)
y_pred = obj.predict(X_test)

[2m[36m(pid=9797)[0m 2023-05-02 21:48:45,590 :: dev :: Connected to Ray cluster!
[2m[36m(pid=9797)[0m 2023-05-02 21:48:45,590 :: dev :: Connected to Ray cluster!
[2m[36m(RandomSearch pid=9797)[0m 2023-05-02 21:48:46,782 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
[2m[36m(RandomSearch pid=9797)[0m 2023-05-02 21:48:46,786 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
[2m[36m(RandomSearch pid=9797)[0m 2023-05-02 21:48:46,787 :: matplotlib :: interactive is False
[2m[36m(RandomSearch pid=9797)[0m 2023-05-02 21:48:46,787 :: matplotlib :: platform is darwin
[2m[36m(RandomSearch pid=9797)[0m 2023-05-02 21:48:46,837 :: dev :: The optimization will be based on f1 metric!
[2m[36m(RandomSearch pid=9797)[0m 2023-05-02 21:48:46,837 :: dev :: The optimization will be based on f1 metric!
[2m[36m(RandomSearch pid=9797)[0m Fitting 2 folds for each of 4 candidates, totalling 8 fits


#### Check performance of the model

In [15]:
# (heading, metric function, extra keyword arguments) for each reported figure.
report_sections = (
    # normalize=False makes accuracy_score return the number of correct predictions,
    # not a fraction.
    ('accuracy score score : ', accuracy_score, {'normalize': False}),
    ('F1 score : ', f1_score, {}),
    ('Classification report : ', classification_report, {}),
    ('Confusion matrix : ', confusion_matrix, {}),
)

# Print each heading on its own line, followed by the metric on the held-out test split.
for heading, metric, extra_kwargs in report_sections:
    print(heading)
    print(metric(y_test, y_pred, **extra_kwargs))

accuracy score score : 
9388
F1 score : 
0.7127749576988157
Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8158
           1       0.79      0.65      0.71      2588

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.82     10746
weighted avg       0.87      0.87      0.87     10746

Confusion matrix : 
[[7703  455]
 [ 903 1685]]


In [16]:
# Display what the accessor method returns after fitting.
# NOTE(review): presumably the estimator refitted with the best parameters found - confirm in lohrasb docs.
obj.get_best_estimator()

In [17]:
# Attribute-style access on the fitted object.
# NOTE(review): presumably the same estimator that get_best_estimator() returns - confirm in lohrasb docs.
obj.best_estimator

#### Get fitted randomized search object and its attributes

In [18]:
# Retrieve the fitted search object held by `obj`.
RandomSearchObj = obj.get_optimized_object()
# Show its cross-validation results (timings and the parameters tried for each candidate).
RandomSearchObj.cv_results_

{'mean_fit_time': array([0.12175357, 0.21687698, 0.0990485 , 0.17422247]),
 'std_fit_time': array([0.00492156, 0.005687  , 0.00100362, 0.0055666 ]),
 'mean_score_time': array([0.05387998, 0.11204958, 0.05606008, 0.11344421]),
 'std_score_time': array([0.00040674, 0.00518847, 0.00107503, 0.00354087]),
 'param_n_estimators': masked_array(data=[100, 200, 100, 200],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[6, 6, 6, 6],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate': masked_array(data=[0.01, 0.01, 0.1, 0.1],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_boosting_type': masked_array(data=['gbdt', 'gbdt', 'gbdt', 'gbdt'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 100,
   