In [1]:
from lohrasb.best_estimator import BaseModel
import xgboost
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)

  from .autonotebook import tqdm as notebook_tqdm


# Example 1 : Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
# Source: UCI Machine Learning Repository, Adult ("Census Income") data set.
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# The raw file has no header row, so the column names are supplied here.
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]

# The file separates fields with ", " and marks unknown values with "?".
# skipinitialspace drops the blank after each comma so string values are clean
# (e.g. "<=50K" rather than " <=50K"); na_values turns "?" into NaN so the
# imputation steps later in the notebook actually have something to fill.
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=",",
    skipinitialspace=True,
    na_values="?",
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Define labels


In [3]:
# Encode the income bracket as a binary target: 0 for "<=50K", 1 for ">50K".
# Both the bare and the leading-blank spellings are accepted.
data.loc[data["label"].isin(["<=50K", " <=50K"]), "label"] = 0
data.loc[data["label"].isin([">50K", " >50K"]), "label"] = 1

# Every label has been replaced by 0/1 at this point, so the column can be cast.
data["label"] = data["label"].astype(int)

# Train test split

In [4]:
# Features are every column except the target; the target stays a one-column frame.
X = data.drop(columns="label")
y = data[["label"]]

# Hold out 33% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y["label"],
)


# Find feature types for later use

In [5]:
# Column names grouped by dtype family, taken from the training features.
# NOTE(review): "int" is the platform default integer width — confirm it
# matches the integer columns on every platform this notebook runs on.
int_cols = list(X_train.select_dtypes(include="int").columns)
float_cols = list(X_train.select_dtypes(include="float").columns)
cat_cols = list(X_train.select_dtypes(include="object").columns)


# Define the estimator (BaseModel) and set its arguments

In [6]:

# XGBoost classifier wrapped in lohrasb's BaseModel, tuned with Optuna.
# The sampler is seeded with the same value as the other random sources so
# that a fresh "Restart & Run All" proposes the same sequence of trials.
SFC_XGBCLS_OPTUNA = BaseModel(
    estimator=xgboost.XGBClassifier(),
    # Search space per hyper-parameter; presumably [low, high] bounds —
    # confirm against the lohrasb documentation.
    estimator_params={
        "max_depth": [2, 3],
        "min_child_weight": [0.1, 0.9],
        "gamma": [1, 9],
    },
    hyper_parameter_optimization_method="optuna",
    measure_of_accuracy="f1",
    test_size=0.33,
    cv=KFold(n_splits=3, random_state=42, shuffle=True),
    with_stratified=False,
    verbose=0,
    random_state=42,
    n_jobs=-1,
    n_iter=100,
    eval_metric="auc",
    number_of_trials=20,
    # An unseeded TPESampler draws different candidates on every run.
    sampler=TPESampler(seed=42),
    pruner=HyperbandPruner(),
)


Setting value for estimator
Getting value for estimator
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Setting

# Build sklearn Pipeline

In [7]:


# Preprocessing followed by the tuned classifier, chained as one estimator.
pipeline = Pipeline(
    steps=[
        # median imputation for the integer columns
        (
            'intimputer',
            MeanMedianImputer(variables=int_cols, imputation_method='median'),
        ),
        # imputation for the object (categorical) columns
        ('catimputer', CategoricalImputer(variables=cat_cols)),
        # encode categorical values as ordinal codes
        ('catencoder', OrdinalEncoder()),
        # final step: the classifier defined above
        ('SFC_XGBCLS_OPTUNA', SFC_XGBCLS_OPTUNA),
    ]
)



# Run Pipeline

In [8]:
# Fit on the training split, then predict the held-out rows in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_preds = pipeline.fit(X_train, y_train).predict(X_test)

# Round the raw predictions to the nearest integer to obtain class labels.
pred_labels = np.rint(y_preds)




[32m[I 2022-08-01 16:40:14,366][0m A new study created in memory with name: no-name-66d107d4-468e-4277-93c3-5287abaea1f1[0m


Getting value for hyper_parameter_optimization_method
Getting value for hyper_parameter_optimization_method
Getting value for hyper_parameter_optimization_method
Getting value for estimator
Getting value for measure_of_accuracy
Getting value for estimator_params
Getting value for verbose
Getting value for test_size
Getting value for random_state
Getting value for eval_metric
Getting value for number_of_trials
Getting value for sampler
Getting value for pruner
Getting value for with_stratified
       age  workclass  fnlwgt  education  education-num  marital-status  \
27476   35          1  177102          4              9               2   
10052   61          1  193479          4              9               4   
11713   24          3  169624          4              9               2   
2513    47          3  174525          4              9               5   
12308   26          1  175789          4              9               2   
...    ...        ...     ...        ...            

[32m[I 2022-08-01 16:40:14,559][0m Trial 0 finished with value: 0.6225839267548322 and parameters: {'max_depth': 2, 'min_child_weight': 0, 'gamma': 8.101243313290928}. Best is trial 0 with value: 0.6225839267548322.[0m


[0]	validation-auc:0.85926




[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89816
[4]	validation-auc:0.89969
[5]	validation-auc:0.90345
[6]	validation-auc:0.90809
[7]	validation-auc:0.90828
[8]	validation-auc:0.90894
[9]	validation-auc:0.91035


[32m[I 2022-08-01 16:40:14,799][0m Trial 1 finished with value: 0.649182561307902 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 1.9810352775284974}. Best is trial 1 with value: 0.649182561307902.[0m


[0]	validation-auc:0.82538


[32m[I 2022-08-01 16:40:14,875][0m Trial 2 pruned. Trial was pruned at iteration 1.[0m


[0]	validation-auc:0.82538




[1]	validation-auc:0.84194
[2]	validation-auc:0.85911
[3]	validation-auc:0.87281
[4]	validation-auc:0.88634
[5]	validation-auc:0.89025
[6]	validation-auc:0.89155
[7]	validation-auc:0.89720
[8]	validation-auc:0.89813
[9]	validation-auc:0.90243


[32m[I 2022-08-01 16:40:15,259][0m Trial 3 finished with value: 0.6322624743677375 and parameters: {'max_depth': 2, 'min_child_weight': 0, 'gamma': 3.6539375290051552}. Best is trial 1 with value: 0.649182561307902.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89816
[4]	validation-auc:0.89971




[5]	validation-auc:0.90367
[6]	validation-auc:0.90642
[7]	validation-auc:0.90934
[8]	validation-auc:0.90985
[9]	validation-auc:0.91075


[32m[I 2022-08-01 16:40:15,611][0m Trial 4 finished with value: 0.6408304498269897 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 1.004312091298833}. Best is trial 1 with value: 0.649182561307902.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917




[3]	validation-auc:0.89816
[4]	validation-auc:0.89969
[5]	validation-auc:0.90319
[6]	validation-auc:0.90765
[7]	validation-auc:0.90821
[8]	validation-auc:0.90893
[9]	validation-auc:0.91035


[32m[I 2022-08-01 16:40:16,100][0m Trial 5 finished with value: 0.6496598639455783 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 2.689765369616926}. Best is trial 5 with value: 0.6496598639455783.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917


[32m[I 2022-08-01 16:40:16,206][0m Trial 6 pruned. Trial was pruned at iteration 3.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89789
[4]	validation-auc:0.89951
[5]	validation-auc:0.90225
[6]	validation-auc:0.90578
[7]	validation-auc:0.90680
[8]	validation-auc:0.90638
[9]	validation-auc:0.90824


[32m[I 2022-08-01 16:40:16,596][0m Trial 7 finished with value: 0.6368366285119667 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 6.8524412773918435}. Best is trial 5 with value: 0.6496598639455783.[0m


[0]	validation-auc:0.82538
[1]	validation-auc:0.84194
[2]	validation-auc:0.85911


[32m[I 2022-08-01 16:40:16,680][0m Trial 8 pruned. Trial was pruned at iteration 3.[0m


[0]	validation-auc:0.82538
[1]	validation-auc:0.84194
[2]	validation-auc:0.85911
[3]	validation-auc:0.87281


[32m[I 2022-08-01 16:40:16,755][0m Trial 9 pruned. Trial was pruned at iteration 3.[0m


[0]	validation-auc:0.82538




[1]	validation-auc:0.84194


[32m[I 2022-08-01 16:40:16,810][0m Trial 10 pruned. Trial was pruned at iteration 1.[0m


[0]	validation-auc:0.82538
[1]	validation-auc:0.84194


[32m[I 2022-08-01 16:40:16,859][0m Trial 11 pruned. Trial was pruned at iteration 1.[0m


[0]	validation-auc:0.82538


[32m[I 2022-08-01 16:40:16,957][0m Trial 12 pruned. Trial was pruned at iteration 1.[0m


[0]	validation-auc:0.82538




[1]	validation-auc:0.84194


[32m[I 2022-08-01 16:40:17,023][0m Trial 13 pruned. Trial was pruned at iteration 1.[0m


[0]	validation-auc:0.82538
[1]	validation-auc:0.84194


[32m[I 2022-08-01 16:40:17,070][0m Trial 14 pruned. Trial was pruned at iteration 1.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89789


[32m[I 2022-08-01 16:40:17,195][0m Trial 15 pruned. Trial was pruned at iteration 3.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89816
[4]	validation-auc:0.89969
[5]	validation-auc:0.90319
[6]	validation-auc:0.90765
[7]	validation-auc:0.90821
[8]	validation-auc:0.90893
[9]	validation-auc:0.91035


[32m[I 2022-08-01 16:40:17,582][0m Trial 16 finished with value: 0.6496598639455783 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 2.859485617169287}. Best is trial 5 with value: 0.6496598639455783.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89816
[4]	validation-auc:0.89969
[5]	validation-auc:0.90319




[6]	validation-auc:0.90765
[7]	validation-auc:0.90821
[8]	validation-auc:0.90893
[9]	validation-auc:0.91035


[32m[I 2022-08-01 16:40:17,914][0m Trial 17 finished with value: 0.6496598639455783 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 2.781562248563164}. Best is trial 5 with value: 0.6496598639455783.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917




[3]	validation-auc:0.89816
[4]	validation-auc:0.89971
[5]	validation-auc:0.90367
[6]	validation-auc:0.90642
[7]	validation-auc:0.90934
[8]	validation-auc:0.90985
[9]	validation-auc:0.91075


[32m[I 2022-08-01 16:40:18,469][0m Trial 18 finished with value: 0.6408304498269897 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 1.1362823853623374}. Best is trial 5 with value: 0.6496598639455783.[0m


[0]	validation-auc:0.85926
[1]	validation-auc:0.87514
[2]	validation-auc:0.88917
[3]	validation-auc:0.89816
[4]	validation-auc:0.89969
[5]	validation-auc:0.90319
[6]	validation-auc:0.90765
[7]	validation-auc:0.90821
[8]	validation-auc:0.90893




[9]	validation-auc:0.91035


[32m[I 2022-08-01 16:40:18,808][0m Trial 19 finished with value: 0.6496598639455783 and parameters: {'max_depth': 3, 'min_child_weight': 0, 'gamma': 2.9436954969915923}. Best is trial 5 with value: 0.6496598639455783.[0m


{'max_depth': 3, 'min_child_weight': 0, 'gamma': 2.689765369616926}
[0]	validation-rmse:0.42537
[1]	validation-rmse:0.38315
[2]	validation-rmse:0.35846
[3]	validation-rmse:0.34290
[4]	validation-rmse:0.33435
[5]	validation-rmse:0.32816
[6]	validation-rmse:0.32365
[7]	validation-rmse:0.32127
[8]	validation-rmse:0.31970
[9]	validation-rmse:0.31773
Setting value for best_estimator
Getting value for estimator
Getting value for estimator
Getting value for hyper_parameter_optimization_method
Getting value for best_estimator


# Check performance of the Pipeline

In [9]:
# Each metric is printed on the line after its heading.
print('F1 score : ', f1_score(y_test, pred_labels), sep='\n')
print('Classification report : ', classification_report(y_test, pred_labels), sep='\n')
print('Confusion matrix : ', confusion_matrix(y_test, pred_labels), sep='\n')


F1 score : 
0.6481012658227848
Classification report : 
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      8158
           1       0.80      0.54      0.65      2588

    accuracy                           0.86     10746
   macro avg       0.84      0.75      0.78     10746
weighted avg       0.85      0.86      0.85     10746

Confusion matrix : 
[[7809  349]
 [1180 1408]]
