In [1]:
import pandas as pd
import numpy as np
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    MeanMedianImputer
    )
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, make_scorer
from xgboost import *


#### Example: Audiology (Standardized) Data Set
#### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]
data = pd.read_csv(urldata,names=col_names)
data.head()

Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [3]:

data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

# Train test split

X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33,  random_state=42
)



#### Define estimator and set its arguments 


In [4]:
estimator = XGBClassifier()
estimator_params = {
        "booster": ["gbtree","dart"],
        "eval_metric": ["auc"],
        "max_depth": [4, 5],
        "gamma": [0.1, 1.2],
        "subsample": [0.8],

    }
    

#### Define BestModel using random search

In [5]:

obj = BaseModel().optimize_by_gridsearchcv(
            estimator=estimator,
            estimator_params=estimator_params,
            fit_params = None,
            # measure_of_accuracy=make_scorer(f1_score, average='weighted', greater_is_better=True),
            # or scoring = 'f1_weighted',
            scoring = 'f1_weighted',
            verbose=3,
            n_jobs=-1,
            random_state=42,
            cv=KFold(2),
        )

#### Capture int features (the problem has only integer features)

In [6]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Build sklearn pipeline

In [7]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # classification model
            ('obj', obj)

 ])


#### Run Pipeline

In [8]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


2023-02-21 21:08:38,505 :: dev :: The optimization will be based on f1_weighted metric!
2023-02-21 21:08:38,505 :: dev :: The optimization will be based on f1_weighted metric!
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END booster=gbtree, eval_metric=auc, gamma=0.1, max_depth=4, subsample=0.8;, score=0.446 total time=   0.1s
[CV 2/2] END booster=gbtree, eval_metric=auc, gamma=0.1, max_depth=4, subsample=0.8;, score=0.432 total time=   0.1s
[CV 1/2] END booster=gbtree, eval_metric=auc, gamma=0.1, max_depth=5, subsample=0.8;, score=0.427 total time=   0.1s
[CV 2/2] END booster=gbtree, eval_metric=auc, gamma=0.1, max_depth=5, subsample=0.8;, score=0.428 total time=   0.1s
[CV 1/2] END booster=gbtree, eval_metric=auc, gamma=1.2, max_depth=4, subsample=0.8;, score=0.427 total time=   0.1s
[CV 2/2] END booster=gbtree, eval_metric=auc, gamma=1.2, max_depth=4, subsample=0.8;, score=0.416 total time=   0.1s
[CV 1/2] END booster=gbtree, eval_metric=auc, gamma=1.2, max_d

#### Check performance of the pipeline

In [9]:
print('F1 score : ')
print(f1_score(y_test,y_pred, average='weighted'))
print('Classification report : ')
print(classification_report(y_test,y_pred))
print('Confusion matrix : ')
print(confusion_matrix(y_test,y_pred))


F1 score : 
0.4802721088435374
Classification report : 
              precision    recall  f1-score   support

           0       0.50      0.57      0.53        21
           1       0.44      0.57      0.50        14
           2       0.57      0.29      0.38        14

    accuracy                           0.49        49
   macro avg       0.51      0.48      0.47        49
weighted avg       0.50      0.49      0.48        49

Confusion matrix : 
[[12  7  2]
 [ 5  8  1]
 [ 7  3  4]]
