In [1]:
import pandas as pd
import numpy as np
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    MeanMedianImputer
    )
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, make_scorer
from catboost import CatBoostClassifier

2023-07-03 02:47:21,060 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-07-03 02:47:21,065 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-07-03 02:47:21,069 :: matplotlib :: interactive is False
2023-07-03 02:47:21,070 :: matplotlib :: platform is darwin
2023-07-03 02:47:21,086 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 02:47:21,088 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 02:47:21,090 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
2023-07-03 02:47:21,092 :: graphviz._tools :: deprecate positional args: graphviz.backend.viewing.view(['quiet'])
2023-07-03 02:47:21,097 :: graphviz._tools :: deprecate positio

#### Example: Audiology (Standardized) Data Set
#### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]
data = pd.read_csv(urldata,names=col_names)
data.head()

Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [3]:

data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

# Train test split

X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33,  random_state=42
)



#### Define estimator and set its arguments 


In [4]:
estimator = CatBoostClassifier()
estimator_params = {
        "depth": [4, 5],
        "learning_rate": [0.01, 0.1],

    }

kwargs = {  # params for fit method or fit_params 
            'fit_grid_kwargs' :{
            'sample_weight':None,
            },
            # params for RandomSearchCV 
            'grid_search_kwargs' : {
            'estimator':estimator,
            'param_grid':estimator_params,
            'scoring' :'f1_weighted',
            'verbose':3,
            'n_jobs':-1,
            'cv':KFold(2),
            'refit':True,
            }
            }
    

#### Define BestModel using random search

In [5]:

obj = BaseModel().optimize_by_gridsearchcv(
    kwargs=kwargs
        )

#### Capture int features (the problem has only integer features)

In [6]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Build sklearn pipeline

In [7]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # classification model
            ('obj', obj)

 ])


#### Run Pipeline

In [8]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


Fitting 2 folds for each of 4 candidates, totalling 8 fits
0:	learn: 1.0964066	total: 51.2ms	remaining: 51.1s
0:	learn: 1.0769296	total: 51.4ms	remaining: 51.3s
1:	learn: 1.0938873	total: 51.9ms	remaining: 25.9s
1:	learn: 1.0533784	total: 52.2ms	remaining: 26.1s
2:	learn: 1.0917111	total: 52.6ms	remaining: 17.5s
2:	learn: 1.0343271	total: 53ms	remaining: 17.6s
3:	learn: 1.0894544	total: 53.1ms	remaining: 13.2s
3:	learn: 1.0146348	total: 54.6ms	remaining: 13.6s
4:	learn: 1.0873726	total: 54.5ms	remaining: 10.9s
4:	learn: 0.9988010	total: 55.2ms	remaining: 11s
5:	learn: 1.0856279	total: 55.2ms	remaining: 9.14s
5:	learn: 0.9853834	total: 55.7ms	remaining: 9.23s
6:	learn: 1.0825365	total: 55.8ms	remaining: 7.92s
6:	learn: 0.9609991	total: 56.4ms	remaining: 8.01s
7:	learn: 1.0801677	total: 56.4ms	remaining: 6.99s
7:	learn: 0.9454180	total: 56.9ms	remaining: 7.06s
8:	learn: 0.9352791	total: 57.4ms	remaining: 6.32s
8:	learn: 1.0782821	total: 57.1ms	remaining: 6.29s
0:	learn: 1.0964765	total: 

#### Check performance of the pipeline

In [9]:
print('F1 score : ')
print(f1_score(y_test,y_pred, average='weighted'))
print('Classification report : ')
print(classification_report(y_test,y_pred))
print('Confusion matrix : ')
print(confusion_matrix(y_test,y_pred))


F1 score : 
0.4419742365031119
Classification report : 
              precision    recall  f1-score   support

           0       0.46      0.57      0.51        21
           1       0.38      0.43      0.40        14
           2       0.57      0.29      0.38        14

    accuracy                           0.45        49
   macro avg       0.47      0.43      0.43        49
weighted avg       0.47      0.45      0.44        49

Confusion matrix : 
[[12  7  2]
 [ 7  6  1]
 [ 7  3  4]]
