In [1]:
from lohrasb.best_estimator import BaseModel
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from lohrasb.utils.metrics import f1_plus_tn
from ray.tune.search.bayesopt import BayesOptSearch
from ray import air,tune
from ray.air import session
from xgboost import XGBClassifier
from ray.tune.search.hyperopt import HyperOptSearch



2023-07-08 01:13:49,322 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-07-08 01:13:49,329 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-07-08 01:13:49,332 :: matplotlib :: interactive is False
2023-07-08 01:13:49,334 :: matplotlib :: platform is darwin
2023-07-08 01:13:49,353 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-08 01:13:49,359 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-08 01:13:49,362 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
2023-07-08 01:13:49,364 :: graphviz._tools :: deprecate positional args: graphviz.backend.viewing.view(['quiet'])
2023-07-08 01:13:49,377 :: graphviz._tools :: deprecate positio

#### Example  : Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Use BaseModel together with a scikit-learn pipeline


In [2]:
# Location of the raw UCI Adult file; it has no header row, so names are supplied below.
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Column names, in file order; "label" is the prediction target.
col_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "label",
]
# Read the data. Fields in the raw file can carry a leading blank after the
# comma (e.g. ' <=50K'); skipinitialspace=True drops it at parse time so that
# string columns hold clean category values.
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=',',
    skipinitialspace=True,
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels


In [3]:
# Raw label text -> binary target. Both the bare and the blank-prefixed
# spellings are listed so either form found in the column is converted.
label_codes = {
    '<=50K': 0,
    ' <=50K': 0,
    '>50K': 1,
    ' >50K': 1,
}
for raw_label, code in label_codes.items():
    data.loc[data['label'] == raw_label, 'label'] = code

# Every row now holds 0 or 1, so the column can be cast to integer.
data['label'] = data['label'].astype(int)

#### Train test split

In [4]:
# Features are every column except the target; the target becomes a flat 1-D array.
X = data.drop(columns="label")
y = data["label"].to_numpy()

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# for sample_weights: one unit weight per training row
weights = np.ones(y_train.shape[0])


#### Find feature types for later use

In [5]:
# Group the training columns by dtype; the lists are used when the imputers
# are configured.
# np.integer / np.floating match integer / float columns of any width. The
# string 'int' instead resolves to the platform's default integer type, which
# can miss int64 columns on platforms where that default is 32-bit.
int_cols = X_train.select_dtypes(include=[np.integer]).columns.tolist()
float_cols = X_train.select_dtypes(include=[np.floating]).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()


#### Define the model and set its arguments

In [6]:
# Model whose hyperparameters will be tuned
estimator = XGBClassifier()

# Search space: only `max_depth` is tuned, drawn from tune.randint(15, 30)
param_space = dict(max_depth=tune.randint(15, 30))

# Search algorithm; see the Tune documentation at
# https://docs.ray.io/en/latest/tune/api/suggestion.html
search_alg = HyperOptSearch()

# Keyword arguments for the base model, grouped by the component they configure
kwargs = dict(
    # params for fit method
    fit_tune_kwargs=dict(sample_weight=None),
    # params for TuneCV
    main_tune_kwargs=dict(
        cv=3,
        scoring='f1',
        estimator=estimator,
    ),
    # kwargs of Tuner
    tuner_kwargs=dict(
        tune_config=tune.TuneConfig(
            search_alg=search_alg,
            mode='max',
            metric='score',
        ),
        param_space=param_space,
        run_config=air.RunConfig(stop={"training_iteration": 20}),
    ),
)



#### Initializing BaseModel using Tune

In [7]:
# Build the Tune-backed model object from the settings dictionary.
obj = BaseModel().optimize_by_tune(kwargs=kwargs)

#### Another way of using it

In [8]:
# Same split as before (identical inputs and seed), giving fresh, untransformed
# frames for the preprocessing steps that follow.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


#### Transform features to make them ready for model input

In [9]:
# Preprocessing only: this pipeline ends with the encoder and contains no model step.
transform_pipeline = Pipeline(
    steps=[
        # impute missing values in the integer columns with the column median
        (
            'intimputer',
            MeanMedianImputer(imputation_method='median', variables=int_cols),
        ),
        # impute missing values in the categorical columns
        ('catimputer', CategoricalImputer(variables=cat_cols)),
        # encode the categorical values
        ('catencoder', OrdinalEncoder()),
    ]
)

#### Transform X_train and X_test

In [10]:
# Learn the preprocessing on the training rows only, then reuse the fitted
# steps on the test rows.
X_train = transform_pipeline.fit_transform(X=X_train, y=y_train)
X_test = transform_pipeline.transform(X=X_test)


#### Train model and predict

In [11]:
# Fit on the transformed training set; the Ray Tune log recorded under this
# cell is produced while this cell runs.
obj.fit(X_train,y_train)
# Predict on the transformed test set.
y_pred = obj.predict(X_test)
# Round each prediction to the nearest integer; these are the labels scored below.
pred_labels = np.rint(y_pred)

2023-07-08 01:13:51,607 :: ray.tune.tune :: Initializing Ray automatically.For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run`.


0,1
Current time:,2023-07-08 01:14:12
Running for:,00:00:15.18
Memory:,15.9/32.0 GiB

Trial name,status,loc,max_depth,iter,total time (s),score
trainable_e71ba348,TERMINATED,127.0.0.1:51246,15,1,10.8689,0.685288


[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,308 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,313 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,314 :: matplotlib :: interactive is False
[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,315 :: matplotlib :: platform is darwin
[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,328 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,328 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
[2m[36m(pid=51246)[0m 2023-07-08 01:14:00,329 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
[

Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,score,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
trainable_e71ba348,2023-07-08_01-14-12,True,,6f84d03ce9b6475f83f087a1f70bf18d,1_max_depth=15,hjavedani-MB,1,127.0.0.1,51246,0.685288,10.8689,10.8689,10.8689,1688804052,0,,1,e71ba348,0.00972223


2023-07-08 01:14:12,290	INFO tune.py:762 -- Total run time: 15.36 seconds (15.17 seconds for the tuning loop).


#### Check performance of the model

In [12]:
# Each entry pairs a heading with the metric printed under it; every metric is
# called as metric(y_test, pred_labels).
for heading, metric in (
    ('F1 score plus TN : ', f1_plus_tn),
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
):
    print(heading)
    print(metric(y_test, pred_labels))

F1 score plus TN : 
7582.680842193037
F1 score : 
0.6808421930373151
Classification report : 
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8196
           1       0.73      0.64      0.68      2550

    accuracy                           0.86     10746
   macro avg       0.81      0.78      0.79     10746
weighted avg       0.85      0.86      0.85     10746

Confusion matrix : 
[[7582  614]
 [ 917 1633]]


#### Get best_estimator

In [13]:
# Ask the fitted object for its best estimator; as the cell's last expression,
# the call's return value is what the notebook displays.
obj.get_best_estimator()

#### Check best_estimator

In [14]:
# Read the best_estimator attribute directly so the notebook displays it.
obj.best_estimator

#### Get fitted search object and its attributes

In [15]:
# Retrieve the fitted search object; the recorded output shows it printing as
# a ray.tune.result_grid.ResultGrid.
TuneObj = obj.get_optimized_object()
print(TuneObj)

<ray.tune.result_grid.ResultGrid object at 0x113bafb80>
