In [1]:
from lohrasb.best_estimator import BaseModel
import xgboost
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score
    )
import catboost


  from .autonotebook import tqdm as notebook_tqdm


# Example 1 :Computer Hardware Data Set (a regression problem)
  
https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

In [2]:
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
# column names
col_names=[
    "vendor name",
    "Model Name",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP"
]
# read data
data = pd.read_csv(urldata,header=None,names=col_names,sep=',')
data

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
adviser,32/60,125,256,6000,256,16,128,198,199
amdahl,470v/7,29,8000,32000,32,8,32,269,253
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...
sperry,80/8,124,1000,8000,0,1,8,42,37
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
sratus,32,125,2000,8000,0,2,14,52,41
wang,vs-100,480,512,8000,32,0,0,67,47


# Train test split

In [3]:
X = data.loc[:, data.columns != "PRP"]
y = data.loc[:, data.columns == "PRP"]


X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


# Find feature types for later use

In [4]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


#  Define Feature selector and set its arguments  

In [5]:



SFC_CATREG_OPTUNA = BaseModel(
        estimator=catboost.CatBoostRegressor(),
        estimator_params={
                  # desired lower bound and upper bound for depth
                  'depth'         : [6,10],
                  # desired lower bound and upper bound for depth
                  'learning_rate' : [0.05, 0.1],  
                    },
        hyper_parameter_optimization_method="optuna",
        measure_of_accuracy="r2",
        test_size=0.33,
        cv=KFold(n_splits=3, random_state=42, shuffle=True),
        with_stratified=False,
        verbose=0,
        random_state=42,
        n_jobs=-1,
        n_iter=100,
        eval_metric=None,
        number_of_trials=20,
        sampler=TPESampler(),
        pruner=HyperbandPruner(),
    )


Setting value for estimator
Getting value for estimator
<catboost.core.CatBoostRegressor object at 0x137837100>
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Getting value for estimator
Setting value for estimator_params
Setting value for hyper_parameter_optimization_method
Setting value for measure_of_accuracy
Setting value for test_size
Setting value for Cross Validation object
Setting value for with_stratified
Setting value for verbose
Setting value for random_state
Setting value for n_jobs
Setting value for n_iter
Setting value for eval_metric
Setting value for number_of_trials
Setting value for sampler
Setting value for pruner
Setting value for best_estimator


# Build sklearn Pipeline

In [6]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # regression model 
            ('SFC_CATREG_OPTUNA', SFC_CATREG_OPTUNA),


 ])



# Run Pipeline

In [7]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


[32m[I 2022-08-01 16:40:20,518][0m A new study created in memory with name: no-name-fa75258f-715d-4fe2-a1f9-edc3fa3461d3[0m


Getting value for hyper_parameter_optimization_method
Getting value for hyper_parameter_optimization_method
Getting value for hyper_parameter_optimization_method
Getting value for estimator
Getting value for measure_of_accuracy
Getting value for estimator_params
Getting value for verbose
Getting value for test_size
Getting value for random_state
Getting value for eval_metric
Getting value for number_of_trials
Getting value for sampler
Getting value for pruner
Getting value for with_stratified


[32m[I 2022-08-01 16:40:21,910][0m Trial 0 finished with value: 0.9129103468740117 and parameters: {'depth': 9, 'learning_rate': 0.09664193056897151}. Best is trial 0 with value: 0.9129103468740117.[0m
[32m[I 2022-08-01 16:40:22,476][0m Trial 1 finished with value: 0.9217288930964569 and parameters: {'depth': 6, 'learning_rate': 0.07664924735814954}. Best is trial 1 with value: 0.9217288930964569.[0m
[32m[I 2022-08-01 16:40:23,600][0m Trial 2 finished with value: 0.9079572368906275 and parameters: {'depth': 9, 'learning_rate': 0.074068271568517}. Best is trial 1 with value: 0.9217288930964569.[0m
[32m[I 2022-08-01 16:40:25,123][0m Trial 3 finished with value: 0.8696898578904054 and parameters: {'depth': 10, 'learning_rate': 0.09021178521825836}. Best is trial 1 with value: 0.9217288930964569.[0m
[32m[I 2022-08-01 16:40:25,984][0m Trial 4 finished with value: 0.910159948443151 and parameters: {'depth': 9, 'learning_rate': 0.06864748556624538}. Best is trial 1 with value: 0

{'depth': 6, 'learning_rate': 0.05606137433587405}
0:	learn: 108.6761338	total: 436us	remaining: 436ms
1:	learn: 105.6092123	total: 726us	remaining: 363ms
2:	learn: 102.8957614	total: 1.06ms	remaining: 354ms
3:	learn: 99.7978045	total: 1.38ms	remaining: 344ms
4:	learn: 96.9164608	total: 1.69ms	remaining: 336ms
5:	learn: 94.5906455	total: 2.09ms	remaining: 346ms
6:	learn: 91.6800042	total: 2.35ms	remaining: 333ms
7:	learn: 88.9721543	total: 2.65ms	remaining: 329ms
8:	learn: 86.9704395	total: 2.88ms	remaining: 317ms
9:	learn: 84.2520814	total: 3.17ms	remaining: 314ms
10:	learn: 82.2925882	total: 3.48ms	remaining: 313ms
11:	learn: 80.2871924	total: 3.69ms	remaining: 304ms
12:	learn: 78.0815837	total: 4ms	remaining: 304ms
13:	learn: 76.0109231	total: 4.29ms	remaining: 302ms
14:	learn: 74.0173122	total: 4.54ms	remaining: 298ms
15:	learn: 72.0893035	total: 4.77ms	remaining: 294ms
16:	learn: 70.4877085	total: 5.08ms	remaining: 294ms
17:	learn: 68.9303039	total: 5.34ms	remaining: 292ms
18:	lea

# Check performance of the Pipeline

In [8]:
print('r2 score : ')
print(r2_score(y_test,y_pred))


r2 score : 
0.5543227447979595
