In [1]:
from lohrasb.best_estimator import BaseModel
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
import optuna
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    make_scorer)
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.linear_model import *


2023-05-02 21:19:28,933 :: dev :: Connected to Ray cluster!
2023-05-02 21:19:28,933 :: dev :: Connected to Ray cluster!
2023-05-02 21:19:30,518 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-05-02 21:19:30,524 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-05-02 21:19:30,526 :: matplotlib :: interactive is False
2023-05-02 21:19:30,528 :: matplotlib :: platform is darwin
2023-05-02 21:19:31,240 :: matplotlib :: CACHEDIR=/Users/hjavedani/.matplotlib
2023-05-02 21:19:31,245 :: matplotlib.font_manager :: Using fontManager instance from /Users/hjavedani/.matplotlib/fontlist-v330.json


#### Example 2 :Computer Hardware Data Set (a regression problem)
  
https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

#### Part 1: Use BestModel in sklearn pipeline


In [2]:
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
# column names
col_names=[
    "vendor name",
    "Model Name",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP"
]
# read data
data = pd.read_csv(urldata,header=None,names=col_names,sep=',')
data

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
adviser,32/60,125,256,6000,256,16,128,198,199
amdahl,470v/7,29,8000,32000,32,8,32,269,253
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...
sperry,80/8,124,1000,8000,0,1,8,42,37
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
sratus,32,125,2000,8000,0,2,14,52,41
wang,vs-100,480,512,8000,32,0,0,67,47


#### Train test split

In [3]:
X = data.loc[:, data.columns != "PRP"]
y = data.loc[:, data.columns == "PRP"]
y = y.values.ravel()


X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


#### Find feature types for later use

In [4]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


####  Define estimator and set its arguments  

In [5]:
estimator = LinearRegression()
estimator_params= {
        "fit_intercept": [True, False],
    }


In [6]:
obj = BaseModel().optimize_by_optuna(
            estimator=estimator,
            estimator_params=estimator_params,
            measure_of_accuracy="mean_absolute_error(y_true, y_pred, multioutput='uniform_average')",
            with_stratified=False,
            test_size=.3,
            verbose=3,
            n_jobs=-1,
            random_state=42,
            # optuna params
            # optuna study init params
            study=optuna.create_study(
                storage=None,
                sampler=TPESampler(),
                pruner=HyperbandPruner(),
                study_name=None,
                direction="minimize",
                load_if_exists=False,
                directions=None,
            ),
            # optuna optimization params
            study_optimize_objective=None,
            study_optimize_objective_n_trials=10,
            study_optimize_objective_timeout=600,
            study_optimize_n_jobs=-1,
            study_optimize_catch=(),
            study_optimize_callbacks=None,
            study_optimize_gc_after_trial=False,
            study_optimize_show_progress_bar=False,
        )

[32m[I 2023-05-02 21:19:32,037][0m A new study created in memory with name: no-name-4fb713e4-dd44-4e9f-993e-1675997cb060[0m


#### Build sklearn pipeline

In [7]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # regression model 
            ('obj', obj),


 ])



#### Run Pipeline

In [8]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


[32m[I 2023-05-02 21:19:32,140][0m Trial 0 finished with value: 14.770921159757803 and parameters: {'fit_intercept': 0}. Best is trial 0 with value: 14.770921159757803.[0m
[32m[I 2023-05-02 21:19:32,143][0m Trial 2 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,146][0m Trial 1 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,154][0m Trial 5 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,156][0m Trial 4 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,160][0m Trial 6 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Be

#### Check performance of the pipeline

In [9]:
print('r2 score : ')
print(r2_score(y_test,y_pred))


r2 score : 
0.9401429300951223


#### Part 2:  Use BestModel as a standalone estimator 

In [10]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


#### Transform features to make them ready for model input

In [11]:
transform_pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # classification model

 ])

#### Transform X_train and X_test

In [12]:
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [13]:
obj.fit(X_train,y_train)
y_pred = obj.predict(X_test)

[32m[I 2023-05-02 21:19:32,365][0m Trial 11 finished with value: 14.770921159757803 and parameters: {'fit_intercept': 0}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,371][0m Trial 12 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,375][0m Trial 10 finished with value: 14.770921159757803 and parameters: {'fit_intercept': 0}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,387][0m Trial 16 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,394][0m Trial 15 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 1}. Best is trial 2 with value: 13.714193910659512.[0m
[32m[I 2023-05-02 21:19:32,394][0m Trial 13 finished with value: 13.714193910659512 and parameters: {'fit_intercept': 

#### Check performance of the model

In [14]:
print('r2 score : ')
print(r2_score(y_test,y_pred))
print('mean_absolute_error : ')
print(mean_absolute_error(y_test,y_pred))


r2 score : 
0.9401429300951223
mean_absolute_error : 
25.975645074588805


In [15]:
obj.get_best_estimator()

In [16]:
obj.best_estimator

#### Get fitted randomized search object and its attributes

In [17]:
OptunaObj = obj.get_optimized_object()
OptunaObj.trials

[FrozenTrial(number=0, values=[14.770921159757803], datetime_start=datetime.datetime(2023, 5, 2, 21, 19, 32, 116678), datetime_complete=datetime.datetime(2023, 5, 2, 21, 19, 32, 140702), params={'fit_intercept': 0}, distributions={'fit_intercept': IntUniformDistribution(high=True, low=False, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=1, values=[13.714193910659512], datetime_start=datetime.datetime(2023, 5, 2, 21, 19, 32, 119864), datetime_complete=datetime.datetime(2023, 5, 2, 21, 19, 32, 146285), params={'fit_intercept': 1}, distributions={'fit_intercept': IntUniformDistribution(high=True, low=False, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=2, values=[13.714193910659512], datetime_start=datetime.datetime(2023, 5, 2, 21, 19, 32, 134888), datetime_complete=datetime.datetime(2023, 5, 2, 21, 19, 32, 