In [1]:
from lohrasb.best_estimator import BaseModel
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
import optuna
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    make_scorer)
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.linear_model import *


2023-07-03 17:15:40,392 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-07-03 17:15:40,398 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-07-03 17:15:40,401 :: matplotlib :: interactive is False
2023-07-03 17:15:40,402 :: matplotlib :: platform is darwin
2023-07-03 17:15:40,416 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 17:15:40,418 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-07-03 17:15:40,420 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
2023-07-03 17:15:40,421 :: graphviz._tools :: deprecate positional args: graphviz.backend.viewing.view(['quiet'])
2023-07-03 17:15:40,425 :: graphviz._tools :: deprecate positio

#### Example 2 :Computer Hardware Data Set (a regression problem)
  
https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

#### Part 1: Use BestModel in sklearn pipeline


In [2]:
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
# column names
col_names=[
    "vendor name",
    "Model Name",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP"
]
# read data
data = pd.read_csv(urldata,header=None,names=col_names,sep=',')
data

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
adviser,32/60,125,256,6000,256,16,128,198,199
amdahl,470v/7,29,8000,32000,32,8,32,269,253
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...
sperry,80/8,124,1000,8000,0,1,8,42,37
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
sratus,32,125,2000,8000,0,2,14,52,41
wang,vs-100,480,512,8000,32,0,0,67,47


#### Train test split

In [3]:
X = data.loc[:, data.columns != "PRP"]
y = data.loc[:, data.columns == "PRP"]
y = y.values.ravel()


X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


#### Find feature types for later use

In [4]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


####  Define estimator and set its arguments  

In [5]:
estimator = LinearRegression()
estimator_params= {
        "fit_intercept": [True, False],
    }

kwargs = {  # params for fit method  
            'fit_optuna_kwargs' :{
            'sample_weight':None,
            },
            # params for OptunaSearch
            'main_optuna_kwargs' : {
            'estimator':estimator,
            'estimator_params':estimator_params,
            'refit':True,
            'measure_of_accuracy' :"mean_absolute_error(y_true, y_pred, multioutput='uniform_average')"

            },
            'train_test_split_kwargs':{
                'test_size':.3,
                            
            },
            'study_search_kwargs':{
                'storage':None,
                'sampler':TPESampler(),
                'pruner':HyperbandPruner(),
                'study_name':"example of optuna optimizer",
                'direction':"maximize",
                'load_if_exists':False,
            },
            'optimize_kwargs':{
                # optuna optimization params
                'n_trials':20,
                'timeout':600,
                'catch':(),
                'callbacks':None,
                'gc_after_trial':False,
                'show_progress_bar':False,
            }
}


In [6]:
obj = BaseModel().optimize_by_optuna(
        kwargs=kwargs
        )


#### Build sklearn pipeline

In [7]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # regression model 
            ('obj', obj),


 ])



#### Run Pipeline

In [8]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


[I 2023-07-03 17:15:41,499] A new study created in memory with name: example of optuna optimizer
[I 2023-07-03 17:15:41,504] Trial 0 finished with value: 17.224204194190605 and parameters: {'fit_intercept': 0}. Best is trial 0 with value: 17.224204194190605.
[I 2023-07-03 17:15:41,509] Trial 1 finished with value: 15.746844270792897 and parameters: {'fit_intercept': 1}. Best is trial 0 with value: 17.224204194190605.
[I 2023-07-03 17:15:41,516] Trial 2 finished with value: 17.224204194190605 and parameters: {'fit_intercept': 0}. Best is trial 0 with value: 17.224204194190605.
[I 2023-07-03 17:15:41,521] Trial 3 finished with value: 15.746844270792897 and parameters: {'fit_intercept': 1}. Best is trial 0 with value: 17.224204194190605.
[I 2023-07-03 17:15:41,526] Trial 4 finished with value: 17.224204194190605 and parameters: {'fit_intercept': 0}. Best is trial 0 with value: 17.224204194190605.
[I 2023-07-03 17:15:41,530] Trial 5 finished with value: 15.746844270792897 and parameters: {

2023-07-03 17:15:41,651 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!
2023-07-03 17:15:41,651 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!


#### Check performance of the pipeline

In [9]:
print('r2 score : ')
print(r2_score(y_test,y_pred))


r2 score : 
0.9192355855868294


#### Part 2: Another way of using it

In [10]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


#### Transform features to make them ready for model input

In [11]:
transform_pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # classification model

 ])

#### Transform X_train and X_test

In [12]:
X_train=transform_pipeline.fit_transform(X_train,y_train)
X_test=transform_pipeline.transform(X_test)


#### Train model and predict

In [13]:
obj.fit(X_train,y_train)
y_pred = obj.predict(X_test)

[I 2023-07-03 17:15:41,725] A new study created in memory with name: example of optuna optimizer
[I 2023-07-03 17:15:41,733] Trial 0 finished with value: 17.30868987281197 and parameters: {'fit_intercept': 1}. Best is trial 0 with value: 17.30868987281197.
[I 2023-07-03 17:15:41,739] Trial 1 finished with value: 17.30868987281197 and parameters: {'fit_intercept': 1}. Best is trial 0 with value: 17.30868987281197.
[I 2023-07-03 17:15:41,743] Trial 2 finished with value: 17.30868987281197 and parameters: {'fit_intercept': 1}. Best is trial 0 with value: 17.30868987281197.
[I 2023-07-03 17:15:41,749] Trial 3 finished with value: 17.536991316420217 and parameters: {'fit_intercept': 0}. Best is trial 3 with value: 17.536991316420217.
[I 2023-07-03 17:15:41,754] Trial 4 finished with value: 17.30868987281197 and parameters: {'fit_intercept': 1}. Best is trial 3 with value: 17.536991316420217.
[I 2023-07-03 17:15:41,758] Trial 5 finished with value: 17.536991316420217 and parameters: {'fit_in

2023-07-03 17:15:41,978 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!
2023-07-03 17:15:41,978 :: dev :: If refit is set to True, the optimal model will be refit on the entire dataset, i.e., X_train and y_train!


#### Check performance of the model

In [14]:
print('r2 score : ')
print(r2_score(y_test,y_pred))
print('mean_absolute_error : ')
print(mean_absolute_error(y_test,y_pred))


r2 score : 
0.9192355855868294
mean_absolute_error : 
30.288069195735655


In [15]:
obj.get_best_estimator()

In [16]:
obj.best_estimator

#### Get fitted search object and its attributes

In [17]:
OptunaObj = obj.get_optimized_object()
OptunaObj

FrozenTrial(number=3, state=TrialState.COMPLETE, values=[17.536991316420217], datetime_start=datetime.datetime(2023, 7, 3, 17, 15, 41, 744969), datetime_complete=datetime.datetime(2023, 7, 3, 17, 15, 41, 749112), params={'fit_intercept': 0}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'fit_intercept': IntDistribution(high=1, log=False, low=0, step=1)}, trial_id=3, value=None)