In [1]:
from lohrasb.best_estimator import BaseModel
import xgboost
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score
    )
import catboost


ModuleNotFoundError: No module named 'lohrasb'

# Example 1: Computer Hardware Data Set (a regression problem)
  
https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

In [None]:
# Source: UCI Machine Learning Repository, "Computer Hardware" data set.
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
# The raw file has no header row and ten comma-separated fields per record.
# All ten must be named: if fewer names than fields are supplied, pandas
# treats the surplus leading field as an implicit index and every label
# shifts one column to the left (so "PRP" would actually hold ERP).
col_names = [
    "vendor name",
    "Model Name",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP",
    "ERP",
]
# Read the data, then drop ERP: it is the original article's own estimate of
# PRP (the target), so keeping it among the features would leak the target.
data = pd.read_csv(
    urldata, header=None, names=col_names, sep=','
).drop(columns=["ERP"])
# Preview only the first rows instead of rendering the whole frame.
data.head()

# Train test split

In [None]:
# Features are every column except the target; the target is kept as a
# one-column DataFrame (not a Series).
target_col = "PRP"
X = data.drop(columns=target_col)
y = data[[target_col]]

# Hold out a third of the rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


# Find feature types for later use

In [None]:
# Group the training columns by dtype family (integer, float, object) so the
# later preprocessing steps can target each group by name.
int_cols, float_cols, cat_cols = (
    X_train.select_dtypes(include=[dtype_family]).columns.tolist()
    for dtype_family in ('int', 'float', 'object')
)


# Define the hyperparameter-optimized estimator and set its arguments

In [None]:



# Wrap a CatBoost regressor in lohrasb's BaseModel, configured to tune the
# listed hyperparameters with Optuna and to score candidates with r2.
SFC_CATREG_OPTUNA = BaseModel(
        estimator=catboost.CatBoostRegressor(),
        estimator_params={
                  # desired lower bound and upper bound for depth
                  'depth'         : [6,10],
                  # desired lower bound and upper bound for learning_rate
                  'learning_rate' : [0.05, 0.1],
                    },
        hyper_parameter_optimization_method="optuna",
        measure_of_accuracy="r2",
        test_size=0.33,
        # shuffled 3-fold CV with a fixed seed so the folds are repeatable
        cv=KFold(n_splits=3, random_state=42, shuffle=True),
        with_stratified=False,
        verbose=0,
        random_state=42,
        n_jobs=-1,
        n_iter=100,
        eval_metric=None,
        number_of_trials=20,
        # Seed the sampler too: an unseeded TPESampler proposes different
        # trials on every run even though everything else is pinned to 42.
        # NOTE(review): with n_jobs=-1 trial order may still vary if trials
        # run in parallel -- confirm against lohrasb's behavior.
        sampler=TPESampler(seed=42),
        pruner=HyperbandPruner(),
    )


# Build sklearn Pipeline

In [None]:


# Preprocessing and model steps, applied in the order listed.
pipeline_steps = [
    # fill missing values in the integer columns with the column median
    (
        'intimputer',
        MeanMedianImputer(imputation_method='median', variables=int_cols),
    ),
    # fill missing values in the categorical (object) columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # encode categorical values as ordinal integers
    ('catencoder', OrdinalEncoder()),
    # final step: the Optuna-tuned CatBoost regressor
    ('SFC_CATREG_OPTUNA', SFC_CATREG_OPTUNA),
]

pipeline = Pipeline(pipeline_steps)



# Run Pipeline

In [None]:
# Fit every pipeline step on the training split, then predict the held-out
# rows with the fitted pipeline (Pipeline.fit returns the pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


# Check performance of the Pipeline

In [None]:
# Report the coefficient of determination on the held-out test split:
# label on the first line, score on the second.
print('r2 score : ', r2_score(y_test, y_pred), sep='\n')
