# Categorical parameters tuning

____


In [1]:
import sys
sys.path.append('..')

# --- Dependencies
import pygmo as pg
import numpy as np
import pandas as pd
# import dovpanda

import plotly
from plotly import graph_objs as go
from plotly import tools
import plotly.express as px

import category_encoders as ec
import sklearn.gaussian_process as gp
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

from src.composite import PredictTutor, ModelsUnion
from src.generator import SamplesGenerator
from src.ploting import plot_mo
from src.search import Nsga2

from src.hypothesis.tpot_estimator import TpotWrp
from src.hypothesis.custom_gp_kernel import KERNEL_MAUNA, KERNEL_SIMPLE, KERNEL_GPML

In [2]:
# Draw a training sample from pygmo's WFG problem (prob_id=4) with
# 4 decision variables and 2 objectives, using a population of 30.
prob = pg.problem(pg.wfg(prob_id=4, dim_dvs=4, dim_obj=2, dim_k=1))
pop = pg.population(prob=prob, size=30)
X, y = pop.get_x(), pop.get_f()

# Gaussian-process surrogate built on the project's KERNEL_GPML kernel.
model = gp.GaussianProcessRegressor(
    kernel=KERNEL_GPML,
    alpha=0,
    n_restarts_optimizer=10,
    normalize_y=True,
)

# Kept as the last expression so the notebook echoes the fitted estimator.
model.fit(X, y)

GaussianProcessRegressor(alpha=0, copy_X_train=True,
                         kernel=66**2 * RBF(length_scale=67) + 2.4**2 * RBF(length_scale=90) * ExpSineSquared(length_scale=1.3, periodicity=1) + 0.66**2 * RationalQuadratic(alpha=0.78, length_scale=1.2) + 0.18**2 * RBF(length_scale=0.134) + WhiteKernel(noise_level=0.0361),
                         n_restarts_optimizer=10, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)

In [3]:
# Run the project's Nsga2 search over the fitted surrogate, using the bounds
# of the WFG problem and a population of 8.
nsga = Nsga2(bounds=prob.get_bounds(), pop_size=8)
nsga.fit([model])

NSGA2: Evolve GaussianProcessRegressor by 8 population size in 100 generation


Nsga2(bounds=None, gen=100, mask_col=None, mask_val=None, models=None,
      pop_size=8)

In [4]:
nsga.predict()

array([[0.49892974, 1.6379138 , 4.90855333, 6.1396614 ],
       [0.49892974, 1.6379138 , 4.90855333, 6.1396614 ],
       [0.03634494, 1.43939914, 5.2360753 , 3.8787892 ],
       [0.03634494, 1.43207006, 5.2360753 , 5.82107498],
       [0.03634494, 1.43325931, 5.1733399 , 3.34259492],
       [0.60305841, 1.63746305, 4.5576071 , 3.88425834],
       [0.03634494, 1.43939914, 4.47290247, 3.88375243],
       [0.02422551, 1.62047846, 4.90855333, 6.15878818]])

In [5]:
# Fix decision-vector columns 0 and 2 to constant values before predicting.
# The recorded output of this cell shows both columns held at 1.11 and 2.22
# in every returned row.
c = [0,2]
v = [1.11,2.22]

nsga.set_mask(c, v)
# nsga.set_mask(None, None)
# Last expression: the notebook echoes the predicted decision vectors.
nsga.predict()


NSGA2: Evolve GaussianProcessRegressor by 8 population size in 100 generation


array([[1.11      , 1.80158016, 2.22      , 3.18968301],
       [1.11      , 3.7687445 , 2.22      , 3.46384888],
       [1.11      , 3.7687445 , 2.22      , 3.46384888],
       [1.11      , 1.80158016, 2.22      , 3.18968301],
       [1.11      , 3.06560436, 2.22      , 3.18963185],
       [1.11      , 3.7687445 , 2.22      , 3.18067316],
       [1.11      , 2.68745911, 2.22      , 3.18319736],
       [1.11      , 2.59691167, 2.22      , 3.19160185]])

In [6]:
# Problem
# Wrap the fitted surrogate as a pygmo problem and sample 5 individuals.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'Pagmo_problem' is not defined" - the class is defined in a
# cell further down the notebook, which has to be executed first.
inst = Pagmo_problem(models=[model], bounds=prob.get_bounds())
c_prob = pg.problem(inst)
# model.predict([[0,0,0]])
pop = pg.population(prob=c_prob, size = 5, seed=42)
# X and y are rebound here, replacing the training sample of the same name.
X = pop.get_x()
y = pop.get_f()
X

NameError: name 'Pagmo_problem' is not defined

In [None]:
# Same experiment with a mask: columns 0 and 2 of the decision vector are
# fixed to the given values before the problem is handed to pygmo.
col = [0, 2]
value = [1.111, 2.2222]

# inst.set_mask(col=col, value=value)
inst2 = Pagmo_problem(models=[model], bounds=prob.get_bounds())
inst2.set_mask(col=col, value=value)

c_prob2 = pg.problem(inst2)
pop2 = pg.population(prob=c_prob2, size=5, seed=42)

# Rebinds the notebook-level X and y to the sampled population.
X, y = pop2.get_x(), pop2.get_f()
X

In [None]:
y

In [None]:
# Re-insert the fixed columns into a copy of X, one column at a time, then
# feed the widened matrix to the surrogate.
x_test = np.array(X, copy=True)
for c, v in zip(col, value):
    x_test = np.insert(x_test, c, v, axis=1)

model.predict(x_test)

In [None]:
x_test

In [None]:
# Scratch cell: drop columns 0 and 2 from X and inspect what is left.
col = [0, 2]
val = [1111, 2222]

temp = np.delete(X, col, axis=1)

# Earlier experiments, kept for reference:
# temp[:, [0,2]]
# temp[0][[0,2]]

# np.insert(temp[:, [0,2]], [11,22])
# temp

# for c, v in zip(col, val):
#     temp = np.insert(temp, c, v, 1)

temp

In [None]:
X

In [None]:
temp

In [None]:
X[:, [0,2]] = [11,22]
X

In [None]:
prob.get_bounds()[1][[0,2]]

In [None]:
prob.get_bounds()[1][None]

In [None]:
tuple(b[[0,2]] for b in prob.get_bounds())

In [None]:
["s", "d", "s"] > "s"

In [None]:
from typing import List, Tuple
from sklearn.base import BaseEstimator


class Pagmo_problem:
    """User-defined problem for ``pg.problem`` backed by estimator predictions.

    The objective values of a decision vector are whatever the wrapped
    estimators' ``predict`` returns for it. Optionally a *mask* fixes selected
    columns of the full decision vector to constant values; ``get_bounds``
    then exposes only the remaining columns and ``fitness`` re-inserts the
    fixed values before calling the estimators.

    Parameters
    ----------
    models : list
        Estimators exposing ``predict``.
    bounds : tuple, optional
        ``(lower, upper)`` bounds of the full (unmasked) decision vector.
    is_single : bool, default False
        When True the predicted objectives are averaged into a single value.
    m_col, m_value : sequence, optional
        Initial mask: column indices in the full decision vector and the
        values those columns are fixed to.
    """

    def __init__(self, models: List["BaseEstimator"],
                 bounds: Tuple[List] = None,
                 is_single=False,
                 m_col=None,
                 m_value=None):
        self._estimators = models
        self.__target_func = 'predict'
        self._bounds = bounds
        self._is_single = is_single
        self._mask_columns = m_col
        self._mask_value = m_value

    def _has_mask(self):
        """Return True when both mask columns and mask values are set."""
        # Identity checks instead of ``None in (...)``: the latter compares
        # with ``==`` and is ambiguous when the mask is a numpy array.
        return self._mask_columns is not None and self._mask_value is not None

    def set_mask(self, col, value):
        """Fix the columns ``col`` of the decision vector to ``value``.

        Passing ``None`` for both arguments removes the mask.

        Raises
        ------
        ValueError
            If ``col`` and ``value`` differ in length.
        """
        if col is None and value is None:
            self._mask_columns = None
            self._mask_value = None
            return
        if len(col) != len(value):
            raise ValueError(
                f"Columns and values should be equal length. Columns: {col}, values: {value}")
        self._mask_columns = col
        self._mask_value = value

    def get_mask(self, col=None, value=None):
        """Return the current mask as ``(columns, values)``.

        ``col`` and ``value`` are ignored; they are accepted only so that
        calls written against the old two-argument signature keep working.
        """
        return (self._mask_columns, self._mask_value)

    def fitness(self, x):
        """Evaluate the objectives for the decision vector ``x``.

        With a mask set, ``x`` holds only the free variables and the fixed
        values are inserted at their original positions first.
        """
        x = np.array(x)
        if self._has_mask():
            # Insert in ascending column order: each insertion shifts the
            # following positions, so any other order misplaces the values.
            pairs = sorted(zip(self._mask_columns, self._mask_value),
                           key=lambda pair: pair[0])
            for c, v in pairs:
                x = np.insert(x, c, v, 0)

        return self._singl_obj(x) if self._is_single else self._multi_obj(x)

    def _multi_obj(self, x):
        """Return the flattened predictions of all estimators as a list."""
        f_vector = [self.__evaluate(e, x) for e in self._estimators]
        return np.array(f_vector).flatten().tolist()

    def _singl_obj(self, x):
        """Return the mean of all predicted objectives as a one-element list."""
        fw_vector = self._multi_obj(x)
        return [np.mean(fw_vector)]

    def __evaluate(self, estimator, x):
        """Call the target method of ``estimator`` on ``x`` as a single row."""
        result = getattr(estimator, self.__target_func)(x.reshape(1, -1))
        return result.tolist()[0]

    def get_nobj(self):
        """Return the number of objectives.

        Single-objective mode yields 1. With several estimators the count is
        the number of estimators. With exactly one estimator it is the width
        of its prediction for the lower bound of the full decision vector.
        """
        if self._is_single:
            return 1
        if len(self._estimators) > 1:
            return len(self._estimators)
        prediction = self._estimators[0].predict([self._bounds[0]])
        return len(prediction[0])

    # def get_nix(self):
    #     return len(self._bounds[0])

    # Return bounds of decision variables
    def get_bounds(self):
        """Return ``(lower, upper)`` bounds, without the masked columns."""
        if self._has_mask():
            return tuple(np.delete(b, self._mask_columns, 0).flatten() for b in self._bounds)
        return self._bounds

    def set_bounds(self, bounds: Tuple[List]):
        """Replace the bounds of the full decision vector; returns ``self``."""
        self._bounds = bounds
        return self

    # Return function name
    def get_name(self):
        """Return the estimator class names joined by ``" vs "``."""
        return " vs ".join([type(t).__name__ for t in self._estimators])


In [None]:
# Rebuild the GP surrogate with the same settings as the first fit and train
# it on whatever X and y currently hold in the notebook namespace.
model = gp.GaussianProcessRegressor(kernel=KERNEL_GPML,
                                    alpha=0,
                                    n_restarts_optimizer=10,
                                    normalize_y=True)

model.fit(X, y)

class my_udp:
    """Minimal pygmo problem that scores points with the notebook's ``model``.

    Bounds and the reported name are taken from the notebook-level ``pro``
    problem.
    """

    def fitness(self, x):
        """Return the model prediction for the single point ``x`` as an array."""
        return np.array(model.predict([x]).tolist()[0])

    def get_bounds(self):
        """Return the bounds of ``pro``."""
        return pro.get_bounds()

    def gradient(self, x):
        """Estimate the gradient of ``fitness`` at ``x`` numerically."""
        return pg.estimate_gradient_h(self.fitness, x)

    def get_name(self):
        """Return the name of ``pro``."""
        return pro.get_name()
    
# Evolve a one-individual population on the surrogate problem and plot the
# result as a 3-D marker on top of the problem surface.
# NOTE(review): `algo` and `plot_so` are not defined or imported anywhere in
# the visible notebook (only `plot_mo` is imported), and `pro`, which my_udp
# relies on, is created in a later cell - confirm the intended execution order.
pop = pg.population(prob = my_udp(), size = 1)
pop = algo.evolve(pop)

solution = go.Scatter3d(x=pop.get_x()[:,0], 
                        y=pop.get_x()[:,1],
                        z=np.array(pop.get_f()).flatten(),
                        mode='markers',
                        name='bobyqa'
                        )
plot_so(my_udp(), extra_plot=solution, samples=250)

___
___

In [None]:
# df_NB = pd.read_csv("../src/scenario/NB_final_result.csv")
# df_RF = pd.read_csv("../src/scenario/RF_final_result.csv")
# df_NB1 = pd.read_csv("../src/scenario/taskNB1.csv")

In [None]:
df_NB

In [None]:
# px.parallel_categories(df_NB1, color="PREC_AT_99_REC", dimensions=list(df_NB.columns),
#                        color_continuous_scale=px.colors.sequential.Inferno)

In [None]:
# Separate the target metric from the feature columns.
# NOTE(review): df_NB1 is only loaded by a read_csv call that is currently
# commented out in an earlier cell.
y = df_NB1[["PREC_AT_99_REC"]]
X = df_NB1.drop(columns=["PREC_AT_99_REC"])
df_NB1

In [None]:
# Two alternative preprocessing set-ups for the task table. Only
# `preprocessor2` is fitted at the end of this cell; `preprocessor` and the
# two transformers it combines are built but not used here.

# -- Numerical
numeric_features = ['application_grid_size']
numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

# -- Categorical
categorical_features = ['bandwidth_selection', 'use_application_grid']
# NOTE(review): the step is named 'woe' but holds a CatBoostEncoder.
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('woe', ec.CatBoostEncoder())])

preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

# Chain of two target-based encoders, each restricted to its own columns.
preprocessor2 = Pipeline(steps=[
        ('james', ec.JamesSteinEncoder(cols=['bandwidth_selection', 'use_application_grid'])),
        ('cat', ec.CatBoostEncoder(cols=['estimation_mode', 'laplace_correction', 'bandwidth']))
])

# pipe = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('classifier', RandomForestClassifier(n_estimators=500))])

# model = pipe.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# print(encoder)
# print(f1_score(y_test, y_pred, average='macro'))

# Last expression: show the encoded feature table.
pd.DataFrame(preprocessor2.fit_transform(X, y))

___
## Model Tutor


In [None]:
# Candidate surrogate models.

# 1: TPOT wrapper from the project
tea_pot = TpotWrp(generations=2, population_size=10, random_state=42)

# 2: Gaussian process with the project's simple kernel
gp_sim = gp.GaussianProcessRegressor(
    kernel=KERNEL_SIMPLE,
    alpha=0,
    n_restarts_optimizer=10,
    normalize_y=True,
)

# 3: gradient boosting inside a ModelsUnion with split_y=True
grad_uni = ModelsUnion(models=[GradientBoostingRegressor(n_estimators=200)], split_y=True)

# 4: linear regression, wrapped the same way
lin_uni = ModelsUnion(models=[LinearRegression()], split_y=True)

# bounds = ([min(X[column]) for column in X], [max(X[column]) for column in X])
# tutor = PredictTutor(bounds, portfolio=[GradientBoostingRegressor(n_estimators=200), LinearRegression()])
# tutor.fit(X_train, y_train)
# tutor.predict(X_train, y_train, n=1)

In [None]:
# Binary-encode every feature column of the full table once; the per-column
# minima and maxima of the encoded table are the bounds given to the tutor.
# (A Pipeline previously assigned to `trns` was overwritten on the very next
# statement without being used, so it has been dropped.)
trns = ec.BinaryEncoder(cols=X.columns.tolist())
trns_X = trns.fit_transform(X)
bounds = ([min(trns_X[column]) for column in trns_X], [max(trns_X[column]) for column in trns_X])


# The pipeline applies its own BinaryEncoder step before the tutor.
pipe = Pipeline(steps=[
        ('bin', ec.BinaryEncoder(cols=X.columns.tolist())),
        ('tutor', PredictTutor(bounds, portfolio=[GradientBoostingRegressor(
            n_estimators=200), LinearRegression()]))
])

X_train, X_test, y_train, y_test = train_test_split(X, y)
# `tutor__cv` follows the step__parameter convention for fit parameters.
pipe.fit(X_train, y_train, tutor__cv=4)
# NOTE(review): depending on the scikit-learn version, predict keyword
# arguments may reach the final step unchanged, i.e. as `tutor__n` rather
# than `n` - confirm PredictTutor.predict accepts this.
pred = pipe.predict(X_test, tutor__n=4)
pred

In [None]:
pd.DataFrame(pred)

In [None]:
y

___
### Preprocessing and prediction by hand

In [None]:
# Same flow as the pipeline cell, done step by step on plain numpy arrays.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values)
# --- Encoder
# Columns are addressed by position because X_train is an array here.
coder = ec.BaseNEncoder(cols=list(range(X_train.shape[1]))).fit(X_train)
X_train_ec = coder.transform(X_train)

# NOTE(review): bounds are still taken from `trns_X`, the table produced by
# the BinaryEncoder in an earlier cell, not from `X_train_ec` - confirm both
# encodings have the same layout.
bounds = ([min(trns_X[column]) for column in trns_X], [max(trns_X[column]) for column in trns_X])
ptutor = PredictTutor(bounds, portfolio=[GradientBoostingRegressor(
            n_estimators=200), LinearRegression()])

ptutor.fit(X_train_ec, y_train)
# NOTE(review): predict is called with None instead of a feature matrix;
# presumably the tutor proposes points itself - verify against PredictTutor.
pred = ptutor.predict(None)
# X_train_ec
pred

In [None]:
coder.inverse_transform(pred)

# Review integer problem from pygmo

In [None]:
# Build pygmo's mixed-integer Rastrigin problem with 2 integer dimensions.
udp = pg.minlp_rastrigin(dim_i=2)
# udp = pg.zdt(prob_id=ID, param=DIM)
# `pro` is the notebook-level name that my_udp reads its bounds and name from.
pro = pg.problem(udp)
# The bounds are computed but not shown: only the last expression is echoed.
pro.get_bounds()
pro

In [None]:
np.array(y)[[1,34],:]

In [None]:
# coder.inverse_transform(list(X_train_ec.values))
# coder.inverse_transform(pred)
X.values

In [None]:
trns_X

In [None]:
# px.parallel_categories(df_NB1, color="PREC_AT_99_REC", dimensions=list(df_NB.columns),
#                        color_continuous_scale=px.colors.sequential.Inferno)