In [2]:
import skopt
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.svm import SVC
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

# Optimization

## Steps involved in HyperOptimization using Scikit-Optimizer

1. Define the space of hyperparameters to search
1. Define the function used to evaluate a given configuration
1. Minimize the loss using Space and Function defined in previous steps.

In [3]:
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/ionosphere.csv'
df = pd.read_csv(url, header=None)
data = df.values

x, y = data[:, :-1], data[:, -1]
print(x.shape, y.shape)

(351, 34) (351,)


In [6]:
import os, sys

project_dir = os.getcwd()
if project_dir not in sys.path:
    sys.path.append(project_dir)

from feature_selection import FeatureSelection

In [20]:
model = FeatureSelection(n_features=1, selected_features=(2**34)-1)
model.fit(x)
model.transform(x).shape

(351, 34)

In [3]:
n_features = x.shape[1]

## 1. Define the space of hyperparameters to search

In [4]:
search_space = list()
search_space.append(Real(1e-6, 100.0, 'log-uniform', name='C'))
search_space.append(Categorical(['linear', 'poly', 'rbf', 'sigmoid'], name='kernel'))
search_space.append(Integer(1, 5, name='degree'))
search_space.append(Real(1e-6, 100.0, 'log-uniform', name='gamma'))
search_space.append(Integer(10, n_features, name='n_features'))

## 2. Define the function used to evaluate a given configuration

In [11]:
@use_named_args(search_space) # https://scikit-optimize.github.io/stable/modules/generated/skopt.utils.use_named_args.html
def evaluate_model(**params):
	model_params = {key: value for key, value in params.items() if key != 'n_features'}
	features_idx = np.linspace(0, n_features-1, params['n_features'], dtype=int)
	# configure the model with specific hyperparameters
	model = SVC()
	model.set_params(**model_params)
	# define test harness
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	# calculate 5-fold cross validation
	result = cross_val_score(model, x[:, features_idx], y, cv=cv, n_jobs=-1, scoring='accuracy')
	# calculate the mean of the scores
	estimate = np.mean(result)
	# convert from a maximizing score to a minimizing score
	return 1.0 - estimate

## 3. Perform optimization

In [12]:
result = gp_minimize(evaluate_model, search_space)

In [13]:
# summarizing finding:
print('Best Accuracy: %.3f' % (1.0 - result.fun))
print('Best Parameters: %s' % (result.x))

Best Accuracy: 0.952
Best Parameters: [0.5810616726309397, 'rbf', 1, 0.19977534459005017, 31]


# Automated Way

The Scikit-Optimize library provides a similar interface for performing a Bayesian Optimization of model hyperparameters via the BayesSearchCV class.

In [None]:
from skopt import BayesSearchCV

# Define search space
params = dict()
params['C'] = (1e-6, 100.0, 'log-uniform')
params['gamma'] = (1e-6, 100.0, 'log-uniform')
params['degree'] = (1,5)
params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the search
search = BayesSearchCV(estimator=SVC(), search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search.fit(x, y)
# report the best result
print(search.best_score_)
print(search.best_params_)

In [None]:
# import scipy.io
# data_dir = "/home/abian/Data/Dataset/IUMA/DermaDatabase/"
# files = list(map(lambda x: os.path.join(data_dir, x), os.listdir(data_dir)))
# for filename in files:
#     mat = scipy.io.loadmat(filename)
#     print(mat)
# mat['patient']

In [None]:
class SVC2(SVC):
    def __init__(self, feature=-1, C=1.0, kernel='rbf', degree=3, gamma='scale'):
        '''
            Parameters:
            ---
                feature (int): features to uses from the dataset
        '''
        pass

In [17]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
a = SelectFromModel(lsvc)
type(a)

sklearn.feature_selection._from_model.SelectFromModel

In [154]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEquidistantSelection(TransformerMixin, BaseEstimator):
    def __init__(self, n_features_to_select=None):
        self.n_features_to_select = n_features_to_select
        self.selected_features_idx = []

    def get_params(self, deep=True):
        return {"n_features_to_select": self.n_features_to_select}       

    def fit(self, X, y=None):
        n_features = X.shape[1]
        self.selected_features_idx = np.linspace(0, n_features-1, self.n_features_to_select, dtype=int)
        return self

    def transform(self, X, y=None, **kwargs):
        return X[:, self.selected_features_idx]

In [155]:
from sklearn.pipeline import Pipeline
a = BandSelection()
print(a.get_params())
new_params = {'n_features_to_select': 10}
a.set_params(**new_params)
print(a.get_params())

model = SVC()
pipe = Pipeline([("transform", a), ('svc', model)])

new_params = {'transform__n_features_to_select': 22, 'svc__C': 0.12}

print(pipe.get_params(deep=False))
pipe.set_params(**new_params)
print(pipe.get_params(deep=True))
# assert pipe.get_params(deep=True) == dict( svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False) )

{'n_features_to_select': None}
{'n_features_to_select': 10}
{'memory': None, 'steps': [('transform', BandSelection(n_features_to_select=10)), ('svc', SVC())], 'verbose': False}
{'memory': None, 'steps': [('transform', BandSelection(n_features_to_select=22)), ('svc', SVC(C=0.12))], 'verbose': False, 'transform': BandSelection(n_features_to_select=22), 'svc': SVC(C=0.12), 'transform__n_features_to_select': 22, 'svc__C': 0.12, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': False, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}


In [156]:
X = np.random.rand(10,25)
a = BandSelection()
new_params = {'n_features_to_select': 20}
a.set_params(**new_params)
a.fit(X)
a.transform(X).shape

(10, 20)

In [178]:
# X = np.random.rand(10,25)
# y = np.random.randint(10,1)
pipe = Pipeline([("transform", a), ('svc', model)])
# pipe = Pipeline([("transform", a)])

new_params = {'transform__n_features_to_select': 26, 'svc__C': 0.1}
pipe.set_params(**new_params)

pipe.fit(x, y)
pipe.predict(x)

array(['g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g',
       'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b',
       'g', 'b', 'g', 'b', 'g', 'b', 'g', 'g', 'g', 'b', 'g', 'b', 'g',
       'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b',
       'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g',
       'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'g', 'g', 'b', 'g', 'b',
       'g', 'b', 'g', 'b', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g',
       'b', 'g', 'b', 'g', 'g', 'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g',
       'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'g', 'g', 'g',
       'g', 'b', 'g', 'g', 'g', 'b', 'g', 'b', 'g', 'g', 'g', 'b', 'g',
       'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'g',
       'g', 'g', 'g', 'b', 'g', 'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g',
       'b', 'g', 'g', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b', 'g', 'b',
       'g', 'b', 'g', 'b', 'g', 'g', 'g', 'b', 'g', 'b', 'g', 'b

BandSelection(n_features_to_select=22)