In [None]:
import os, sys
# Put the notebook's working directory on the import path (once) so the
# project-local modules imported further down can be resolved.
project_dir = os.getcwd()
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)

import numpy as np
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.svm import SVC
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

In [None]:
from dataset import DermaDataset

# Root folder of the data cubes. The original hard-coded location is kept as
# the default, but it can be overridden with the DERMA_DATASET_ROOT environment
# variable so the notebook also runs on machines with a different layout.
dataset_root_dir = os.environ.get(
    "DERMA_DATASET_ROOT",
    "/home/abian/Data/Dataset/IUMA/DermaDatabase/dataCubes/",
)
# Sub-folders that are merged into a single pool of samples.
train_dir = ['train', 'validation']
dataset_dir = [os.path.join(dataset_root_dir, split) for split in train_dir]

dataset = DermaDataset(dataset_dir)
# presumably x holds the feature matrix and y the class labels — confirm against DermaDataset.get()
x, y = dataset.get()

# Dataset balancing
**https://imbalanced-learn.org/stable/under_sampling.html**

In [None]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# Alternative (disabled): randomly select a subset of the samples of the targeted classes.
# rus = RandomUnderSampler(random_state=123)
# x, y = rus.fit_resample(x, y)

# NearMiss-1 under-sampling.
# "Positive" samples are those belonging to the targeted class that is being under-sampled;
# "negative" samples are those from the minority class.
# The positive samples that are kept are the ones whose average distance to their
# N closest negative samples is the smallest.
# NOTE: x and y are overwritten with the resampled data, so the original
# (unbalanced) arrays are no longer available after this cell has run.
nm1 = NearMiss(version=1)
x, y = nm1.fit_resample(x, y)

# Feature Reduction

Due to a computational limitation, feature selection by Bayesian optimization can currently handle at most 64 features. This step therefore reduces the number of features beforehand, using a tree-based feature importance score.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

# Rank the features with a random forest. The number of trees is tuned with a
# 10-fold cross-validated grid search that uses all available cores.
# clf = ExtraTreesClassifier(n_estimators=150)
clf = RandomForestClassifier(random_state=123)
clf_params = {'n_estimators': [50, 150, 500, 1000]}

# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a TypeError.
# `iid=False` has been the default behaviour since 0.22, so on those versions
# the search result is unchanged.
fs_clf = GridSearchCV(clf, clf_params, cv=10, n_jobs=-1)
fs_clf.fit(x, y)

In [None]:
# Show the forest picked by the grid search and keep its per-feature
# importance scores for inspection.
best_forest = fs_clf.best_estimator_
print("Best estimator: {}".format(best_forest))
fi = best_forest.feature_importances_

In [None]:
threshold = 0.0075  # Empirical value: minimum importance a feature needs to be kept

# prefit=True reuses the forest already fitted by the grid search instead of refitting it.
fs_model = SelectFromModel(fs_clf.best_estimator_, threshold=threshold, prefit=True)
X_new = fs_model.transform(x)

# Ask the selector itself which columns it kept. SelectFromModel keeps features
# whose importance is greater than OR EQUAL to the threshold, so recomputing the
# indices with a strict `>` could disagree with the columns present in X_new.
print("Features selected: {}".format(fs_model.get_support(indices=True)))
print("X shape: {}".format(X_new.shape))

# Number of features that survived the reduction step.
n_features = X_new.shape[1]

# Parameters optimization

In [None]:
from feature_selection import FeatureSelection, FeatureEquidistantSelection
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV

In [None]:
# Dimensions explored by the Bayesian optimizer. Each name follows the
# `<pipeline step>__<parameter>` convention so that a sampled point can be
# handed directly to Pipeline.set_params.
# Previous variant of the first dimension, kept for reference:
# Integer(1, float(2**(116)-1), 'log-uniform', name='transform__selected_features')
search_space = [
    # presumably a 64-bit mask of the selected features — confirm against FeatureSelection
    Integer(1, 2**64 - 1, 'uniform', name='transform__selected_features', dtype=np.uint64),
    Real(1e-6, 100.0, 'log-uniform', name='svc__C'),
    Categorical(['linear', 'poly', 'rbf', 'sigmoid'], name='svc__kernel'),
    Integer(1, 5, name='svc__degree'),
    Real(1e-6, 100.0, 'log-uniform', name='svc__gamma'),
]

In [None]:
# Objective function handed to the optimizer: scores one configuration.
@use_named_args(search_space)  # https://scikit-optimize.github.io/stable/modules/generated/skopt.utils.use_named_args.html
def evaluate_model(**params):
    """Return the loss (1 - mean accuracy) of one hyperparameter configuration.

    The keyword arguments are the named dimensions of ``search_space``; they are
    applied to a ``FeatureSelection`` -> ``SVC`` pipeline via ``set_params``.
    The pipeline is scored on the module-level ``X_new`` / ``y`` with stratified
    10-fold cross-validation repeated 3 times (30 accuracy values in total).

    Returns:
        ``1.0 - mean(accuracy)``, so that minimizing the returned value
        maximizes the cross-validated accuracy.
    """
    # Build the pipeline and apply the sampled hyperparameters.
    pipeline = Pipeline([("transform", FeatureSelection()), ('svc', SVC())])
    pipeline.set_params(**params)

    # Test harness: 10 stratified folds, repeated 3 times, with a fixed seed.
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=123)
    fold_scores = cross_val_score(pipeline, X_new, y, cv=folds, n_jobs=-1, scoring='accuracy')
    mean_accuracy = np.mean(fold_scores)

    # TODO: check the penalization by number of bands!
    # Acc_Penalized = None
    # if pipeline['transform'].selected_features:
    #     feature_idx = pipeline['transform'].getIndex()
    #     Acc_Penalized = 1 - (mean_accuracy / (1 + (feature_idx.sum()/pipeline['transform'].n_features)))

    # The optimizer minimizes, so turn the accuracy into a loss.
    return 1.0 - mean_accuracy


In [None]:
from skopt.callbacks import CheckpointSaver
from datetime import datetime

# The checkpoint callback writes into this folder; create it up front so the
# first checkpoint write does not fail on a fresh checkout.
checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Experiment id: current Unix timestamp truncated to whole seconds.
exp_id = str(int(datetime.now().timestamp()))
checkpoint_saver = CheckpointSaver("{}/{}.pkl".format(checkpoint_dir, exp_id), compress=9) # keyword arguments will be passed to `skopt.dump`

# Run the Bayesian optimization; the callback stores a checkpoint as the search progresses.
result = gp_minimize(evaluate_model, search_space, callback=[checkpoint_saver], random_state=123)

In [None]:
# Summary of the search: the objective is 1 - accuracy, so invert it back.
best_accuracy = 1.0 - result.fun
best_parameters = result.x
print('Best Accuracy: %.3f' % best_accuracy)
print('Best Parameters: %s' % (best_parameters))

# Continue search from checkpoint

In [23]:
from skopt import load
# Restore a previous optimization run from its checkpoint file.
# NOTE(review): checkpoints are presumably pickle-based — only load files you created yourself.
res = load('checkpoints/1633684942.pkl')
# Points that were already evaluated and the objective values observed for them.
x0, y0 = res.x_iters, res.func_vals

In [26]:
# Resume the optimization, warm-started with the evaluations restored from the checkpoint.
warm_start = dict(
    x0=x0,  # already examined values for x
    y0=y0,  # observed values for x0
)
result = gp_minimize(
    evaluate_model,
    search_space,
    callback=[checkpoint_saver],
    random_state=123,
    **warm_start,
)

KeyboardInterrupt: 