In [1]:
import os
import pandas as pd
import pickle
import warnings

from tqdm.auto import tqdm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils.fixes import _joblib_parallel_args

from src.model.SupportSubsetEstimator import *
from src.model.moess import *
from src.utils import *


warnings.filterwarnings('ignore')

experiment = 'r2' 
# Experiment paths

print(f'Experiment: {experiment}\n')

results_folder = f'results/{experiment}'

os.makedirs(results_folder, exist_ok=True)

data = pd.read_parquet(f'../data/prep_real_data/{experiment}.parquet')

random_state = 1234

# Preprocessing
scaler = StandardScaler()
X = scaler.fit_transform(data.drop(columns=['y']))
y = data.y.values

# Calcular particiones
skf = StratifiedKFold(n_splits=10, random_state=random_state, shuffle=True)
ss_estimator = SupportSubsetEstimator()
ss_idx = []
train_idx = []
test_idx = []

for train_index, test_index in tqdm(list(skf.split(X, y)), desc="Estimating Support Subsets"):
    ss_estimator.fit(X[train_index], y[train_index])
    train_idx.append(train_index)
    test_idx.append(test_index)
    ss_idx.append(ss_estimator.supportsubset)

folds = list(zip(train_idx, test_idx, ss_idx))

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Experiment: r2



Estimating Support Subsets: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.06it/s]


In [2]:
# comprobar que entra bien el parámetro de active_set

class GridSearch_moess():

    def __init__(self,
                 method,
                 grid_params,
                 scoring=scaled_mcc,
                 folds=None,
                 n_jobs=-1,
                 random_state=1234,
                 kwargs=None,
                 verbose=True):

        self.method = method
        self.grid_params = grid_params
        self.scoring = scoring
        self.k_fold_splits = folds
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.kwargs = kwargs
        self.verbose = verbose


    def _eval(self, X, y, params):

        scores = []

        if self.kwargs is None:
            clf = self.method(**params)
        else:
            clf = self.method(**params, **self.kwargs)

        for train_index, test_index, ss_index in self.k_fold_splits:

            clf.fit(X[train_index], y[train_index], active_indexes=ss_index)

            scores.append(self.scoring(
                y[test_index], clf.predict(X[test_index])))

        return [clf, params, scores, np.mean(scores)]

    def fit(self, X, y):
        
        grid_params =list(ParameterGrid(self.grid_params))

        if self.verbose:
            with tqdm_joblib(tqdm(desc="Searching best hyperparameters", total=len(grid_params))) as progress_bar:
                self.scoring_results_ = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))(
                    delayed(self._eval)(X, y, params)for params in grid_params)
        else:
            self.scoring_results_ = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))(
                    delayed(self._eval)(X, y, params)for params in grid_params)

        self.best_index_ = np.argmax([result[3]
                                      for result in self.scoring_results_])

        self.best_estimator_ = self.scoring_results_[self.best_index_][0]
        self.best_params_ = self.scoring_results_[self.best_index_][1]
        self.best_score_ = np.mean(self.scoring_results_[self.best_index_][2]), np.std(
            self.scoring_results_[self.best_index_][2])

In [3]:
# hyperparameters gridsearch
random_state = 1234

grid_params = {
    'wrab': [True, False],
    'lam': [1, 3, 5],
    'prop_sample': [0.10, 0.20, 0.30],
    'n_learners': [10, 20, 30],
    'random_state': [random_state]
}

# MOESS-kNN 

kwargs = {'method':KNeighborsClassifier, 'params':{'n_neighbors' : list(range(1, 13, 2))}}

ensemble = MOESS

ensemble_grid = GridSearch_moess(ensemble, grid_params, scoring=scaled_mcc, folds=folds, n_jobs=-1, kwargs=kwargs)

print('MOESS knn: \n')

ensemble_grid.fit(X, y)

print(f'params: {ensemble_grid.best_params_}, score: {ensemble_grid.best_score_}\n')

MOESS knn: 



Searching best hyperparameters: 21654it [02:47, 129.44it/s]                                                                                                     

params: {'lam': 1, 'n_learners': 10, 'prop_sample': 0.1, 'random_state': 1234, 'wrab': False}, score: (0.7651230198161892, 0.07746415411216032)






In [6]:
ensemble_grid.scoring_results_

[[<src.model.moess.MOESS at 0x7fb250228070>,
  {'lam': 1,
   'n_learners': 10,
   'prop_sample': 0.1,
   'random_state': 1234,
   'wrab': True},
  [0.6666666666666666,
   0.7886751345948129,
   0.5487950036474266,
   0.5487950036474266,
   0.7988071523335984,
   0.6889822365046137,
   0.16458980337503154,
   0.7151657414559676,
   0.6443375672974064,
   0.5],
  0.606481430952295],
 [<src.model.moess.MOESS at 0x7fb2502283d0>,
  {'lam': 1,
   'n_learners': 10,
   'prop_sample': 0.1,
   'random_state': 1234,
   'wrab': False},
  [0.753546276418555,
   0.8333333333333333,
   0.7988071523335984,
   0.6857142857142857,
   0.7988071523335984,
   0.9225771273642582,
   0.7333333333333334,
   0.7795084971874737,
   0.6304372986874878,
   0.7151657414559676],
  0.7651230198161892],
 [<src.model.moess.MOESS at 0x7fb2502287c0>,
  {'lam': 1,
   'n_learners': 10,
   'prop_sample': 0.2,
   'random_state': 1234,
   'wrab': True},
  [0.753546276418555,
   0.7886751345948129,
   0.7439750182371333,
   0