# Evaluation of oversamplers with a set of classifiers on a set of datasets

In this notebook, we give an example of evaluating multiple oversamplers on multiple datasets with multiple classifiers. 

In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import smote_variants as sv

import imbalanced_databases as imbd

2022-08-14 19:55:00.175318: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-14 19:55:00.179542: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-14 19:55:00.179556: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Setting the cache_path which is used for caching during the evaluation.
# The directory is placed under the current user's home folder.

cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True creates the directory only when it is missing and does not
# raise if it already exists, so no separate (race-prone) existence check
# is needed before the call.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# Two datasets from imbalanced_databases (glass0 and yeast1), each obtained
# by calling its load function

datasets = [
    imbd.load_glass0(),
    imbd.load_yeast1(),
]

In [4]:
# Classifiers used for the evaluation, each specified as a
# (module name, class name, parameter dictionary) tuple

classifiers = [
    ('sklearn.neighbors', 'KNeighborsClassifier', {}),
    ('sklearn.tree', 'DecisionTreeClassifier', {}),
]

In [5]:
# Oversamplers to be evaluated, each specified as a
# (module name, class name, parameter dictionary) tuple

oversamplers = [
    ('smote_variants', 'SMOTE_ENN', {}),
    ('smote_variants', 'NEATER', {}),
    ('smote_variants', 'Lee', {}),
]

In [6]:
# Executing the evaluation of every specified oversampler with every
# specified classifier on every dataset, using 2 parallel jobs (n_jobs=2).
# The cache_path directory is used for caching during the evaluation.
# Note: no limit on the number of parameter combinations is passed in this
# call; each oversampler is specified above with an empty parameter dict.

results = sv.evaluation.evaluate_oversamplers(
    datasets=datasets,
    oversamplers=oversamplers,
    classifiers=classifiers,
    cache_path=cache_path,
    n_jobs=2,
)

2022-08-14 19:55:04.235335: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-14 19:55:04.238994: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-14 19:55:04.239035: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-08-14 19:55:04.248881: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-14 19:55:04.2526

In [7]:
# Display the evaluation results returned by evaluate_oversamplers as the
# cell output
results

Unnamed: 0,database,oversampler,classifier,oversampler_params_key,classifier_module,classifier_params_key,acc_mean,sens_mean,spec_mean,ppv_mean,...,ltn_std,lp_std,ln_std,uc_std,informedness_std,markedness_std,p_top20_std,brier_std,log_loss_std,auc_std
0,glass0,Lee,DecisionTreeClassifier,"{'proportion': 1.0, 'n_neighbors': 5, 'nn_para...",sklearn.tree,{},0.78289,0.7,0.82303,0.6623,...,8.83125,0.138899,0.029599,0.024265,0.163764,0.133311,0.0,0.065268,2.254307,0.081882
1,glass0,Lee,KNeighborsClassifier,"{'proportion': 1.0, 'n_neighbors': 5, 'nn_para...",sklearn.neighbors,{},0.766113,0.764286,0.766749,0.629884,...,11.387851,0.138899,0.029599,0.023711,0.144836,0.138675,0.0,0.039447,1.451721,0.042586
2,yeast1,Lee,DecisionTreeClassifier,"{'proportion': 1.0, 'n_neighbors': 5, 'nn_para...",sklearn.tree,{},0.701804,0.547592,0.764455,0.487282,...,38.359694,0.224269,0.300052,0.003819,0.065679,0.063816,0.048895,0.029393,1.015217,0.03284
3,yeast1,Lee,KNeighborsClassifier,"{'proportion': 1.0, 'n_neighbors': 5, 'nn_para...",sklearn.neighbors,{},0.7227,0.648016,0.753081,0.515712,...,20.315236,0.224269,0.300052,0.004181,0.06338,0.050955,0.048895,0.018357,0.663763,0.033304


In [8]:
# The results are arranged in a pandas DataFrame; printing its column index
# lists the identifying columns (database, oversampler, classifier, ...)
# followed by the '*_mean' and '*_std' columns of each performance score:

print(results.columns)

Index(['database', 'oversampler', 'classifier', 'oversampler_params_key',
       'classifier_module', 'classifier_params_key', 'acc_mean', 'sens_mean',
       'spec_mean', 'ppv_mean', 'npv_mean', 'fpr_mean', 'fdr_mean', 'fnr_mean',
       'bacc_mean', 'gacc_mean', 'f1_mean', 'mcc_mean', 'l_mean', 'ltp_mean',
       'lfp_mean', 'lfn_mean', 'ltn_mean', 'lp_mean', 'ln_mean', 'uc_mean',
       'informedness_mean', 'markedness_mean', 'p_top20_mean', 'brier_mean',
       'log_loss_mean', 'auc_mean', 'acc_std', 'sens_std', 'spec_std',
       'ppv_std', 'npv_std', 'fpr_std', 'fdr_std', 'fnr_std', 'bacc_std',
       'gacc_std', 'f1_std', 'mcc_std', 'l_std', 'ltp_std', 'lfp_std',
       'lfn_std', 'ltn_std', 'lp_std', 'ln_std', 'uc_std', 'informedness_std',
       'markedness_std', 'p_top20_std', 'brier_std', 'log_loss_std',
       'auc_std'],
      dtype='object')


In [9]:
# The results can be processed according to the requirements of the analysis;
# here the DataFrame is simply printed in its plain-text form

print(results)

  database oversampler              classifier  \
0   glass0         Lee  DecisionTreeClassifier   
1   glass0         Lee    KNeighborsClassifier   
2   yeast1         Lee  DecisionTreeClassifier   
3   yeast1         Lee    KNeighborsClassifier   

                              oversampler_params_key  classifier_module  \
0  {'proportion': 1.0, 'n_neighbors': 5, 'nn_para...       sklearn.tree   
1  {'proportion': 1.0, 'n_neighbors': 5, 'nn_para...  sklearn.neighbors   
2  {'proportion': 1.0, 'n_neighbors': 5, 'nn_para...       sklearn.tree   
3  {'proportion': 1.0, 'n_neighbors': 5, 'nn_para...  sklearn.neighbors   

  classifier_params_key  acc_mean  sens_mean  spec_mean  ppv_mean  ...  \
0                    {}  0.782890   0.700000   0.823030  0.662300  ...   
1                    {}  0.766113   0.764286   0.766749  0.629884  ...   
2                    {}  0.701804   0.547592   0.764455  0.487282  ...   
3                    {}  0.722700   0.648016   0.753081  0.515712  ...   

  