In [1]:
import numpy as np
import pandas as pd
from imblearn.metrics import geometric_mean_score, specificity_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from scipy.stats import rankdata
from sklearn.base import clone
from sklearn.metrics import balanced_accuracy_score, recall_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from NRAS import NRAS

In [2]:
def data(file, data_dir='imbalanced_datasets'):
    """Load one CSV dataset and split it into a feature matrix and labels.

    Parameters
    ----------
    file : str
        Dataset name, i.e. the CSV file name without the ``.csv`` extension.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to
        ``'imbalanced_datasets'``, the location that used to be hard-coded,
        so existing ``data(name)`` calls behave as before.

    Returns
    -------
    X : numpy.ndarray
        All columns of the file except the last one.
    y : numpy.ndarray
        The last column of the file.
    """
    # Named `frame` so the local does not shadow this function's own name.
    frame = pd.read_csv(f'{data_dir}/{file}.csv', delimiter=',')
    X = np.array(frame.iloc[:, :-1])
    y = np.array(frame.iloc[:, -1])
    return X, y

In [3]:
# Dataset names (CSV file stems) keyed by a 1-based id, in listing order.
# The last three entries are currently disabled; re-enabling them in place
# gives them ids 20, 21 and 22.
datasets = dict(enumerate([
    'yeast3',
    'yeast1',
    'wisconsin',
    'vehicle0',
    'vehicle1',
    'vehicle2',
    'vehicle3',
    'segment0',
    'pima',
    'page-blocks0',
    'new-thyroid1',
    'newthyroid2',
    'iris0',
    'haberman',
    'glass6',
    'glass1',
    'glass0',
    'glass-0-1-2-3_vs_4-5-6',
    'ecoli3',
    # 'ecoli2',
    # 'ecoli1',
    # 'ecoli-0_vs_1',
], start=1))

In [4]:
# Resampling strategies under comparison, keyed by a short name.
# The 'none' entry holds None instead of a sampler object.
preprocessing = dict(
    ros=RandomOverSampler(random_state=1),
    smote=SMOTE(random_state=1, k_neighbors=5),
    nras=NRAS(random_state=1, n_neighbors=5),
    none=None,
)

In [5]:
# Base classifiers, keyed by a short name; both use a fixed random_state.
classificator = dict(
    tree_clf=DecisionTreeClassifier(random_state=1),
    svc_clf=SVC(random_state=1),
)

In [6]:
# Scoring functions, keyed by the name used when printing result tables.
metrics = dict(
    balanced_accuracy=balanced_accuracy_score,
    gem=geometric_mean_score,
    specificity=specificity_score,
    recall=recall_score,
)

In [7]:
# Repeated stratified k-fold: n_splits folds, repeated n_repeats times,
# i.e. n_splits * n_repeats train/test splits per dataset.
n_splits, n_repeats = 5, 2
rskf = RepeatedStratifiedKFold(
    n_repeats=n_repeats,
    n_splits=n_splits,
    random_state=1,
)

In [8]:
# Result buffer, one axis per experimental factor, in this order:
# (resampler, dataset, fold, classifier, metric).
score = np.zeros(
    (
        len(preprocessing),
        len(datasets),
        n_splits * n_repeats,
        len(classificator),
        len(metrics),
    )
)

In [9]:
# Evaluate every (dataset, fold, resampler, classifier) combination and store
# each metric in `score`, indexed as
# [resampler, dataset, fold, classifier, metric].
for dataset_id, dataset_name in enumerate(datasets.values()):
    X, y = data(dataset_name)

    for fold_id, (train, test) in enumerate(rskf.split(X, y)):
        for prepro_id, sampler in enumerate(preprocessing.values()):
            # Resample once per (fold, resampler) instead of once per
            # classifier: the resampled training set does not depend on the
            # classifier, so all classifiers are fitted on the same data and
            # the sampler is not re-run needlessly.
            # Only the training part is resampled; the test part stays as is.
            if sampler is None:
                X_train, y_train = X[train], y[train]
            else:
                X_train, y_train = sampler.fit_resample(X[train], y[train])

            for cl_id, base_clf in enumerate(classificator.values()):
                # Work on an unfitted copy so the template estimators in
                # `classificator` are never fitted themselves.
                clf = clone(base_clf)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X[test])

                for metric_id, (metric, metric_fn) in enumerate(metrics.items()):
                    if metric in ('specificity', 'recall'):
                        # These two metrics are reported for a single class.
                        # NOTE(review): the leading space in ' positive'
                        # presumably matches the raw labels in the CSV files;
                        # confirm against the data.
                        value = metric_fn(y[test], y_pred, pos_label=' positive')
                    else:
                        value = metric_fn(y[test], y_pred)
                    score[prepro_id, dataset_id, fold_id, cl_id, metric_id] = value

2022-05-28 12:16:06,385:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-28 12:16:06,535:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-28 12:16:07,036:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-28 12:16:07,156:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-28 12:16:07,555:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-28 12:16:07,659:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-28 12:16:08,115:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5

In [11]:
# Average over the fold axis (axis 2 of `score`); the remaining axes are
# (resampler, dataset, classifier, metric). The array is also saved to disk
# with np.save under the name 'results'.
results = np.mean(score, axis=2)
np.save('results', results)

# Column labels are derived from the resampler keys so they always follow the
# order of `preprocessing` (this also replaces the misspelt 'Nran' label with
# 'Nras').
method_labels = [name.capitalize() for name in preprocessing]

# One table per (classifier, metric): rows are datasets, columns are
# resamplers, and 'best' names the column with the highest mean score.
for cl_id, cl in enumerate(classificator):
    for metric_id, metric in enumerate(metrics):
        table = pd.DataFrame(
            results[:, :, cl_id, metric_id].T,
            index=pd.Index(list(datasets.values()), name='Dataset'),
            columns=method_labels,
        )
        table['best'] = table.idxmax(axis=1)
        print(f'{cl}, {metric}\n {table}\n')

tree_clf, balanced_accuracy
                              Ros     Smote      Nran      None   best
Dataset                                                              
yeast3                  0.823674  0.843987  0.807197  0.809896  Smote
yeast1                  0.655667  0.655484  0.650828  0.656837   None
wisconsin               0.921850  0.935680  0.938618  0.942873   None
vehicle0                0.910455  0.912072  0.903730  0.890651  Smote
vehicle1                0.673010  0.679167  0.661568  0.655726  Smote
vehicle2                0.944430  0.946064  0.944135  0.938275  Smote
vehicle3                0.685612  0.689698  0.642785  0.696976   None
segment0                0.984673  0.987336  0.981137  0.985063  Smote
pima                    0.681726  0.672598  0.673030  0.672152    Ros
page-blocks0            0.904533  0.919557  0.898298  0.909962  Smote
new-thyroid1            0.918770  0.967222  0.935992  0.910198  Smote
newthyroid2             0.941667  0.943056  0.934524  0.91166

In [37]:
# Column labels are derived from the resampler keys so they always follow the
# order of `preprocessing` (this also replaces the misspelt 'Nran' label with
# 'Nras').
method_labels = [name.capitalize() for name in preprocessing]

# For every (classifier, metric) pair, rank the resamplers within each dataset
# and report the mean rank per resampler.
for cl_id, cl in enumerate(classificator):
    for metric_id, metric in enumerate(metrics):
        # rankdata ranks in ascending order and averages ties by default, so
        # a LARGER rank means a HIGHER score here.
        ranks = [
            rankdata(results[:, dataset_id, cl_id, metric_id])
            for dataset_id in range(len(datasets))
        ]
        # Index by the dataset ids taken from `datasets` rather than a
        # hard-coded range, so the table stays valid when datasets are added
        # or removed.
        table = pd.DataFrame(
            ranks,
            index=pd.Index(list(datasets), name='Dataset'),
            columns=method_labels,
        )
        print(f'{cl}, {metric}\n {table}')
        mean_ranks = np.mean(ranks, axis=0)
        print("Mean ranks:\n", mean_ranks)

tree_clf, balanced_accuracy
          Ros  Smote  Nran  None
Dataset                        
1        3.0    4.0   1.0   2.0
2        3.0    2.0   1.0   4.0
3        1.0    2.0   3.0   4.0
4        3.0    4.0   2.0   1.0
5        3.0    4.0   2.0   1.0
6        3.0    4.0   2.0   1.0
7        2.0    3.0   1.0   4.0
8        2.0    4.0   1.0   3.0
9        4.0    2.0   3.0   1.0
10       2.0    4.0   1.0   3.0
11       2.0    4.0   3.0   1.0
12       3.0    4.0   2.0   1.0
13       2.5    2.5   2.5   2.5
14       1.0    2.0   3.0   4.0
15       3.0    4.0   2.0   1.0
16       4.0    2.0   1.0   3.0
17       1.0    2.0   3.0   4.0
18       2.0    3.0   1.0   4.0
19       3.0    1.0   2.0   4.0
Mean ranks:
 [2.5        3.02631579 1.92105263 2.55263158]
tree_clf, gem
          Ros  Smote  Nran  None
Dataset                        
1        3.0    4.0   1.0   2.0
2        2.0    4.0   1.0   3.0
3        1.0    2.0   3.0   4.0
4        3.0    4.0   2.0   1.0
5        3.0    4.0   1.0   2.0
6