In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.base import clone
from sklearn.metrics import balanced_accuracy_score
import pandas as pd
from NRAS import NRAS

In [2]:
def data(file):
    data = pd.read_csv('imbalanced_datasets/%s.csv' %(file), delimiter=',')
    X = np.array(data.iloc[:,:-1])
    y = (np.array(data.iloc[:,-1]))
    return X, y 

In [3]:
datasets = {1:'yeast3',
            2:'yeast1',
            3:'wisconsin',
            4:'vehicle0',
            5:'vehicle1',
            6:'vehicle2',
            7:'vehicle3',
            8:'segment0',
            9:'pima',
            10:'page-blocks0',
            11:'new-thyroid1',
            12:'newthyroid2',
            13:'iris0',
            14:'haberman',
            15:'glass6',
            16:'glass1',
            17:'glass0',
            18:'glass-0-1-2-3_vs_4-5-6',
            19:'ecoli3',
            #20:'ecoli2',
            #21:'ecoli1',
            #22:'ecoli-0_vs_1'
            }

In [4]:
preprocessing = {
	'ros':RandomOverSampler(random_state=1),
	'smote':SMOTE(random_state=1,k_neighbors=5),
	'nras':NRAS(random_state=1,n_neighbors=5),
	'none':None,
}

In [5]:
classificator = {
    'tree_clf':DecisionTreeClassifier(random_state=1),
    'svc_clf':SVC(random_state=1),
}

In [6]:
metrics = {
	'balanced_accuracy':balanced_accuracy_score
}

In [7]:
n_splits = 5
n_repeats = 2
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)

In [8]:
score=np.zeros((len(preprocessing),len(datasets), n_splits*n_repeats, len(classificator), len(metrics)))

In [9]:
for dataset_id, dataset in enumerate(datasets):
    X, y = data(datasets[dataset])
    
    for fold_id, (train, test) in enumerate(rskf.split(X,y)):
        for prepro_id, prepro in enumerate(preprocessing):         
            for cl_id, cl in enumerate(classificator):
                clf = clone(classificator[cl]) 
                if preprocessing[prepro] == None:
                    X_train, y_train = X[train], y[train]
                else:
                    X_train, y_train = preprocessing[prepro].fit_resample(X[train], y[train])
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X[test])
                
                for metric_id, metric in enumerate(metrics):
                    score[prepro_id, dataset_id, fold_id, cl_id, metric_id] = metrics[metric](y[test],y_pred)
    

2022-05-23 10:33:53,342:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-23 10:33:53,469:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-23 10:33:53,835:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-23 10:33:53,955:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-23 10:33:54,345:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-23 10:33:54,498:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-05-23 10:33:54,904:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5

In [21]:
methods=['ROS', 'SMOTE','NRAS','None']
df_svc = pd.DataFrame(index=range(1,23),columns=['ROS-balanced_accuracy','SMOTE-balanced_accuracy','NRAS-balanced_accuracy','None'])
df_tree = df_svc.copy()
for cl_id, cl in enumerate(classificator):
    for dataset_id, dataset in  enumerate(datasets):
        for method_id, method in enumerate(methods):
            for metric_id, metric in enumerate(metrics):
                value=score[method_id, dataset_id, :, cl_id, metric_id]
                if cl_id==0:
                    df_tree.iloc[dataset_id,metric_id+len(metrics)*method_id]=(value.mean())
                else:
                    df_svc.iloc[dataset_id,metric_id+len(metrics)*method_id]=(value.mean())
                #print(f'{cl}, Dataset: {dataset}, Metoda: {method}, Metryka:{metric} {value.mean()}')


In [22]:
df_tree

Unnamed: 0,ROS-balanced_accuracy,SMOTE-balanced_accuracy,NRAS-balanced_accuracy,None
1,0.823674,0.843987,0.807197,0.809896
2,0.655667,0.655484,0.650828,0.656837
3,0.92185,0.93568,0.938618,0.942873
4,0.910455,0.912072,0.90373,0.890651
5,0.651011,0.651502,0.560382,0.631945
6,0.899384,0.903399,0.60381,0.91375
7,0.633472,0.651502,0.5797,0.631945
8,0.984673,0.987336,0.982273,0.985063
9,0.681726,0.672598,0.666783,0.672152
10,0.904533,0.919557,0.898839,0.909962


In [23]:
df_svc

Unnamed: 0,ROS-balanced_accuracy,SMOTE-balanced_accuracy,NRAS-balanced_accuracy,None
1,0.916998,0.916525,0.890436,0.834848
2,0.717276,0.72266,0.708065,0.622891
3,0.972952,0.969723,0.969369,0.972065
4,0.810668,0.808261,0.812215,0.549254
5,0.520091,0.51772,0.493658,0.491826
6,0.708984,0.704736,0.560773,0.532636
7,0.515622,0.51772,0.49784,0.492227
8,0.985841,0.98622,0.984199,0.851138
9,0.720881,0.716937,0.718259,0.690506
10,0.669547,0.684392,0.678821,0.532375
