In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
import sys
sys.path.append(str(pathlib.Path.cwd().parent.parent))
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold

from imblearn.metrics import geometric_mean_score, classification_report_imbalanced
from sklearn.metrics import (recall_score, roc_auc_score, confusion_matrix, precision_score, precision_recall_curve,
                             f1_score, balanced_accuracy_score, accuracy_score, auc)

In [28]:
# Locate the results directory two levels above the current working directory.
home_folder = pathlib.Path.cwd().parent.parent
path = home_folder / 'CSV_results'

# Per-dataset metadata; later cells read its `type` and `dataset` columns.
df_info = pd.read_csv(path / 'dataset_info.csv')

# NOTE(review): only the Adult cross-validation file is loaded here, while
# later cells filter `df` by dataset type — confirm this is the intended input.
df = (
    pd.read_csv(path / 'Cross_validation' / 'cross_validation_adult.csv', index_col=0)
    .reset_index(drop=True)
)

In [30]:
# Map the raw oversampler identifiers found in `df` to short display names.
# NOTE(review): the pairing is purely positional — it relies on the order in
# which the raw names first appear in `df` matching the order of `b`, and `zip`
# silently ignores surplus entries on either side. Verify against the CSV.
a = df.oversampler.unique()
b = ['RandomOverSampler',
     'SMOTE',
     'SMOTENC',
     'SVMSMOTE',
     'ADASYN',
     'BorderlineSMOTE',
     'synthsonic',
     'polynom_fit_SMOTE',
     'Random_SMOTE',
     'NoSMOTE']

# Build the whole mapping first and apply it in a single pass, so a row that
# has just been renamed can never be matched again by a later (old, new) pair.
# Missing values are kept out of the mapping and therefore stay untouched.
rename_map = {old: new for old, new in zip(a, b) if pd.notna(old)}
df['oversampler'] = df['oversampler'].map(lambda name: rename_map.get(name, name))

df

Unnamed: 0,dataset,oversampler,proportion,balanced_accuracy,balanced_accuracy_std,G_mean,G_mean_std,f1,f1_std,precision,precision_std,recall,recall_std,pr_auc,pr_auc_std,runtime,runtime_std
0,Adult,RandomOverSampler,0.4,0.815048,0.003054,0.807922,0.003481,0.725028,0.004009,0.743391,0.001895,0.707564,0.006062,0.826307,0.004788,0.006333,0.000834
1,Adult,RandomOverSampler,0.6,0.830358,0.004753,0.828515,0.004945,0.725817,0.006628,0.682389,0.006203,0.775158,0.007571,0.826238,0.004008,0.007064,6.9e-05
2,Adult,RandomOverSampler,0.8,0.83621,0.002756,0.835887,0.002786,0.720669,0.003887,0.647154,0.00412,0.813034,0.004028,0.825336,0.004963,0.007921,8.4e-05
3,Adult,RandomOverSampler,1.0,0.83924,0.005278,0.839223,0.005262,0.713584,0.006708,0.618202,0.005742,0.843771,0.008184,0.825816,0.005274,0.008967,0.000463
4,Adult,SMOTE,0.4,0.813286,0.004593,0.806126,0.005333,0.721811,0.005715,0.738661,0.003356,0.705779,0.009926,0.824606,0.004847,0.217954,0.007182
5,Adult,SMOTE,0.6,0.826564,0.004401,0.824047,0.004508,0.723856,0.00689,0.689246,0.008363,0.762149,0.005848,0.822298,0.006258,0.225179,0.009261
6,Adult,SMOTE,0.8,0.830627,0.004679,0.82965,0.004798,0.719323,0.006581,0.659942,0.006639,0.790462,0.00709,0.819548,0.005721,0.248615,0.015757
7,Adult,SMOTE,1.0,0.832816,0.005705,0.832573,0.005795,0.713837,0.007486,0.63615,0.006861,0.813163,0.009349,0.816625,0.007653,0.246663,0.005876
8,Adult,SMOTENC,0.4,0.810865,0.003969,0.802804,0.004324,0.720748,0.006138,0.746344,0.006142,0.696851,0.006229,0.825573,0.003972,2.588054,0.134995
9,Adult,SMOTENC,0.6,0.818624,0.002212,0.813529,0.002425,0.723168,0.003092,0.718936,0.003051,0.727459,0.004075,0.82343,0.003319,2.752761,0.058074


In [11]:
def get_best(df, dataset, metric):
    """Return, for one dataset, each oversampler's best row according to ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table containing at least the columns ``dataset``,
        ``oversampler`` and the column named by ``metric``.
    dataset : object
        Value compared for equality against the ``dataset`` column.
    metric : str
        Name of the column to maximise.

    Returns
    -------
    pandas.DataFrame
        One row per oversampler present for ``dataset`` (the row where
        ``metric`` is largest), sorted by ``metric`` in descending order.
    """
    dataset_rows = df[df['dataset'] == dataset]
    best_labels = dataset_rows.groupby('oversampler')[metric].idxmax()

    # Look the labels up in the filtered frame rather than in `df`: if `df`
    # carries duplicate index labels, `df.loc` would also return rows that
    # belong to other datasets.
    return dataset_rows.loc[best_labels].sort_values(metric, ascending=False)

def select_best(df, metric):
    """Collect each oversampler's best row (by ``metric``) for every dataset in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with a ``dataset`` column plus whatever ``get_best``
        requires.
    metric : str
        Name of the column to maximise.

    Returns
    -------
    pandas.DataFrame
        The per-dataset results of ``get_best`` concatenated in order of each
        dataset's first appearance in ``df``. If ``df`` holds no datasets, an
        empty frame with the same columns as ``df`` is returned.
    """
    best_per_dataset = [
        get_best(df, dataset, metric) for dataset in df['dataset'].unique()
    ]

    if not best_per_dataset:
        # pd.concat raises ValueError on an empty list, which happens when an
        # upstream filter matched no dataset at all; return an empty frame.
        return df.iloc[0:0].copy()

    return pd.concat(best_per_dataset)

def ranking_oversampler(df, metric, return_all = False):
    """Average the numeric columns per oversampler and rank by ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with an ``oversampler`` column and a numeric ``metric``
        column. ``proportion`` and ``<metric>_std`` columns are also required
        unless ``return_all`` is true.
    metric : str
        Name of the column used for sorting (descending).
    return_all : bool, default False
        If true, return every averaged numeric column; otherwise only
        ``proportion``, ``metric`` and ``<metric>_std``.

    Returns
    -------
    pandas.DataFrame
        Indexed by oversampler, sorted by mean ``metric`` in descending order.
    """
    # numeric_only=True keeps text columns such as `dataset` out of the mean;
    # without it pandas >= 2.0 raises TypeError on non-numeric columns.
    averaged = (
        df.groupby('oversampler')
        .mean(numeric_only=True)
        .sort_values(metric, ascending=False)
    )

    if return_all:
        return averaged

    return averaged[['proportion', metric, f'{metric}_std']]
    
def final_ranking(df, metric):
    """Rank oversamplers by ``metric`` using only their best row per dataset.

    The best rows are picked with ``select_best`` and then averaged and
    sorted with ``ranking_oversampler``.
    """
    return ranking_oversampler(select_best(df, metric), metric)

In [4]:
# Dataset names grouped by the value of the `type` column in df_info.
cats = df_info.loc[df_info['type'] == 'categorical', 'dataset'].tolist()
nums = df_info.loc[df_info['type'] == 'numerical', 'dataset'].tolist()
mixed = df_info.loc[df_info['type'] == 'mixed', 'dataset'].tolist()

# Categorical datasets: oversampler ranking by PR AUC

In [21]:
# Keep only the rows whose dataset is listed in `cats`, then rank by PR AUC.
dfc = df.loc[df['dataset'].isin(cats)]
final_ranking(dfc, 'pr_auc')

Unnamed: 0_level_0,proportion,pr_auc,pr_auc_std
oversampler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"('polynom_fit_SMOTE', ""{'proportion': 1.0, 'topology': 'star', 'random_state': 5}"")",0.833333,0.834415,0.014458
NoOversampling,0.0,0.829999,0.01157
"('Random_SMOTE', ""{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 5}"")",0.466667,0.829338,0.014724
"SVMSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.633333,0.827487,0.014201
"RandomOverSampler(random_state=5, sampling_strategy=1.0)",0.4,0.826162,0.01239
"SMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.6,0.826036,0.016542
"ADASYN(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.433333,0.82555,0.017131
"BorderlineSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.533333,0.824528,0.014778
"('synthsonic', ""{'proportion': 1.0, 'distinct_threshold': 20, 'do_PCA': True, 'ordering': 'pca'}"")",0.433333,0.821132,0.01516


# Mixed datasets: oversampler ranking by PR AUC

In [22]:
# Keep only the rows whose dataset is listed in `mixed`, then rank by PR AUC.
dfm = df.loc[df['dataset'].isin(mixed)]
final_ranking(dfm, 'pr_auc')

Unnamed: 0_level_0,proportion,pr_auc,pr_auc_std
oversampler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ADASYN(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.581818,0.578692,0.065
"('synthsonic', ""{'proportion': 1.0, 'distinct_threshold': 20, 'do_PCA': False, 'ordering': ''}"")",0.636364,0.57412,0.05693
"SMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.436364,0.573334,0.060993
"('polynom_fit_SMOTE', ""{'proportion': 1.0, 'topology': 'star', 'random_state': 5}"")",0.472727,0.572027,0.061363
"('Random_SMOTE', ""{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 5}"")",0.6,0.570462,0.056368
"SVMSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.454545,0.567902,0.063671
"RandomOverSampler(random_state=5, sampling_strategy=1.0)",0.6,0.567079,0.051507
NoOversampling,0.0,0.5617,0.069286
"BorderlineSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.309091,0.559454,0.066364


# Numerical datasets: oversampler ranking by PR AUC

In [23]:
# Keep only the rows whose dataset is listed in `nums`, then rank by PR AUC.
dfn = df.loc[df['dataset'].isin(nums)]
final_ranking(dfn, 'pr_auc')

Unnamed: 0_level_0,proportion,pr_auc,pr_auc_std
oversampler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"SVMSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.52,0.63225,0.053205
"RandomOverSampler(random_state=5, sampling_strategy=1.0)",0.54,0.631779,0.054252
"('synthsonic', ""{'proportion': 1.0, 'distinct_threshold': 20, 'do_PCA': False, 'ordering': ''}"")",0.7,0.629249,0.050908
"BorderlineSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.5,0.625611,0.061506
"ADASYN(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.36,0.621052,0.05155
"SMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.38,0.620947,0.056602
"('Random_SMOTE', ""{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 5}"")",0.52,0.618087,0.053675
"('polynom_fit_SMOTE', ""{'proportion': 1.0, 'topology': 'star', 'random_state': 5}"")",0.5,0.617412,0.043489
NoOversampling,0.0,0.617302,0.051237
