In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
import sys
sys.path.append(str(pathlib.Path.cwd().parent.parent))
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold

from imblearn.metrics import geometric_mean_score, classification_report_imbalanced
from sklearn.metrics import (recall_score, roc_auc_score, confusion_matrix, precision_score, precision_recall_curve,
                             f1_score, balanced_accuracy_score, accuracy_score, auc)

In [2]:
# Results are stored in 'CSV_results', two directory levels above the
# notebook's working directory.
home_folder = pathlib.Path.cwd().parent.parent
path = home_folder / 'CSV_results'

# Per-dataset metadata, and the cross-validation scores (the first CSV column
# of the scores file is used as the index).
df_info = pd.read_csv(path / 'dataset_info.csv')
df = pd.read_csv(path / 'Cross_validation' / 'cross_val_res.csv', index_col=0)

In [11]:
def get_best(df, dataset, metric) :
    """Get the best-scoring row per oversampler for a specific dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with at least the columns 'dataset', 'oversampler'
        and `metric`.
    dataset : value compared against the 'dataset' column
        Dataset whose rows are considered.
    metric : str
        Name of the score column to maximise.

    Returns
    -------
    pandas.DataFrame
        One row per oversampler (the row with the highest `metric` within
        `dataset`), sorted by `metric` in descending order. Original index
        labels are kept. Rows with a missing `metric` are ignored, so an
        oversampler without any valid score does not appear in the result.
    """
    # Keep only this dataset's rows that actually have a score; NaN scores can
    # never be a maximum and an all-NaN group has no well-defined best row.
    subset = df[(df['dataset'] == dataset) & df[metric].notna()]

    # Select by position inside the subset rather than by index label: labels
    # may repeat across datasets, in which case a label-based lookup on the
    # full frame would also return rows belonging to other datasets.
    scores = subset[metric].reset_index(drop=True)
    groups = subset['oversampler'].reset_index(drop=True)
    best_positions = scores.groupby(groups).idxmax()

    best_rows = subset.iloc[best_positions.astype('int64').to_numpy()]

    return best_rows.sort_values(metric, ascending=False)

def select_best(df, metric) :
    """Collect the best row per oversampler for every dataset in `df`.

    Calls `get_best` once per distinct value of the 'dataset' column and
    stacks the per-dataset results into a single frame.
    """
    winners = [get_best(df, name, metric) for name in df['dataset'].unique()]

    return pd.concat(winners)

def ranking_oversampler(df, metric, return_all = False) :
    """Rank oversamplers by their mean score.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with an 'oversampler' column and numeric score columns.
    metric : str
        Column whose per-oversampler mean defines the ranking.
    return_all : bool, default False
        If True, return the means of every numeric column. Otherwise return
        only 'proportion', `metric` and f'{metric}_std'.

    Returns
    -------
    pandas.DataFrame
        Indexed by oversampler, sorted by mean `metric` in descending order.

    Raises
    ------
    KeyError
        If `metric` is missing, or, when `return_all` is False, if
        'proportion' or f'{metric}_std' is missing.
    """
    # numeric_only=True: text columns (e.g. 'dataset') cannot be averaged and
    # make a plain .mean() raise TypeError on pandas >= 2.0.
    means = df.groupby('oversampler').mean(numeric_only=True)
    ranked = means.sort_values(metric, ascending=False)

    if return_all :
        return ranked

    # Only require the summary columns when they are actually requested.
    return ranked[['proportion', metric, f'{metric}_std']]
    
def final_ranking(df, metric) :
    "rank oversamplers by mean score, using their best row on each dataset"

    # First keep, per dataset, the best row of every oversampler ...
    best_per_dataset = select_best(df, metric)

    # ... then average those rows per oversampler and sort by the metric.
    ranking = ranking_oversampler(best_per_dataset, metric)

    return ranking

In [4]:
# Dataset names grouped by the 'type' recorded in df_info.
cats = df_info.loc[df_info['type'] == 'categorical', 'dataset'].tolist()
nums = df_info.loc[df_info['type'] == 'numerical', 'dataset'].tolist()
mixed = df_info.loc[df_info['type'] == 'mixed', 'dataset'].tolist()

# categorical

In [21]:
# Restrict the results to the categorical datasets and rank oversamplers by PR-AUC.
is_categorical = df['dataset'].isin(cats)
dfc = df.loc[is_categorical]
final_ranking(dfc, 'pr_auc')

Unnamed: 0_level_0,proportion,pr_auc,pr_auc_std
oversampler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"('polynom_fit_SMOTE', ""{'proportion': 1.0, 'topology': 'star', 'random_state': 5}"")",0.833333,0.834415,0.014458
NoOversampling,0.0,0.829999,0.01157
"('Random_SMOTE', ""{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 5}"")",0.466667,0.829338,0.014724
"SVMSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.633333,0.827487,0.014201
"RandomOverSampler(random_state=5, sampling_strategy=1.0)",0.4,0.826162,0.01239
"SMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.6,0.826036,0.016542
"ADASYN(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.433333,0.82555,0.017131
"BorderlineSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.533333,0.824528,0.014778
"('synthsonic', ""{'proportion': 1.0, 'distinct_threshold': 20, 'do_PCA': True, 'ordering': 'pca'}"")",0.433333,0.821132,0.01516


# mixed

In [22]:
# Restrict the results to the mixed-type datasets and rank oversamplers by PR-AUC.
is_mixed = df['dataset'].isin(mixed)
dfm = df.loc[is_mixed]
final_ranking(dfm, 'pr_auc')

Unnamed: 0_level_0,proportion,pr_auc,pr_auc_std
oversampler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ADASYN(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.581818,0.578692,0.065
"('synthsonic', ""{'proportion': 1.0, 'distinct_threshold': 20, 'do_PCA': False, 'ordering': ''}"")",0.636364,0.57412,0.05693
"SMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.436364,0.573334,0.060993
"('polynom_fit_SMOTE', ""{'proportion': 1.0, 'topology': 'star', 'random_state': 5}"")",0.472727,0.572027,0.061363
"('Random_SMOTE', ""{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 5}"")",0.6,0.570462,0.056368
"SVMSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.454545,0.567902,0.063671
"RandomOverSampler(random_state=5, sampling_strategy=1.0)",0.6,0.567079,0.051507
NoOversampling,0.0,0.5617,0.069286
"BorderlineSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.309091,0.559454,0.066364


# numerical

In [23]:
# Restrict the results to the numerical datasets and rank oversamplers by PR-AUC.
is_numerical = df['dataset'].isin(nums)
dfn = df.loc[is_numerical]
final_ranking(dfn, 'pr_auc')

Unnamed: 0_level_0,proportion,pr_auc,pr_auc_std
oversampler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"SVMSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.52,0.63225,0.053205
"RandomOverSampler(random_state=5, sampling_strategy=1.0)",0.54,0.631779,0.054252
"('synthsonic', ""{'proportion': 1.0, 'distinct_threshold': 20, 'do_PCA': False, 'ordering': ''}"")",0.7,0.629249,0.050908
"BorderlineSMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.5,0.625611,0.061506
"ADASYN(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.36,0.621052,0.05155
"SMOTE(n_jobs=-1, random_state=5, sampling_strategy=1.0)",0.38,0.620947,0.056602
"('Random_SMOTE', ""{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 5}"")",0.52,0.618087,0.053675
"('polynom_fit_SMOTE', ""{'proportion': 1.0, 'topology': 'star', 'random_state': 5}"")",0.5,0.617412,0.043489
NoOversampling,0.0,0.617302,0.051237
