In [1]:
import numpy as np
import pandas as pd
import smote_variants as sv
from sklearn.base import clone
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, recall_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.metrics import specificity_score
from scipy.stats import ttest_rel
from our_implementations import OurSMOTE

In [2]:
def data(name):
    """Load one dataset file and split it into a feature matrix and labels.

    Reads ``imbalanced_datasets/{name}.dat`` as comma-separated text. Lines
    starting with ``@`` are treated as comments and skipped, and blanks
    following a delimiter are ignored. The file is assumed to contain no
    column-header row (only ``@`` metadata lines before the samples), so every
    remaining line is parsed as a sample.

    Parameters
    ----------
    name : str
        File stem of the dataset inside ``imbalanced_datasets/``.

    Returns
    -------
    X : numpy.ndarray
        Every column except the last one.
    y : numpy.ndarray
        The last column as strings with trailing whitespace stripped.
    """
    frame = pd.read_csv(
        f"imbalanced_datasets/{name}.dat",
        delimiter=',',
        skipinitialspace=True,
        comment='@',
        # Fully commented lines are skipped before header inference, so with
        # the default header handling the first sample would be consumed as
        # column names and silently dropped from the data.
        header=None,
    )
    X = frame.iloc[:, :-1].to_numpy()
    # skipinitialspace only removes leading blanks; trailing ones are removed here.
    y = frame.iloc[:, -1].astype(str).str.rstrip().to_numpy(dtype=str)
    return X, y

In [3]:
def df_to_latex_table(df: list[pd.DataFrame]) -> str:
    tab = df.to_latex(
        columns = prepro_names,
        float_format = "{:0.2f}".format
    )
    return tab

In [4]:
# Dataset file stems, keyed by consecutive integers starting at 0.
datasets = dict(enumerate((
    'ecoli3',
    'yeast3',
    'yeast1',
    'wisconsin',
    'vehicle0',
    'vehicle1',
    'vehicle2',
    'vehicle3',
    'segment0',
    'pima',
    'page-blocks0',
    'new-thyroid1',
    'newthyroid2',
    'iris0',
    'haberman',
    'glass6',
    'glass1',
    'glass0',
    'glass-0-1-2-3_vs_4-5-6',
)))

In [5]:
# Resampling strategies under comparison; 'none' is the no-resampling baseline.
# Insertion order defines the sampler axis of the result arrays.
preprocessing = dict(
    ros=RandomOverSampler(random_state=1),
    smote=SMOTE(random_state=1, k_neighbors=5),
    nras=sv.NRAS(random_state=1, n_neighbors=5),
    ours=OurSMOTE(),
    none=None,
)
# Upper-cased keys, used as table headers.
prepro_names = [key.upper() for key in preprocessing]

In [6]:
# Base classifiers; each one is cloned before fitting so no state is shared.
classificator = dict(
    tree_clf=DecisionTreeClassifier(random_state=1),
    svc_clf=SVC(random_state=1),
)

In [7]:
# Evaluation metrics; insertion order defines the metric axis of the result
# arrays (index 0 is balanced accuracy).
metrics = dict([
    ('balanced_accuracy', balanced_accuracy_score),
    ('specificity', specificity_score),
    ('recall', recall_score),
])

In [8]:
# Cross-validation protocol: n_splits stratified folds, repeated n_repeats
# times, giving n_splits * n_repeats train/test partitions per dataset.
n_splits, n_repeats = 5, 2
rskf = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=1,
)

In [9]:
# Result tensor, one cell per (sampler, dataset, fold, classifier, metric).
score = np.zeros((
    len(preprocessing),     # axis 0: resampling strategy
    len(datasets),          # axis 1: dataset
    n_splits * n_repeats,   # axis 2: cross-validation fold
    len(classificator),     # axis 3: classifier
    len(metrics),           # axis 4: metric
))

In [10]:
%%capture

for dataset_id, dataset in enumerate(datasets):
    X, y = data(datasets[dataset])
    for fold_id, (train, test) in enumerate(rskf.split(X, y)):
        for cl_id, cl in enumerate(classificator):
            for prepro_id, prepro in enumerate(preprocessing):         
                clf = clone(classificator[cl])
                
                if not preprocessing[prepro]:
                    X_resampled, y_resampled = X[train], y[train]
                elif preprocessing[prepro] == 'nras':
                    X_resampled, y_resampled = preprocessing[prepro].sample(X[train], y[train])
                else:
                    X_resampled, y_resampled = preprocessing[prepro].fit_resample(X[train], y[train])

                clf.fit(X_resampled, y_resampled)
                y_pred = clf.predict(X[test])

                for metric_id, metric in enumerate(metrics):
                    if metric == 'specificity' or metric == 'recall':
                        score[prepro_id, dataset_id, fold_id, cl_id, metric_id] = metrics[metric](y[test], y_pred, pos_label='positive')
                    else:
                        score[prepro_id, dataset_id, fold_id, cl_id, metric_id] = metrics[metric](y[test], y_pred)

2022-06-12 19:09:59,998:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-06-12 19:10:00,179:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-06-12 19:10:00,341:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-06-12 19:10:00,491:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-06-12 19:10:00,637:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-06-12 19:10:00,789:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5, 'n_jobs': 1, 'random_state': 1}")
2022-06-12 19:10:00,936:INFO:NRAS: Running sampling via ('NRAS', "{'proportion': 1.0, 'n_neighbors': 5, 't': 0.5

In [11]:
# Persist the raw score tensor so later analysis does not need a re-run.
np.save(file='score', arr=score)

In [12]:
%%capture

avg_across_folds = np.mean(score, axis=2)
latex_score_tables = []

for cl_id, cl in enumerate(classificator):
    for metric_id, metric in enumerate(metrics):
        df = pd.DataFrame(avg_across_folds[:,:,cl_id,metric_id].T)
        df.columns = prepro_names
        df.index = datasets.values()
        df['Best'] = df.idxmax(axis=1)
        latex_score_tables.append(f"{cl}, {metric}\n")
        latex_score_tables.append(df_to_latex_table(df))

In [13]:
# Dump all score tables, each preceded by its caption line, into one text file.
with open("latex_score_tables.txt", "w") as out_file:
    out_file.write("".join(latex_score_tables))

In [14]:
# Significance level for the pairwise comparisons.
alfa = 0.05
prepro_len = len(preprocessing)

# One paired t-test per (classifier, dataset, sampler i, sampler j).
pairwise_shape = (len(classificator), len(datasets), prepro_len, prepro_len)
t_statistic = np.zeros(pairwise_shape)
p_value = np.zeros(pairwise_shape)

# np.ndindex visits the indices in the same order as four nested range loops.
# Pairs with i == j (a sampler compared against itself) are included.
for cl_id, dataset_id, i, j in np.ndindex(*pairwise_shape):
    # Metric index 0 is the first entry of `metrics`, i.e. balanced accuracy;
    # the samples being paired are the per-fold scores.
    fold_scores_i = score[i, dataset_id, :, cl_id, 0]
    fold_scores_j = score[j, dataset_id, :, cl_id, 0]
    outcome = ttest_rel(fold_scores_i, fold_scores_j)
    t_statistic[cl_id, dataset_id, i, j], p_value[cl_id, dataset_id, i, j] = outcome

In [15]:
# 1.0 where sampler i scored higher on average than sampler j (positive
# t statistic), 0.0 everywhere else, including NaN entries.
advantage = (t_statistic > 0).astype(float)

In [16]:
# 1.0 where the difference is significant at level `alfa`, 0.0 everywhere
# else, including NaN entries.
significance = np.where(p_value <= alfa, 1.0, 0.0)

In [17]:
# 1.0 only where sampler i is both better than sampler j and significantly so.
stat_better = np.multiply(significance, advantage)

In [18]:
%%capture

latex_stats_tables = []
 
for cl_id, cl in enumerate(classificator):
    for dataset_id, dataset in enumerate(datasets):
        df = pd.DataFrame(stat_better[cl_id, dataset_id])
        df.index = df.columns = prepro_names
        latex_stats_tables.append(f"{cl}, {datasets[dataset]}\n")
        latex_stats_tables.append(df_to_latex_table(df))

In [19]:
# Dump all significance tables, each preceded by its caption line, into one text file.
with open("latex_stats_tables.txt", "w") as out_file:
    out_file.write("".join(latex_stats_tables))