# DAB-SMOTE INITIAL BENCHMARKS #
## Authors: Unai Lalana Morales & José Antonio Sanz Delgado ##

### Imports ###

In [1]:
#System imports
import sys
import os

# Make the project's `classes` directory (two levels above the current working
# directory) importable. The path components are passed to os.path.join separately
# instead of as a single '..\\..' literal, so the path resolves on every OS and no
# invalid backslash escape appears in the source.
_classes_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'classes'))
# Guard against duplicate entries when this cell is re-run in the same kernel.
if _classes_dir not in sys.path:
    sys.path.append(_classes_dir)

#Third-party imports
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import f1_score

In [3]:
%load_ext autoreload
%autoreload 2
import DAB_SMOTE
from dataset import dataReading

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Functions ###

In [4]:
def highlight_max(row):
    """Build per-cell CSS strings that highlight the best score in a row.

    The first element of ``row`` is skipped (it always receives an empty style);
    the remaining elements are compared against their maximum.

    Args:
        row: Row whose first element is a label and whose remaining elements
            are the scores to compare. Must support slicing and element-wise
            ``==`` (e.g. a pandas Series).

    Returns:
        A list with one CSS string per element of ``row``. Cells equal to the
        maximum get a bold style with a green background when the maximum is
        unique, or an orange background when it is shared; all other cells get
        an empty string.
    """
    scores = row[1:]
    best = max(scores)
    # A single occurrence of the best score means there is one clear winner.
    color = 'green' if (scores == best).sum() == 1 else 'orange'
    winner_css = f'background-color: {color}; font-weight: bold;'
    return [''] + [winner_css if score == best else '' for score in scores]

In [5]:
from tqdm import tqdm

def _load_folds(dataset_dir):
    """Read the paired train/test partitions found in one dataset directory.

    Every file whose name ends in ``tra.dat`` is read with ``dataReading`` and
    paired with the file in the same directory that has the same prefix and
    ends in ``tst.dat``. Files are visited in sorted order because
    ``os.listdir`` gives no ordering guarantee.

    Returns:
        Two equally long lists ``(train, test)`` of ``dataReading`` results.
    """
    train_suffix = "tra.dat"
    train, test = [], []
    for file_name in sorted(os.listdir(dataset_dir)):
        if not file_name.endswith(train_suffix):
            continue
        prefix = file_name[:-len(train_suffix)]
        train.append(dataReading(os.path.join(dataset_dir, file_name)))
        test.append(dataReading(os.path.join(dataset_dir, prefix + "tst.dat")))
    return train, test


def benchmark(root_path, param_name, methods, fixed_params):
    """Compare several values of one DAB_SMOTE parameter across a set of datasets.

    Each sub-directory of ``root_path`` is treated as one dataset. For every
    value in ``methods`` and every train/test partition of the dataset, the
    training data is resampled with ``DAB_SMOTE``, a logistic regression is
    fitted on the resampled data, and the F1 score on the test partition is
    computed with ``pos_label=0``. The reported value is the mean F1 over the
    partitions.
    # NOTE(review): label 0 is presumably the minority class; confirm against
    # the encoding produced by ``dataReading``.

    Args:
        root_path: Directory containing one sub-directory per dataset.
        param_name: Name of the ``DAB_SMOTE`` keyword argument being varied.
        methods: Values to try for ``param_name``; each becomes a result column.
        fixed_params: Extra keyword arguments passed unchanged to ``DAB_SMOTE``.

    Returns:
        A tuple ``(dataframe_names, df_methods)``: the list of dataset names
        (sorted), and a DataFrame with a ``dataset`` column followed by one
        mean-F1 column per entry of ``methods``. A dataset without any
        partition gets ``NaN`` in every method column.
    """
    # Sort so that the row order does not depend on the file system.
    dataframe_names = sorted(
        entry for entry in os.listdir(root_path)
        if os.path.isdir(os.path.join(root_path, entry))
    )
    results_per_method = {method: [] for method in methods}

    for dataset_name in tqdm(dataframe_names, desc="Processing datasets"):
        train, test = _load_folds(os.path.join(root_path, dataset_name))

        for method in methods:
            f1_scores = []
            for train_fold, test_fold in zip(train, test):
                # A fresh sampler and classifier per fold keeps the folds independent.
                dab = DAB_SMOTE.DAB_SMOTE(**{param_name: method}, **fixed_params)
                newX, newY = dab.fit_resample(train_fold.data, train_fold.target)
                clf = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000)
                clf.fit(newX, newY)
                predictions = clf.predict(test_fold.data)
                f1_scores.append(f1_score(np.array(test_fold.target), predictions, pos_label=0))
            # Explicit NaN avoids numpy's empty-mean RuntimeWarning when no fold was found.
            results_per_method[method].append(np.mean(f1_scores) if f1_scores else np.nan)

    df_methods = pd.DataFrame(results_per_method)
    df_methods = pd.concat([pd.DataFrame(dataframe_names, columns=["dataset"]), df_methods], axis=1)
    return dataframe_names, df_methods


### First benchmark ###

In [6]:
# Values swept for the `distMethod` argument of DAB_SMOTE; each one becomes a result column.
methods = ["euclidean", "manhattan", "chebyshev"]
# Keyword arguments held constant for every run.
# NOTE(review): `fiexed_params` is a typo for `fixed_params`; the name is kept
# because later cells may reference it. Rename everywhere in one pass.
fiexed_params = {"k": 1}
# Runs the sweep over every dataset directory under imb_IRLowerThan9 and keeps
# both the dataset names and the per-method mean F1 table.
dataframe_names, dbscan_distances_df = benchmark("./../../data/benchmarks/imb_IRLowerThan9/","distMethod", methods, fiexed_params)

Processing datasets: 100%|██████████| 22/22 [01:03<00:00,  2.88s/it]


In [7]:
# Apply highlight_max row by row (axis=1): the best score of each dataset is
# shown in green, or in orange when several methods tie for the best score.
dbscan_distances_df_color = dbscan_distances_df.style.apply(highlight_max, axis=1)
# Bare last expression so the notebook renders the styled table.
dbscan_distances_df_color

Unnamed: 0,dataset,euclidean,manhattan,chebyshev
0,ecoli-0_vs_1,0.976369,0.976369,0.975662
1,ecoli1,0.769922,0.769922,0.756427
2,ecoli2,0.708644,0.708644,0.714277
3,ecoli3,0.604589,0.611256,0.600384
4,glass-0-1-2-3_vs_4-5-6,0.832573,0.85621,0.844954
5,glass0,0.644528,0.644528,0.6465
6,glass1,0.576912,0.576912,0.576062
7,glass6,0.812821,0.779487,0.791608
8,haberman,0.466593,0.48159,0.480482
9,iris0,1.0,1.0,1.0
