In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, balanced_accuracy_score, roc_auc_score

from pymoo.operators.mutation.bitflip import BitflipMutation, Mutation
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling, Sampling
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.indicators.hv import Hypervolume
from pymoo.core.problem import Problem
from pymoo.optimize import minimize

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import torch

from scipy.stats import ranksums

from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
import pandas as pd
import numpy as np
import pickle
import os
import re

from main import *

import matplotlib.pyplot as plt

with open('../data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)

data_keys = list(data_mapper.keys())

In [3]:
results_by_name = {}
synthetic_by_name = {}
for file in os.listdir('results'):
	if 'csv' in file:
		synthetic_by_name[file] = pd.read_csv(f"results/{file}")
		
	else:
		with open(f"results/{file}", 'rb') as fh:
			results_by_name[file] = pickle.load(fh)

In [14]:
synthetic_by_name

{'0_abalone-20_vs_8-9-10.csv':            0         1         2         3         4         5         6
 0   0.541783  0.632622  0.914845  0.851335  0.287811  0.575589  1.585080
 1   0.542695  0.685580  0.940856  0.884839  0.289538  0.594237  1.688329
 2   0.557034  0.708311  0.965232  0.911167  0.297396  0.615129  1.749495
 3   0.555569  0.705658  0.962030  0.908031  0.296367  0.612956  1.743129
 4   0.543930  0.680587  0.925192  0.887299  0.289122  0.578805  1.705232
 5   0.556801  0.713600  0.979276  0.916276  0.311769  0.628934  1.763386
 6   0.563546  0.711783  0.965632  0.918238  0.300803  0.617695  1.761925
 7   0.548072  0.692496  0.939757  0.893065  0.292245  0.600745  1.713910
 8   0.560306  0.707745  0.960214  0.912967  0.299011  0.614146  1.751871
 9   0.550238  0.695196  0.943378  0.896589  0.293443  0.603118  1.720630
 10  0.454360  0.579187  0.789507  0.748192  0.238492  0.506210  1.393479
 11  0.535224  0.680383  0.924844  0.880874  0.283132  0.596305  1.638011,
 '0_aba

In [15]:
iter_by_datset_name = {}
ir_by_data = {}
num_samples = {}
for file in results_by_name:
	iter_name = file.replace(".result", '')
	iter_num = iter_name.split("_")[0]
	dataset_name = "_".join(iter_name.split("_")[1:])
	
	if dataset_name not in iter_by_datset_name:
		iter_by_datset_name[dataset_name] = []
	
	iter_by_datset_name[dataset_name].append(file)
	print(file)

	if dataset_name not in ir_by_data:
		num_samples[dataset_name] = []
		ir_by_data[dataset_name] = []

	y_train = data_mapper[iter_name]['y_train']
	counts = pd.DataFrame(y_train).value_counts()
	ir_by_data[dataset_name].append(counts.max() / counts.min())
	num_samples[dataset_name].append(len(synthetic_by_name[file.replace('result','csv')]))

for k in iter_by_datset_name:
	print(len(iter_by_datset_name[k]), repr(k))

0_abalone-20_vs_8-9-10.result
0_abalone19.result
0_abalone9-18.result
0_cleveland-0_vs_4.result
0_ecoli-0-1-4-7_vs_2-3-5-6.result
0_ecoli-0-2-6-7_vs_3-5.result
0_ecoli-0-4-6_vs_5.result
0_ecoli-0-6-7_vs_3-5.result
0_ecoli1.result
0_ecoli4.result
0_glass-0-6_vs_5.result
0_glass1.result
0_glass6.result
0_haberman.result
0_pima.result
0_winequality-red-8_vs_6-7.result
0_wisconsin.result
0_yeast-2_vs_4.result
0_yeast4.result
10_abalone-20_vs_8-9-10.result
10_abalone19.result
10_abalone9-18.result
10_cleveland-0_vs_4.result
10_ecoli-0-1-4-7_vs_2-3-5-6.result
10_ecoli-0-2-6-7_vs_3-5.result
10_ecoli-0-4-6_vs_5.result
10_ecoli-0-6-7_vs_3-5.result
10_ecoli1.result
10_ecoli4.result
10_glass-0-6_vs_5.result
10_glass1.result
10_glass6.result
10_haberman.result
10_pima.result
10_winequality-red-8_vs_6-7.result
10_wisconsin.result
10_yeast-2_vs_4.result
11_abalone-20_vs_8-9-10.result
11_abalone19.result
11_abalone9-18.result
11_cleveland-0_vs_4.result
11_ecoli-0-1-4-7_vs_2-3-5-6.result
11_ecoli-0-2-

In [20]:
for key in ir_by_data:
    print(f"{key:>25} {round(np.mean(ir_by_data[key]), 3):<15} {round(np.mean(num_samples[key]))}")

     abalone-20_vs_8-9-10 72.692          10
                abalone19 129.438         1085
              abalone9-18 16.381          20
         cleveland-0_vs_4 13.333          24
 ecoli-0-1-4-7_vs_2-3-5-6 10.69           38
     ecoli-0-2-6-7_vs_3-5 9.182           34
         ecoli-0-4-6_vs_5 9.1             33
       ecoli-0-6-7_vs_3-5 9.091           34
                   ecoli1 3.355           60
                   ecoli4 15.8            26
           glass-0-6_vs_5 11.368          20
                   glass1 1.816           70
                   glass6 6.643           48
                 haberman 2.825           43
                     pima 1.866           213
 winequality-red-8_vs_6-7 46.444          11
                wisconsin 1.866           75
             yeast-2_vs_4 9.089           41
                   yeast4 28.109          16


In [4]:
curr = "ecoli4"

# Oversample versus no oversample (no optimization both schemes)

In [5]:
for curr in iter_by_datset_name:
	validation_baseline = []
	test_baseline = []

	oversample_validation = []
	oversample_test = []

	record = []
	lists = {
		"Validation baseline acc": [],
		"Test baseline acc": [],

		"Validation baseline auc": [],
		"Test baseline auc": [],

		"Validation oversample auc": [],
		"Test oversample auc": [],

		"Validation oversample acc": [],
		"Test oversample acc": [],

	}
	for file in iter_by_datset_name[curr]:

		result = results_by_name[file]
		synthetic_samples = synthetic_by_name[file.replace(".result", ".csv")]
		
		data_split = data_mapper[file.replace(".result", "")]
		x_train, y_train = data_split['x_train'], data_split['y_train']
		x_validation, y_validation = data_split['x_validation'], data_split['y_validation']
		x_test, y_test = data_split['x_test'], data_split['y_test']
		
		
		minority_label = pd.DataFrame(y_train).value_counts().argmin()
		# x_SYNTH, y_SYNTH = np.concatenate((x_train, synthetic_samples)), np.concatenate((y_train, [minority_label] * len(synthetic_samples)))

		# Fit with baseline train
		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		model.fit(x_train, y_train)

		y_pred = model.predict(x_validation)
		baseline_validation_acc = accuracy_score(y_validation, y_pred)
		baseline_validation_auc = roc_auc_score(y_validation, y_pred)
		
		y_pred = model.predict(x_test)
		baseline_test_acc = accuracy_score(y_test, y_pred)
		baseline_test_auc = roc_auc_score(y_test, y_pred)

		# Fit with oversampled train
		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		model.fit(x_train, y_train)
		#  model.fit(x_SYNTH, y_SYNTH)

		y_pred = model.predict(x_validation)
		oversample_validation_acc = accuracy_score(y_validation, y_pred)
		oversample_validation_auc = roc_auc_score(y_validation, y_pred)

		y_pred = model.predict(x_test)
		oversample_test_acc = accuracy_score(y_test, y_pred)
		oversample_test_auc = roc_auc_score(y_test, y_pred)

		lists["Validation baseline acc"].append(baseline_validation_acc)
		lists["Test baseline acc"].append(baseline_test_acc)

		lists["Validation baseline auc"].append(baseline_validation_auc)
		lists["Test baseline auc"].append(baseline_test_auc)

		lists["Validation oversample auc"].append(oversample_validation_auc)
		lists["Test oversample auc"].append(oversample_test_auc)

		lists["Validation oversample acc"].append(oversample_validation_acc)
		lists["Test oversample acc"].append(oversample_test_acc)

	counts = pd.DataFrame(y_train).value_counts()
	print(curr, f"IR: {round(counts.max()/counts.min(), 4)}")

	print(f"Mean validation acc diff {np.mean(np.subtract(lists['Validation oversample acc'], lists['Validation baseline acc']))}")
	print(f"Mean test acc diff       {np.mean(np.subtract(lists['Test oversample acc'], lists['Test baseline acc']))}")

	print(f"Mean validation auc diff {np.mean(np.subtract(lists['Validation oversample auc'], lists['Validation baseline auc']))}")
	print(f"Mean test auc diff       {np.mean(np.subtract(lists['Test oversample auc'], lists['Test baseline auc']))}")
	
	print(f"\nValidation acc diff pval {True if ranksums(lists['Validation baseline acc'], lists['Validation oversample acc']).pvalue < 0.05 else False}")
	print(f"Test acc diff pval       {True if ranksums(lists['Test baseline acc'], lists['Test oversample acc']).pvalue < 0.05 else False}")

	print(f"\nValidation auc diff pval {True if ranksums(lists['Validation baseline auc'], lists['Validation oversample auc']).pvalue < 0.05 else False}")
	print(f"Test auc diff pval       {True if ranksums(lists['Test baseline auc'], lists['Test oversample auc']).pvalue < 0.05 else False}")
	print("\n--\n")
	


	

abalone-20_vs_8-9-10 IR: 72.6923
Mean validation acc diff 0.0
Mean test acc diff       0.0
Mean validation auc diff 0.0
Mean test auc diff       0.0

Validation acc diff pval False
Test acc diff pval       False

Validation auc diff pval False
Test auc diff pval       False

--

abalone19 IR: 129.4375
Mean validation acc diff 0.0
Mean test acc diff       0.0
Mean validation auc diff 0.0
Mean test auc diff       0.0

Validation acc diff pval False
Test acc diff pval       False

Validation auc diff pval False
Test auc diff pval       False

--

abalone9-18 IR: 16.381
Mean validation acc diff 0.0
Mean test acc diff       0.0
Mean validation auc diff 0.0
Mean test auc diff       0.0

Validation acc diff pval False
Test acc diff pval       False

Validation auc diff pval False
Test auc diff pval       False

--

cleveland-0_vs_4 IR: 13.3333
Mean validation acc diff 0.0
Mean test acc diff       0.0
Mean validation auc diff 0.0
Mean test auc diff       0.0

Validation acc diff pval False
Tes

# Oversample + optimization versus baseline

In [6]:
for curr in iter_by_datset_name:
	validation_baseline = []
	test_baseline = []

	oversample_validation = []
	oversample_test = []

	record = []
	lists = {
		"Validation baseline auc": [],
		"Test baseline auc": [],

		"Validation baseline acc": [],
		"Test baseline acc": [],

		"Optimized Validation auc": [],
		"Optimized Test auc": [],
		"Ideal Test auc": [],

		"Optimized Validation acc": [],
		"Optimized Test acc": [],
		"Ideal Test acc": [],

	}
	for file in iter_by_datset_name[curr]:

		result = results_by_name[file]
		synthetic_samples = synthetic_by_name[file.replace(".result", ".csv")]
		
		data_split = data_mapper[file.replace(".result", "")]
		x_train, y_train = data_split['x_train'], data_split['y_train']
		x_validation, y_validation = data_split['x_validation'], data_split['y_validation']
		x_test, y_test = data_split['x_test'], data_split['y_test']
		
		minority_label = pd.DataFrame(y_train).value_counts().argmin()

		# Fit with baseline train
		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		model.fit(x_train, y_train)

		y_pred = model.predict(x_validation)
		baseline_validation_acc = accuracy_score(y_validation, y_pred)
		baseline_validation_auc = roc_auc_score(y_validation, y_pred)
		
		y_pred = model.predict(x_test)
		baseline_test_acc = accuracy_score(y_test, y_pred)
		baseline_test_auc = roc_auc_score(y_test, y_pred)

		x_SYNTH, y_SYNTH = np.concatenate((x_train, synthetic_samples)), np.concatenate((y_train, [minority_label] * len(synthetic_samples)))

		# Select ideal instance
		problem = AUC_Optimizer(
			x_SYNTH,
			y_SYNTH,
			x_validation,
			y_validation,
		)
		algorithm = NSGA2(
			pop_size=AUC_Optimizer.population_size, 
			sampling=DiverseCustomSampling(),
			crossover=HUX(), 
			mutation=BitflipMutation(), 
			eliminate_duplicates=True,
		)
		result = minimize(
			problem, 
			algorithm, 
			('n_gen', AUC_Optimizer.population_size), # <--- maybe increase
			save_history=False
		)
		
		validation_aucs = []
		test_aucs = []
		
		for instance in result.X:
			model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
			model.fit(x_SYNTH[instance], y_SYNTH[instance])
			y_pred = model.predict(x_validation)
			validation_aucs.append(roc_auc_score(y_validation, y_pred))
			y_pred = model.predict(x_test)
			test_aucs.append(roc_auc_score(y_test, y_pred))

		validation_idx = np.argmin(validation_aucs)
		test_idx = np.argmin(test_aucs)

		# Calculate metrics using ideal instance w.r.t validation AUC
		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		model.fit(x_SYNTH[result.X[validation_idx]], y_SYNTH[result.X[validation_idx]])

		y_pred = model.predict(x_validation)
		optimized_validation_acc = accuracy_score(y_validation, y_pred)
		optimized_validation_auc = roc_auc_score(y_validation, y_pred)
		
		y_pred = model.predict(x_test)
		optimized_test_acc = accuracy_score(y_test, y_pred)
		optimized_test_auc = roc_auc_score(y_test, y_pred)

		# Calculate metrics using ideal instance w.r.t test AUC
		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		model.fit(x_SYNTH[result.X[test_idx]], y_SYNTH[result.X[test_idx]])
		
		y_pred = model.predict(x_test)
		ideal_test_acc = accuracy_score(y_test, y_pred)
		ideal_test_auc = roc_auc_score(y_test, y_pred)

		lists["Validation baseline acc"].append(baseline_validation_acc)
		lists["Test baseline acc"].append(baseline_test_acc)

		lists["Validation baseline auc"].append(baseline_validation_auc)
		lists["Test baseline auc"].append(baseline_test_auc)

		lists["Optimized Validation auc"].append(optimized_validation_auc)
		lists["Optimized Test auc"].append(optimized_test_auc)
		lists["Ideal Test auc"].append(ideal_test_auc)

		lists["Optimized Validation acc"].append(optimized_validation_acc)
		lists["Optimized Test acc"].append(optimized_test_acc)
		lists["Ideal Test acc"].append(ideal_test_acc)

	counts = pd.DataFrame(y_train).value_counts()
	print(curr, f"IR: {round(counts.max()/counts.min(), 4)}")

	print(f"Mean optimized validation acc diff {np.mean(np.subtract(lists['Optimized Validation acc'], lists['Validation baseline acc']))}")
	print(f"Mean optimized test acc diff       {np.mean(np.subtract(lists['Optimized Test acc'], lists['Test baseline acc']))}")
	print(f"Mean ideal test acc diff           {np.mean(np.subtract(lists['Ideal Test acc'], lists['Test baseline acc']))}")
	
	print(f"Mean optimized validation auc diff {np.mean(np.subtract(lists['Optimized Validation auc'], lists['Validation baseline auc']))}")
	print(f"Mean optimized test auc diff       {np.mean(np.subtract(lists['Optimized Test auc'], lists['Test baseline auc']))}")
	print(f"Mean ideal test auc diff           {np.mean(np.subtract(lists['Ideal Test auc'], lists['Test baseline auc']))}")
	
	print(f"\nValidation acc diff pval         {True if ranksums(lists['Validation baseline acc'], lists['Optimized Validation acc']).pvalue < 0.05 else False}")
	print(f"Test acc diff pval                 {True if ranksums(lists['Test baseline acc'], lists['Optimized Test acc']).pvalue < 0.05 else False}")
	print(f"Ideal Test acc diff pval           {True if ranksums(lists['Test baseline acc'], lists['Ideal Test acc']).pvalue < 0.05 else False}")

	print(f"\nValidation auc diff pval         {True if ranksums(lists['Validation baseline auc'], lists['Optimized Validation auc']).pvalue < 0.05 else False}")
	print(f"Test auc diff pval                 {True if ranksums(lists['Test baseline auc'], lists['Test baseline auc']).pvalue < 0.05 else False}")
	print(f"Ideal Test auc diff pval           {True if ranksums(lists['Test baseline auc'], lists['Ideal Test auc']).pvalue < 0.05 else False}")
	print("\n--\n")
	


	


Compiled modules for significant speedup can not be used!
https://pymoo.org/installation.html#installation

from pymoo.config import Config



KeyboardInterrupt: 

In [None]:
temp = np.concatenate((x_train, synthetic_samples))
temp1 = np.concatenate((y_train, [minority_label] * len(synthetic_samples)))

x_train.shape, temp.shape, temp1.shape, result.X.shape


((958, 7), (970, 7), (970,), (6, 981))