In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import ranksums

from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.operators.sampling.rnd import Sampling
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.core.problem import Problem
from pymoo.optimize import minimize

from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch

from joblib import Parallel, delayed
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import os

from main import *

In [2]:
# Load the pre-built dataset container. Later cells index it as
# data_mapper[data_key]['x_train' | 'y_train' | 'x_validation' | ...].
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open 'data.pickle' if it comes from a trusted source.
with open('data.pickle', 'rb') as pickle_file:
	data_mapper = pickle.load(pickle_file)

# Table of split identifiers: each column is iterated as one dataset and its
# cells are used as keys into `data_mapper`. Reading it does not need the
# pickle file handle, so it lives outside the `with` block.
splits = pd.read_csv('data_splits.csv')

In [3]:
# Per-dataset k-NN baseline.
#
# For every dataset column in `splits` (columns whose name contains 'zoo' are
# skipped), fit a k-NN classifier on each split, record the validation ROC-AUC
# and the value returned by `calculate_IR(y_train)`, then print the dataset
# name followed by the mean of those `calculate_IR` values.
for key in splits:
	# The accumulators are re-created for every dataset, so once the loop has
	# finished they only describe the last column of `splits`.
	# `minority_varience` / `global_varience` are only filled by the disabled
	# cVAE section at the bottom of the loop body.
	minority_varience = []
	global_varience = []
	auc_baseline = []
	ir_baseline = []

	if 'zoo' in key:
		continue

	for data_key in splits[key]:
		# print(data_key)
		# Currently unused; kept so the identifiers are available when debugging.
		split_number, dataset_name = split_info(data_key)

		split_data = data_mapper[data_key]
		x_train = split_data['x_train']
		y_train = split_data['y_train']
		x_validation = split_data['x_validation']
		y_validation = split_data['y_validation']
		# The test split is not used inside this loop; the NSGA-II cell further
		# down reads the values left behind by the final iteration.
		x_test = split_data['x_test']
		y_test = split_data['y_test']

		# Baseline: k-NN trained on the full training split.
		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		model.fit(x_train, y_train)

		# NOTE(review): the AUC is computed from hard class predictions rather
		# than scores from predict_proba. Left unchanged, presumably so that it
		# stays comparable with the optimiser's own evaluation - confirm.
		y_pred = model.predict(x_validation)
		baseline_validation_AUC = roc_auc_score(y_validation, y_pred)
		auc_baseline.append(baseline_validation_AUC)

		# Minority-class subset of the validation split.
		# The label is taken from np.unique's counts. The earlier
		# `pd.DataFrame(y).value_counts().argmin()` yielded the *position* of
		# the smallest count, which only matches the label by coincidence.
		# NOTE(review): an identical computation on the training split used to
		# run first, but every name it set was overwritten here, so it was
		# removed. Switch to x_train / y_train if that split was intended.
		class_labels, class_counts = np.unique(y_validation, return_counts=True)
		minority_label = class_labels[np.argmin(class_counts)]
		minority_indices = np.where(y_validation == minority_label)[0]
		minority_features = x_validation[minority_indices]
		minority_labels = y_validation[minority_indices]

		# print(len(minority_labels))

		# --- Disabled experiment: NSGA-II instance selection -----------------
		# problem = AUC_Optimizer(
		# 	x_train, y_train, 
		# 	x_validation, y_validation,
		# 	X_test=x_test,
		# 	Y_test=y_test)

		# algorithm = NSGA2(pop_size=AUC_Optimizer.population_size, sampling=DiverseCustomSampling(), crossover=HUX(), mutation=BitflipMutation(), eliminate_duplicates=True)
		# result = minimize(problem, algorithm, ('n_gen', AUC_Optimizer.population_size), save_history=False)

		# for instance in result.X:
		# 	# instance = [idx]

		# 	if np.sum(instance) >= AUC_Optimizer.n_neighbours:
		# 		model = KNeighborsClassifier(n_neighbors=AUC_Optimizer.n_neighbours)
		# 		model.fit(
		# 			self.X_train[instance], 
		# 			self.y_train[instance]
		# 		)
		# 		y_pred = model.predict(self.X_val)
		# 		validation_aucs.append(roc_auc_score(self.y_val, y_pred))
		# 		y_pred = model.predict(self.X_TEST)
		# 		test_aucs.append(roc_auc_score(self.Y_TEST, y_pred))
		# 	else:
		# 		validation_aucs.append(0)
		# 		test_aucs.append(0)

		# validation_idx = np.argmax(validation_aucs)

		# `calculate_IR` comes from `main`; presumably the class imbalance
		# ratio of the training labels - verify against its definition.
		ir_baseline.append(calculate_IR(y_train))
		# print(len(minority_labels))

		# --- Disabled experiment: cVAE latent-variance statistics ------------
		# Define configuration of cVAE
		# input_dim = x_train[0].shape[0]
		# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
		# cvae = ConditionalVAE(input_dim, 1, input_dim//2, 2).to(device)
		# cvae = train(x_train, y_train, cvae, lr=1e-3, epochs=200, batch_size=20, beta=0.8)

		# global_varience.append(calculate_latent_dimension_variance(x_train, y_train, cvae))
		# minority_varience.append(calculate_latent_dimension_variance(minority_features, minority_labels, cvae))

	print(key)
	# print(np.mean(auc_baseline, axis=0), np.min(auc_baseline, axis=0), np.max(auc_baseline, axis=0))
	print(f"{np.mean(ir_baseline)}\n")
		

abalone-20_vs_8-9-10
72.69230769230768

abalone19
129.4375

abalone9-18
16.380952380952387

cleveland-0_vs_4
13.333333333333329

ecoli-0-1-4-7_vs_2-3-5-6
10.69032258064516

ecoli-0-2-6-7_vs_3-5
9.181818181818183

ecoli-0-4-6_vs_5
9.100000000000001

ecoli-0-6-7_vs_3-5
9.090909090909088

ecoli1
3.3552305080318656

ecoli4
15.800000000000004

glass-0-6_vs_5
11.367741935483872

glass1
1.815789473684211

glass6
6.642857142857142

haberman
2.825000000000001

pima
1.865671641791045

winequality-red-8_vs_6-7
46.444444444444414

wisconsin
1.865546218487395

yeast-2_vs_4
9.08868486352357

yeast4
28.127642679900738



In [10]:
# (mean, min, max) of the minority-class latent variances, reduced over axis 0.
# NOTE(review): `minority_varience` is only filled while the cVAE section of
# the per-dataset loop is enabled; with that section commented out the list is
# empty and the min/max reductions raise.
tuple(stat(minority_varience, axis=0) for stat in (np.mean, np.min, np.max))

(array([0.9970051 , 0.96717733], dtype=float32),
 array([0.42999035, 0.57313544], dtype=float32),
 array([1.9429154, 1.9661407], dtype=float32))

In [11]:
# (mean, min, max) of the whole-training-set latent variances, reduced over axis 0.
# NOTE(review): `global_varience` is only filled while the cVAE section of the
# per-dataset loop is enabled; with that section commented out the list is
# empty and the min/max reductions raise.
tuple(stat(global_varience, axis=0) for stat in (np.mean, np.min, np.max))

(array([0.98154336, 1.0109628 ], dtype=float32),
 array([0.90596944, 0.9211443 ], dtype=float32),
 array([1.0840032, 1.1332822], dtype=float32))

In [22]:
# (mean, min, max) of the k-NN baseline validation AUCs.
# NOTE(review): `auc_baseline` is re-created for every dataset in the baseline
# loop, so these figures cover only the last dataset that loop visited.
tuple(stat(auc_baseline, axis=0) for stat in (np.mean, np.min, np.max))

(np.float64(0.5545940363475055),
 np.float64(0.4958100558659218),
 np.float64(0.6538461538461539))

In [13]:
# NSGA-II run on a single split.
# NOTE(review): x_train / y_train / x_validation / y_validation / x_test /
# y_test are not defined in this cell - they are whatever the final iteration
# of the per-dataset baseline loop left in the kernel.
problem = AUC_Optimizer(
	x_train,
	y_train,
	x_validation,
	y_validation,
	X_test=x_test,
	Y_test=y_test,
)

# Binary-string search operators: HUX crossover with bit-flip mutation.
algorithm = NSGA2(
	pop_size=AUC_Optimizer.population_size,
	sampling=DiverseCustomSampling(),
	crossover=HUX(),
	mutation=BitflipMutation(),
	eliminate_duplicates=True,
)

# NOTE(review): the generation budget reuses `population_size`; confirm this is
# intentional rather than a missing dedicated constant.
termination = ('n_gen', AUC_Optimizer.population_size)
result = minimize(problem, algorithm, termination, save_history=False)

In [20]:
# Shape of the decision vector held by the first individual of the final
# population (the recorded run shows a length-742 vector).
first_individual = result.pop[0]
first_individual.X.shape

(742,)