In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.indicators.hv import Hypervolume
from pymoo.core.mutation import Mutation
from pymoo.core.sampling import Sampling
from pymoo.core.problem import Problem
from pymoo.optimize import minimize

import matplotlib.pyplot as plt
from pymoo.operators.mutation.bitflip import BitflipMutation

from sklearn.model_selection import StratifiedShuffleSplit

from joblib import Parallel, delayed

from scipy.stats import ranksums

from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
import torch.nn as nn

import pickle
import os
import re

from ucimlrepo import fetch_ucirepo 

import pandas as pd
import numpy as np

In [40]:
def load_datasets():
	datasets = [
		(fetch_ucirepo(id=52), "ionosphere"),
		(fetch_ucirepo(id=43), "haberman"),
		(fetch_ucirepo(id=53), "iris0"),
		(fetch_ucirepo(id=42), "glass1"),
		(fetch_ucirepo(id=143), "australia"),
		(fetch_ucirepo(id=277), "thoracic"),
		(fetch_ucirepo(id=50), "segment0"),
		# (fetch_ucirepo(id=149), "vehicle0"),
		(fetch_ucirepo(id=109), "wine"),
		# (fetch_ucirepo(id=39), "ecoli"),
		(fetch_ucirepo(id=225), "ILPD"),
		(fetch_ucirepo(id=45), "heart_disease"),
		(fetch_ucirepo(id=17), "wisconsin"),
		# (fetch_ucirepo(id=73), "mushroom"),
		(fetch_ucirepo(id=94), "spambase"),
		(fetch_ucirepo(id=161), "mammographic"),
		# (fetch_ucirepo(id=12), "balance"),
		(fetch_ucirepo(id=110), "yeast1"),
		(fetch_ucirepo(id=451), "coimbra"),
		(fetch_ucirepo(id=244), "fertility")
	]

	return datasets

def prepare_splits(x, y):
	train_split = StratifiedShuffleSplit(
		n_splits=31, 
		test_size=0.5
	)
	splits = []
	for train_idx, temp_idx in train_split.split(x, y):
		test_split = StratifiedShuffleSplit(
			n_splits=1, 
			test_size=0.5
		)
		test_idx, validation_idx = next(test_split.split(x[temp_idx], y[temp_idx]))

		validation_idx = temp_idx[validation_idx]
		test_idx = temp_idx[test_idx]
		
		splits.append((train_idx, validation_idx, test_idx))
	return splits

def create_preprocessor_pipeline(variables):
	
	type_mappings = {}
	for variable_idx, variable_name in enumerate(variables['name']):
		variable_type = variables['type'][variable_idx]
		if variable_type not in type_mappings:
			type_mappings[variable_type] = []

		if variables['role'][variable_idx] == 'Feature':
			type_mappings[variable_type].append(variable_name)

	categorical_transformer = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='most_frequent')),
		('onehot', OneHotEncoder(handle_unknown='ignore'))
	])
	numerical_transformer = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='mean')),
		('scaler', StandardScaler())
	])

	numerical_features = []
	if 'Continuous' in type_mappings:
		for feature in type_mappings['Continuous']:
			numerical_features.append(feature)
	if 'Integer' in type_mappings:
		for feature in type_mappings['Integer']:
			numerical_features.append(feature)
			
	transformer_steps = []
	if numerical_features != []:
		transformer_steps.append(
			('num', numerical_transformer, numerical_features)
		)
	if 'Categorical' in type_mappings:
		transformer_steps.append(
			('cat', categorical_transformer, type_mappings['Categorical'])
		)
	preprocessor = ColumnTransformer(
		transformers=transformer_steps
	)
	pipeline = Pipeline(steps=[
		('preprocessor', preprocessor)
	])
	
	return pipeline

def over_sample(x, y):
	counts = pd.DataFrame(y).value_counts()
	minority_class_label = counts.index[np.argmin(counts)][0]
	minority_class_indicies = np.where(y == minority_class_label)[0]
	# y = y.reshape(-1, 1)
	over_sampled_x = np.concatenate((x, x[minority_class_indicies]), axis=0)
	over_sampled_y = np.concatenate((y, y[minority_class_indicies]), axis=0)
	# over_sampled_y = over_sampled_y.reshape(-1)    
	return over_sampled_x, over_sampled_y

def parallel_error(instance, x_train, y_train, x_compare, y_compare):
	x_filtered, y_filtered = x_train[instance], y_train[instance]
	if x_filtered.shape[0] < GenericOptimizer.n_neighbours: 
		error = 1
	else:
		knn = KNeighborsClassifier(n_neighbors=GenericOptimizer.n_neighbours)
		knn.fit(x_filtered, y_filtered)
		y_pred = knn.predict(x_compare)
		error = 1 - accuracy_score(y_compare, y_pred)
	return error

def calculate_metrics(x_train, y_train, x_validation, y_validation, x_test, y_test, result):
	baseline_validation_err = GenericOptimizer.calculate_overall_error(
		x_train, y_train,
		x_validation, y_validation,
		GenericOptimizer.n_neighbours
	)
	baseline_test_err = GenericOptimizer.calculate_overall_error(
		x_train, y_train,
		x_test, y_test,
		GenericOptimizer.n_neighbours
	)

	validation_F = Parallel(n_jobs=-1)(delayed(parallel_error)(instance, x_train, y_train, x_validation, y_validation) for instance in result.X)
	test_F = Parallel(n_jobs=-1)(delayed(parallel_error)(instance, x_train, y_train, x_test, y_test) for instance in result.X)
	ideal_validation = result.X[np.argmin(validation_F)]
	ideal_test = result.X[np.argmin(test_F)]
	
	validation_inclusions = np.sum(ideal_validation)
	test_inclusions = np.sum(ideal_test)

	# reduction_rate = 1 - (best_x_train.shape[0] / x_train.shape[0])
	# optimized_ir = GenericOptimizer.calculate_IR(best_y_train)
	optimized_validation_err = GenericOptimizer.calculate_overall_error(
		x_train[ideal_validation],
		y_train[ideal_validation],
		x_validation,
		y_validation,
		GenericOptimizer.n_neighbours
	)
	optimized_test_err = GenericOptimizer.calculate_overall_error(
		x_train[ideal_validation],
		y_train[ideal_validation],
		x_test,
		y_test,
		GenericOptimizer.n_neighbours
	)
	ideal_optimized_test_err = GenericOptimizer.calculate_overall_error(
		x_train[ideal_test],
		y_train[ideal_test],
		x_test,
		y_test,
		GenericOptimizer.n_neighbours
	)
	return validation_inclusions, test_inclusions, baseline_validation_err, baseline_test_err, optimized_validation_err, optimized_test_err, ideal_optimized_test_err

class GenericOptimizer(Problem):
	population_size = 100
	n_neighbours = 5
	sequential = False
	def __init__(self, X_train, y_train, X_val, y_val, objectives, exec_mode):
		self.mutation_history = {}
		self.generation_number = 0

		self.exec_mode = exec_mode

		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]
		
		self.objectives = objectives

		super().__init__(
			n_var=self.n_instances,
			n_obj=len(objectives),               
			n_constr=0,            
			xl=0,                  
			xu=1,                  
			type_var=np.bool_,     
		)

	def _evaluate(self, x, out, *args, **kwargs):
		
		if self.exec_mode == "sequential":
			metrics = []
			for objective in self.objectives:
				metrics.append(self.eval_objective((objective, x)))
		else:
			metrics = Parallel(n_jobs=-1)(delayed(self.eval_objective)((objective, x)) for objective in self.objectives)
		
		self.generation_number += 1
		print(metrics)
		out["F"] = np.column_stack(metrics)

	def eval_objective(self, pack):
		objective, x = pack
			
		if "calculate_num_examples" in repr(objective):
			return GenericOptimizer.calculate_num_examples(x)

		elif "calculate_IR" in repr(objective):
			vals = []
			for instance in x:
				vals.append(GenericOptimizer.calculate_IR(self.y_train[instance]))
			return vals
		
		else:
			vals = []
			for instance in x:
				vals.append(objective(
					self.X_train[instance],
					self.y_train[instance],
					self.X_val,
					self.y_val,
					GenericOptimizer.n_neighbours
				))
			return vals

	@classmethod
	def calculate_IR(cls, y):
		df = pd.DataFrame(y).value_counts()
		return (df[1]/df[0]) if df.min() == 0 else (df[0]/df[1])
	
	@classmethod
	def filter_by_class(cls, x, y, label):
		indices = np.where(y==label)
		return x[indices], y[indices]
	
	@classmethod
	def calculate_overall_error(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]

		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			acc = accuracy_score(y_val, y_pred)
			return 1-acc
		else:
			return 1

	@classmethod
	def calculate_class0_error(cls, x_train, y_train, x_val, y_val, n):
		class0_x_val, class0_y_val = cls.filter_by_class(x_val, y_val, 0)
		err = cls.calculate_overall_error(
			x_train,
			y_train,
			class0_x_val,
			class0_y_val,
			n
		)
		return err

	@classmethod
	def calculate_class1_error(cls, x_train, y_train, x_val, y_val, n):
		class1_x_val, class1_y_val = cls.filter_by_class(x_val, y_val, 1)
		err = cls.calculate_overall_error(
			x_train,
			y_train,
			class1_x_val,
			class1_y_val,
			n
		)
		return err

	@classmethod
	def calculate_overall_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]
		counts = pd.DataFrame(y_train).value_counts()
		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			f1 = f1_score(y_val, y_pred, average='weighted')
			return 1-f1
		else:
			return 1

	@classmethod
	def calculate_class0_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		inv_f1 = cls.calculate_overall_inverse_f1(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_f1

	@classmethod
	def calculate_class1_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		inv_f1 = cls.calculate_overall_inverse_f1(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_f1
	
	@classmethod
	def calculate_overall_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]

		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			prec = precision_score(y_val, y_pred)
			return 1-prec
		else:
			return 1

	@classmethod
	def calculate_class0_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		inv_prec = cls.calculate_overall_inverse_precision(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_prec

	@classmethod
	def calculate_class1_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		inv_prec = cls.calculate_overall_inverse_precision(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_prec
		
	@classmethod
	def calculate_overall_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]

		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			recall = recall_score(y_val, y_pred)
			return 1-recall
		else:
			return 1

	@classmethod
	def calculate_class0_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		inv_recall = cls.calculate_overall_inverse_recall(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_recall

	@classmethod
	def calculate_class1_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		inv_recall = cls.calculate_overall_inverse_recall(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_recall
	
	@classmethod
	def calculate_num_examples(cls, instances):
		return np.sum(instances, axis=1)

	@classmethod
	def quantify_performance(cls, population, objectives, x_train, y_train, x_validation, y_validation, x_test, y_test):
		pass

	@classmethod
	def unbound_eval_objectives(cls, objective, instances, x_train, y_train, x_validation, y_validation):
		if "calculate_num_examples" in repr(objective):
			return GenericOptimizer.calculate_num_examples(instances)

		elif "calculate_IR" in repr(objective):
			vals = []
			for instance in instances:
				vals.append(GenericOptimizer.calculate_IR(y_train[instance]))
			return vals
		
		else:
			vals = []
			for instance in instances:
				vals.append(objective(
					x_train[instance],
					y_train[instance],
					x_validation,
					y_validation,
					GenericOptimizer.n_neighbours
				))
			return vals
		
	@classmethod
	def calculate_optimal_instance(cls, x_train, y_train, x_val, y_val, metrics, population, n):

		fronts = NonDominatedSorting().do(metrics, only_non_dominated_front=True)
		_, pareto_indicies = np.unique(metrics[fronts], axis=0, return_index=True)

		best_acc = 0
		best_instance = None
		for idx, instance in enumerate(population[pareto_indicies]):
			x_filtered, y_filtered = x_train[instance], y_train[instance]
			if x_filtered.shape[0] < n: 
				acc = 0
			else:
				knn = KNeighborsClassifier(n_neighbors=n)
				knn.fit(x_filtered, y_filtered)
				y_pred = knn.predict(x_val)
				acc = accuracy_score(y_val, y_pred)
			
				if acc > best_acc:
					best_acc = acc
					best_instance = instance
				
		return pareto_indicies, x_train[best_instance], y_train[best_instance]
	  

In [33]:
datasets = {}
for dataset, name in load_datasets():
	datasets[name] = dataset
	print(name)
	print(pd.DataFrame(dataset.data.targets).value_counts())
	print("\n")

ionosphere
Class
g        225
b        126
Name: count, dtype: int64


haberman
survival_status
1                  225
2                   81
Name: count, dtype: int64


iris0
class          
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


glass1
Type_of_glass
2                76
1                70
7                29
3                17
5                13
6                 9
Name: count, dtype: int64


australia
A15
0      383
1      307
Name: count, dtype: int64


thoracic
Risk1Yr
F          400
T           70
Name: count, dtype: int64


segment0
class    
BRICKFACE    30
CEMENT       30
FOLIAGE      30
GRASS        30
PATH         30
SKY          30
WINDOW       30
Name: count, dtype: int64


wine
class
2        71
1        59
3        48
Name: count, dtype: int64


ILPD
Selector
1           416
2           167
Name: count, dtype: int64


heart_disease
num
0      164
1       55
2       36
3       35
4       13
Name: count, dtype: int64

In [36]:
dataset = datasets['ionosphere']
raw_X, y = dataset.data.features, dataset.data.targets
pipeline = create_preprocessor_pipeline(dataset.variables)
pipeline.fit(raw_X, y)
X = pipeline.transform(raw_X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

splits = []
for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
	x_train, y_train = X[train_idx], y[train_idx]
	x_validation, y_validation = X[validation_idx], y[validation_idx]
	x_test, y_test = X[test_idx], y[test_idx]
	splits.append((
		x_train, y_train,
		x_validation, y_validation,
		x_test, y_test
	))



  y = column_or_1d(y, warn=True)


In [37]:
x_train, y_train, x_validation, y_validation, x_test, y_test = splits[0]

counts = pd.DataFrame(y_train).value_counts()
minority_class_label = counts.index[np.argmin(counts)][0]
minority_class_indicies = np.where(y_train == minority_class_label)[0]

minority_x = x_train[minority_class_indicies]
minority_y = y_train[minority_class_indicies]

minority_x[0]


array([-2.86998991,  0.        , -1.29042961, -0.1006609 , -1.15785826,
       -0.25184872, -1.11818968, -0.22953584, -1.01087286, -0.37533106,
       -0.84625644, -0.31377589, -2.25463227, -2.21263915, -0.52793468,
       -0.15540672,  1.00147721,  1.51922382, -0.57467942,  0.04634965,
       -0.55290335, -0.01603297, -2.25984427,  2.00758853, -0.6857975 ,
        0.1401952 ,  0.88920842, -1.69408712, -2.39702657, -1.91639814,
        1.13460972,  1.95731486, -0.66938378, -0.03096227])

In [None]:
class MLP(nn.Module):
	def __init__(self):
		super(MLP, self).__init__()
		self.linear1 = nn.Linear(input_dim, input_dim//2)
		self.relu1 = nn.ReLU()
		self.linear2 = nn.Linear(input_dim//2, input_dim//3)
		self.relu2 = nn.ReLU()
		self.linear3 = nn.Linear(input_dim//3, input_dim)

	def forward(self, x):
		x = self.linear1(x)
		x = self.relu1(x)
		x = self.linear2(x)
		x = self.relu2(x)
		x = self.linear3(x)
		return x
	
class CustomDataset(Dataset):
	def __init__(self, x_train, y_train):
		self.x_train = x_train
		self.y_train = y_train
	def __len__(self):
		return self.x_train.shape[0]
	def __getitem__(self, ind):
		x = self.x_train[ind]
		y = self.y_train[ind]
		return x, y
	
train_set = CustomDataset(
	x_train, 
	y_train
)

train_loader = DataLoader(
	train_set, 
	batch_size=215, 
	shuffle=True
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	
model = MLP().to(device)
optimizer = torch.optim.Adam(
	model.parameters(), 
	lr=0.000001
)
criterion = nn.BCEWithLogitsLoss()

model.train()
for epoch in range(50):
	losses = []
	for batch_num, input_data in enumerate(train_loader):
		optimizer.zero_grad()
		x, y = input_data
		x, y = x.to(device).float(), y.to(device)

		output = model(x)
		loss = criterion(output, y)
		loss.backward()
		losses.append(loss.item())
		optimizer.step()

In [41]:
optimized_validation_acc = []
optimized_test_acc = []
optimized_ideal_test_acc = []
baseline_test_acc = []
baseline_validaion_acc = []
validation_inclusions = []
test_inclusions = []

for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
	x_train, y_train = X[train_idx], y[train_idx]
	x_validation, y_validation = X[validation_idx], y[validation_idx]
	x_test, y_test = X[test_idx], y[test_idx]

	x_train, y_train = over_sample(
		x_train, 
		y_train
	)
	problem = GenericOptimizer(
		x_train, 
		y_train, 
		x_validation, 
		y_validation,
		[GenericOptimizer.calculate_class0_error, GenericOptimizer.calculate_class1_error],
		"sequential"
	)
	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=BinaryRandomSampling(), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True,
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size),
		save_history=False
	)	

	num_validation, num_test, baseline_validation_err, baseline_test_err, optimized_validation_err, optimized_test_err, ideal_optimized_test_err = calculate_metrics(
				x_train, 
				y_train, 
				x_validation, 
				y_validation, 
				x_test, 
				y_test, 
				result
	)

	validation_inclusions.append(num_validation)
	test_inclusions.append(num_test)
	baseline_validaion_acc.append(1-baseline_validation_err)
	baseline_test_acc.append(1-baseline_test_err)
	optimized_validation_acc.append(1-optimized_validation_err)
	optimized_test_acc.append(1-optimized_test_err)
	optimized_ideal_test_acc.append(1-ideal_optimized_test_err)

	break
val_pval = ranksums(baseline_validaion_acc, optimized_validation_acc).pvalue
test_pval = ranksums(baseline_test_acc, optimized_test_acc).pvalue
ideal_test_pval = ranksums(baseline_test_acc, optimized_ideal_test_acc).pvalue

[[0.25806451612903225, 0.22580645161290325, 0.25806451612903225, 0.25806451612903225, 0.29032258064516125, 0.22580645161290325, 0.25806451612903225, 0.3870967741935484, 0.32258064516129037, 0.3548387096774194, 0.25806451612903225, 0.25806451612903225, 0.32258064516129037, 0.29032258064516125, 0.29032258064516125, 0.22580645161290325, 0.29032258064516125, 0.32258064516129037, 0.29032258064516125, 0.3870967741935484, 0.32258064516129037, 0.29032258064516125, 0.3548387096774194, 0.32258064516129037, 0.32258064516129037, 0.25806451612903225, 0.32258064516129037, 0.25806451612903225, 0.3548387096774194, 0.25806451612903225, 0.25806451612903225, 0.29032258064516125, 0.25806451612903225, 0.3548387096774194, 0.29032258064516125, 0.29032258064516125, 0.29032258064516125, 0.29032258064516125, 0.29032258064516125, 0.25806451612903225, 0.25806451612903225, 0.22580645161290325, 0.3548387096774194, 0.32258064516129037, 0.29032258064516125, 0.32258064516129037, 0.29032258064516125, 0.3548387096774194

In [None]:
baseline_test_acc, optimized_test_acc, optimized_ideal_test_acc


([0.8181818181818182], [0.7840909090909091], [0.8636363636363636])

: 