In [33]:
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.core.sampling import Sampling
from pymoo.core.problem import Problem
from pymoo.optimize import minimize

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

import itertools 

from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Notebook-level settings. None of these three names is read by the code
# visible in this part of the notebook; they may be consumed by later cells.
display_pca = False
# NOTE(review): GenericOptimizer keeps its own `n_neighbours` class attribute
# (also 5); confirm which of the two is meant to be authoritative.
N_NEIGHBOURS = 5
# NOTE(review): presumably intended as a `random_state`; it is not passed to
# any splitter or sampler in the visible code.
RANDOM_SEED = None

In [None]:
def prepare_splits(path):
	"""Load a CSV dataset and build 31 stratified train/validation/test splits.

	The file is first parsed with the two-character delimiter ``', '``; if no
	``Class`` column can be found that way (``KeyError``), it is re-read with
	a plain ``','`` delimiter. The ``Class`` column is label-encoded and all
	remaining columns become the feature matrix.

	Each split assigns half of the rows to training and divides the other
	half evenly into a validation part and a test part.

	Returns:
		``(x, y, splits)`` where ``x`` is the feature array, ``y`` the encoded
		labels and ``splits`` a list of ``(train_idx, validation_idx,
		test_idx)`` tuples of row indices into ``x``/``y``.
	"""
	try:
		frame = pd.read_csv(path, delimiter=', ', engine='python')
		features, target = frame.drop(columns='Class'), frame['Class']
	except KeyError:
		# The header did not yield a 'Class' column: retry without the
		# trailing space in the delimiter.
		frame = pd.read_csv(path, delimiter=',')
		features, target = frame.drop(columns='Class'), frame['Class']

	x = np.array(features)
	y = LabelEncoder().fit_transform(target)

	outer_splitter = StratifiedShuffleSplit(n_splits=31, test_size=0.5)

	splits = []
	for train_idx, holdout_idx in outer_splitter.split(x, y):
		inner_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5)

		# The inner split returns positions relative to the hold-out half;
		# its first part becomes the test set, its second the validation set.
		test_pos, validation_pos = next(
			inner_splitter.split(x[holdout_idx], y[holdout_idx])
		)

		# Translate the relative positions back to row indices of x / y.
		splits.append((train_idx, holdout_idx[validation_pos], holdout_idx[test_pos]))

	return x, y, splits

def over_sample(x, y):
	"""Duplicate every minority-class row of a binary (0/1) dataset once.

	The class with fewer rows is the minority. Its rows are appended a second
	time to the end of ``x`` and ``y``; the original rows keep their order.
	On a tie, class 0 is the one that gets duplicated. If a class is absent
	altogether there is nothing to duplicate and the data is returned
	unchanged (as new arrays).

	Args:
		x: feature array, one row per example.
		y: array of 0/1 labels aligned with the rows of ``x``.

	Returns:
		``(over_sampled_x, over_sampled_y)``.
	"""
	x = np.asarray(x)
	y = np.asarray(y)

	# Count with NumPy rather than value_counts(): a class with no rows then
	# simply has a count of zero instead of raising KeyError.
	class0_count = np.count_nonzero(y == 0)
	class1_count = np.count_nonzero(y == 1)

	# The minority class is the one with the SMALLER count.
	minority_label = 0 if class0_count <= class1_count else 1
	minority_mask = y == minority_label

	over_sampled_x = np.concatenate((x, x[minority_mask]), axis=0)
	over_sampled_y = np.concatenate((y, y[minority_mask]), axis=0)

	return over_sampled_x, over_sampled_y

class BiasedBinarySampling(Sampling):
	"""Initial-population sampler whose inclusion probability depends on the label.

	Variable ``idx`` of the problem is paired with ``labels[idx]``. Variables
	paired with the more frequent label are switched on with probability
	``major_prob``, the others with probability ``minor_prob``.
	"""

	def __init__(self, labels, major_prob, minor_prob):
		"""Store the labels and decide which probability belongs to which class.

		Args:
			labels: sequence of 0/1 labels, one per decision variable.
			major_prob: inclusion probability for the class with more labels.
			minor_prob: inclusion probability for the other class.
		"""
		self.labels = labels

		class_counts = pd.DataFrame(labels).value_counts()
		class0_is_majority = class_counts[0] > class_counts[1]
		if class0_is_majority:
			self.c0_thresh, self.c1_thresh = major_prob, minor_prob
		else:
			self.c0_thresh, self.c1_thresh = minor_prob, major_prob

		super().__init__()

	def _do(self, problem, n_samples, **kwargs):
		"""Return an ``(n_samples, problem.n_var)`` boolean population."""
		shape = (n_samples, problem.n_var)
		draws = np.random.random(shape)
		population = np.zeros(shape, dtype=bool)

		# Columns whose label is neither 0 nor 1 are left all-False.
		label_thresholds = ((0, self.c0_thresh), (1, self.c1_thresh))
		for column, label in enumerate(self.labels):
			for target, threshold in label_thresholds:
				if label == target:
					population[:, column] = draws[:, column] < threshold

		return population

class InheritedSampling(Sampling):
	"""Initial-population sampler seeded from an existing set of solutions.

	The first ``num_rows_inherited`` individuals are copies of the matching
	rows of ``pareto_front`` in which every bit is flipped independently with
	probability ``mutation_prob``. All remaining individuals are uniform
	random bit strings.
	"""

	def __init__(self, pareto_front, mutation_prob, num_rows_inherited):
		"""
		Args:
			pareto_front: 2-D array of parent solutions (rows are individuals).
			mutation_prob: per-bit probability of flipping an inherited bit.
			num_rows_inherited: how many leading individuals are inherited.
		"""
		self.parent = pareto_front
		self.thresh = mutation_prob
		self.inherit_thresh = num_rows_inherited
		super().__init__()

	def _do(self, problem, n_samples, **kwargs):
		"""Return an ``(n_samples, problem.n_var)`` boolean population.

		Raises:
			IndexError: if the parent array does not cover the inherited rows
				and all ``problem.n_var`` columns.
		"""
		n_var = problem.n_var
		rands = np.random.random((n_samples, n_var))

		# Non-inherited rows are fair coin flips: bit is 1 when the draw >= 0.5.
		init_pops = rands >= 0.5

		# Row i is inherited when i < inherit_thresh, so the inherited rows
		# always form a prefix of the population.
		n_inherited = int(np.count_nonzero(np.arange(n_samples) < self.inherit_thresh))
		if n_inherited == 0 or n_var == 0:
			return init_pops

		parent = np.asarray(self.parent)
		if parent.ndim != 2 or parent.shape[0] < n_inherited or parent.shape[1] < n_var:
			raise IndexError(
				f"parent array of shape {parent.shape} cannot seed "
				f"{n_inherited} individuals with {n_var} variables"
			)

		parent_block = parent[:n_inherited, :n_var]
		flip = rands[:n_inherited] < self.thresh
		# A flipped bit becomes 0 only when the parent bit equals 1; a kept
		# bit is the parent value cast to bool.
		init_pops[:n_inherited] = np.where(
			flip, parent_block != 1, parent_block.astype(bool)
		)

		return init_pops

class GenericOptimizer(Problem):
	"""Multi-objective training-set selection problem scored with k-NN.

	Every decision vector is used as a row mask over the training set
	(``X_train[mask]``). Each objective is a callable that scores the selected
	subset; all objectives are minimised. Most objectives fit a
	``KNeighborsClassifier`` on the selected rows and return ``1 - metric``
	on the validation set.
	"""

	population_size = 100
	n_neighbours = 5
	sequential = False

	def __init__(self, X_train, y_train, X_val, y_val, objectives, exec_mode):
		"""
		Args:
			X_train, y_train: training features / labels the masks select from.
			X_val, y_val: validation features / labels used to score subsets.
			objectives: list of objective callables (the ``calculate_*``
				classmethods of this class).
			exec_mode: ``"sequential"`` evaluates objectives in a loop; any
				other value evaluates them through joblib ``Parallel``.
		"""
		self.exec_mode = exec_mode

		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]

		self.objectives = objectives

		# One binary decision variable per training row.
		super().__init__(
			n_var=self.n_instances,
			n_obj=len(objectives),
			n_constr=0,
			xl=0,
			xu=1,
			type_var=np.bool_,
		)

	def _evaluate(self, x, out, *args, **kwargs):
		"""Fill ``out["F"]`` with one column per objective for population ``x``."""
		if self.exec_mode == "sequential":
			metrics = [
				self.eval_objective((objective, x)) for objective in self.objectives
			]
		else:
			metrics = Parallel(n_jobs=-1)(
				delayed(self.eval_objective)((objective, x))
				for objective in self.objectives
			)

		out["F"] = np.column_stack(metrics)

	def eval_objective(self, pack):
		"""Evaluate one objective for every individual.

		Args:
			pack: ``(objective, x)`` tuple; packed into a single argument so
				it can be handed to ``delayed`` as one value.

		Returns:
			One objective value per row of ``x``.
		"""
		objective, x = pack
		objective_repr = repr(objective)

		# Two objectives take different arguments than the k-NN metrics, so
		# they are recognised by name and dispatched separately.
		if "calculate_num_examples" in objective_repr:
			return GenericOptimizer.calculate_num_examples(x)

		if "calculate_IR" in objective_repr:
			return [
				GenericOptimizer.calculate_IR(self.y_train[instance])
				for instance in x
			]

		return [
			objective(
				self.X_train[instance],
				self.y_train[instance],
				self.X_val,
				self.y_val,
				GenericOptimizer.n_neighbours,
			)
			for instance in x
		]

	@classmethod
	def calculate_IR(cls, y):
		"""Return the imbalance ratio (majority count / minority count) of ``y``.

		``y`` is expected to hold 0/1 labels. The result is ``>= 1`` and equals
		1 for perfectly balanced labels. When one of the two classes is absent
		the ratio is undefined and ``inf`` is returned, so such a subset is
		never preferred on this objective.
		"""
		y = np.asarray(y)
		class0_count = np.count_nonzero(y == 0)
		class1_count = np.count_nonzero(y == 1)
		minority, majority = sorted((class0_count, class1_count))
		if minority == 0:
			return float("inf")
		return majority / minority

	@classmethod
	def filter_by_class(cls, x, y, label):
		"""Return only the rows of ``x`` / ``y`` whose label equals ``label``."""
		indices = np.where(y == label)
		return x[indices], y[indices]

	@classmethod
	def _inverse_knn_score(cls, x_train, y_train, x_val, y_val, n, scorer):
		"""Fit an ``n``-neighbour k-NN on the training rows and return ``1 - score``.

		``scorer`` is called as ``scorer(y_val, y_pred)``. When fewer than
		``n`` training rows are available no classifier is fitted and the
		worst value, 1, is returned.
		"""
		if x_train.shape[0] < n:
			return 1

		optimization_knn = KNeighborsClassifier(n_neighbors=n)
		optimization_knn.fit(x_train, y_train)
		y_pred = optimization_knn.predict(x_val)
		return 1 - scorer(y_val, y_pred)

	@classmethod
	def calculate_overall_error(cls, x_train, y_train, x_val, y_val, n):
		"""Return ``1 - accuracy`` on the validation set (1 if too few rows)."""
		return cls._inverse_knn_score(x_train, y_train, x_val, y_val, n, accuracy_score)

	@classmethod
	def calculate_class0_error(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_error`` using only the class-0 training rows."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_error(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_error(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_error`` using only the class-1 training rows."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_error(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_overall_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		"""Return ``1 - f1_score`` on the validation set (1 if too few rows)."""
		return cls._inverse_knn_score(x_train, y_train, x_val, y_val, n, f1_score)

	@classmethod
	def calculate_class0_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_f1`` using only the class-0 training rows."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_inverse_f1(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_f1`` using only the class-1 training rows."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_inverse_f1(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_overall_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		"""Return ``1 - precision_score`` on the validation set (1 if too few rows)."""
		return cls._inverse_knn_score(x_train, y_train, x_val, y_val, n, precision_score)

	@classmethod
	def calculate_class0_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_precision`` using only the class-0 training rows."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_inverse_precision(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_precision(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_precision`` using only the class-1 training rows.

		Despite the name, the value returned is the INVERSE precision
		(``1 - precision``), like its class-0 counterpart.
		"""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_inverse_precision(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_overall_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		"""Return ``1 - recall_score`` on the validation set (1 if too few rows)."""
		return cls._inverse_knn_score(x_train, y_train, x_val, y_val, n, recall_score)

	@classmethod
	def calculate_class0_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_recall`` using only the class-0 training rows."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_inverse_recall(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_recall(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_recall`` using only the class-1 training rows.

		Despite the name, the value returned is the INVERSE recall
		(``1 - recall``), like its class-0 counterpart.
		"""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_inverse_recall(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_num_examples(cls, instances):
		"""Return, for every individual (row), how many variables are switched on."""
		return np.sum(instances, axis=1)

	@classmethod
	def calculate_optimal_instance(cls, x_train, y_train, x_val, y_val, result, n):
		"""Pick the Pareto-front solution with the best k-NN validation accuracy.

		Args:
			x_train, y_train: training data the solution masks select from.
			x_val, y_val: data the candidates are scored on.
			result: optimisation result exposing ``F`` (objective values) and
				``X`` (solutions), aligned row by row.
			n: number of neighbours for the k-NN classifier.

		Returns:
			``(best_instance_idx, x_selected, y_selected)`` where
			``best_instance_idx`` is the position of the winner among the
			de-duplicated non-dominated solutions. Candidates selecting fewer
			than ``n`` rows are skipped. If no candidate reaches an accuracy
			above 0, the first candidate is returned.
		"""
		fronts = np.asarray(
			NonDominatedSorting().do(result.F, only_non_dominated_front=True)
		)
		# np.unique reports positions within result.F[fronts]; map them back
		# through `fronts` so they address rows of result.X / result.F.
		_, unique_positions = np.unique(result.F[fronts], axis=0, return_index=True)
		pareto_indicies = fronts[unique_positions]
		candidates = result.X[pareto_indicies]

		best_instance_idx = 0
		best_acc = 0
		best_instance = candidates[0]
		for idx, instance in enumerate(candidates):
			x_filtered, y_filtered = x_train[instance], y_train[instance]
			if x_filtered.shape[0] < n:
				# Too few rows to query n neighbours: not a usable candidate.
				continue

			knn = KNeighborsClassifier(n_neighbors=n)
			knn.fit(x_filtered, y_filtered)
			acc = accuracy_score(y_val, knn.predict(x_val))

			if acc > best_acc:
				best_acc = acc
				best_instance_idx = idx
				best_instance = instance

		return best_instance_idx, x_train[best_instance], y_train[best_instance]
	



In [None]:
def execute(config):
	"""Run one NSGA-II optimisation described by ``config``.

	Keys read from ``config``: ``"x"``, ``"y"`` (full dataset),
	``"train_idx"``, ``"validation_idx"``, ``"test_idx"`` (row indices),
	``"over_sample"`` (bool), ``"objectives_list"``, ``"thread_evaluations"``
	(bool) and ``"sampling"`` (initial-population sampler).

	Returns:
		``(config, result)`` where ``result`` is what ``minimize`` returned.
	"""
	train_idx = config["train_idx"]
	validation_idx = config["validation_idx"]
	test_idx = config["test_idx"]

	x_train = config["x"][train_idx]
	y_train = config["y"][train_idx]
	x_validation = config["x"][validation_idx]
	y_validation = config["y"][validation_idx]
	# The test partition is materialised here but not consumed by this function.
	x_test = config["x"][test_idx]
	y_test = config["y"][test_idx]

	if config["over_sample"]:
		x_train, y_train = over_sample(x_train, y_train)

	if config["thread_evaluations"]:
		exec_mode = "parallel"
	else:
		exec_mode = "sequential"

	problem = GenericOptimizer(
		x_train,
		y_train,
		x_validation,
		y_validation,
		config["objectives_list"],
		exec_mode,
	)

	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size,
		sampling=config["sampling"],
		crossover=HUX(),
		mutation=BitflipMutation(),
		eliminate_duplicates=True,
	)

	# The generation budget reuses the population-size constant.
	termination = ('n_gen', GenericOptimizer.population_size)
	result = minimize(problem, algorithm, termination)

	return (config, result)

# Build one run configuration per (dataset, split). Each dataset is expected
# at Datasets/<name>/<name>.csv.
configs = []
for folder in os.listdir('Datasets'):
	x, y, splits = prepare_splits(os.path.join('Datasets', folder, f"{folder}.csv"))
	# prepare_splits yields (train, validation, test) tuples; unpack in that
	# same order so each partition is stored under its matching key.
	for train_idx, validation_idx, test_idx in splits: # 31 splits for Wilcoxon rank-sum
		config = {
			"x": x,
			"y": y,
			"train_idx": train_idx,
			"validation_idx": validation_idx,
			"test_idx": test_idx,
			"thread_evaluations": False,
			"over_sample": True,
			"objectives_list": [GenericOptimizer.calculate_overall_error, GenericOptimizer.calculate_overall_inverse_f1],
			"sampling": BinaryRandomSampling(),
		}
		configs.append(config)
		# NOTE(review): this break limits the run to the first split only.
		break
	# NOTE(review): this break limits the run to the first dataset only.
	break

for config in configs:
	execute(config)
# output = Parallel(n_jobs=-1)(delayed(execute)((config)) for config in configs)



0
0    191
1    154
Name: count, dtype: int64


KeyboardInterrupt: 