In [5]:
from pymoo.core.problem import Problem
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.core.sampling import Sampling
from pymoo.operators.crossover.hux import HUX
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.optimize import minimize
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

# from tqdm import tqdm

from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from utils import *

# Notebook-wide configuration constants.
# display_pca is not referenced in the cells visible here — presumably toggles a PCA plot elsewhere; TODO confirm.
display_pca = False
# Number of neighbours passed to KNeighborsClassifier in the evaluation cells below.
N_NEIGHBOURS = 5
# Not referenced in the cells visible here. NOTE(review): looks like an intended random seed that is
# never applied, so the stochastic runs below are not reproducible — confirm.
RANDOM_SEED = None

In [2]:
class BiasedBinarySampling(Sampling):
	"""Binary initial-population sampler whose inclusion probability depends on the class label.

	Each decision variable corresponds to one entry of ``labels``. A variable is switched on
	when a uniform random draw falls below the threshold assigned to that entry's class:
	the more frequent of classes 0/1 uses ``major_prob``, the other uses ``minor_prob``
	(on a tie, class 0 gets ``minor_prob`` and class 1 gets ``major_prob``).

	Parameters
	----------
	labels : array-like
		Class label (0 or 1) of every training instance, in variable order.
	major_prob : float
		Inclusion probability for instances of the majority class.
	minor_prob : float
		Inclusion probability for instances of the minority class.
	"""

	def __init__(self, labels, major_prob, minor_prob):

		self.labels = labels

		# Count with numpy instead of looking labels up in a pandas value_counts() index,
		# so a class that is absent from `labels` counts as 0 rather than raising.
		flat_labels = np.asarray(labels).ravel()
		n_class0 = np.count_nonzero(flat_labels == 0)
		n_class1 = np.count_nonzero(flat_labels == 1)

		if n_class0 > n_class1:
			self.c0_thresh = major_prob
			self.c1_thresh = minor_prob
		else:
			self.c0_thresh = minor_prob
			self.c1_thresh = major_prob

		super().__init__()

	def _do(self, problem, n_samples, **kwargs):
		"""Return a boolean ``(n_samples, problem.n_var)`` population.

		Columns whose label is neither 0 nor 1, and columns beyond ``len(labels)``, stay False.
		Raises ``ValueError`` if there are more labels than decision variables.
		"""
		rands = np.random.random((n_samples, problem.n_var))

		flat_labels = np.asarray(self.labels).ravel()
		# One threshold per column. A threshold of 0 can never be undercut by a draw
		# from [0, 1), which keeps unlabelled columns switched off.
		thresholds = np.zeros(problem.n_var)
		thresholds[:flat_labels.size] = np.select(
			[flat_labels == 0, flat_labels == 1],
			[self.c0_thresh, self.c1_thresh],
			default=0.0,
		)

		# Broadcasting compares every row of draws against the per-column thresholds.
		return rands < thresholds

class InheritedSampling(Sampling):
	"""Initial-population sampler seeded from the solutions of a previous optimisation.

	The first ``num_rows_inherited`` individuals are copies of the corresponding rows of
	``pareto_front`` with every bit flipped independently with probability ``mutation_prob``.
	All remaining individuals are uniformly random bit strings.

	Parameters
	----------
	pareto_front : 2-D array-like
		Parent solutions, one per row, with at least ``problem.n_var`` columns.
	mutation_prob : float
		Per-bit probability of flipping an inherited bit.
	num_rows_inherited : int
		Number of leading individuals that inherit from ``pareto_front``.
	"""

	def __init__(self, pareto_front, mutation_prob, num_rows_inherited):

		self.parent = pareto_front
		self.thresh = mutation_prob
		self.inherit_thresh = num_rows_inherited
		super().__init__()

	def _do(self, problem, n_samples, **kwargs):
		"""Return a boolean ``(n_samples, problem.n_var)`` population."""
		rands = np.random.random((n_samples, problem.n_var))

		# Start from the purely random fill (bit set when the draw is >= 0.5) and
		# overwrite the leading rows with the (possibly mutated) parents.
		init_pops = rands >= 0.5

		parent = np.asarray(self.parent)
		# Rows i with i < inherit_thresh inherit; never request more rows than the
		# parent array actually provides (the extra rows simply stay random).
		n_requested = int(np.count_nonzero(np.arange(n_samples) < self.inherit_thresh))
		n_inherit = min(n_requested, parent.shape[0])

		if n_inherit > 0:
			parent_rows = parent[:n_inherit, :problem.n_var]
			flip = rands[:n_inherit] < self.thresh
			init_pops[:n_inherit] = np.where(flip, parent_rows != 1, parent_rows.astype(bool))

		return init_pops
	

In [6]:
class GenericOptimizer(Problem):
	"""Instance-selection problem with a configurable list of objectives.

	Every decision variable decides whether one training instance is kept; an individual
	is used as a mask over ``X_train`` / ``y_train``. All objectives are minimised.

	Objectives are callables with the signature
	``objective(x_train, y_train, x_val, y_val, n)`` and are evaluated once per individual.
	Two objectives are special-cased by name: ``calculate_num_examples`` (evaluated on the
	whole population at once) and ``calculate_IR`` (evaluated on the selected labels only).
	"""
	population_size = 100
	n_neighbours = 5
	# Not read anywhere in this class; execution mode is chosen through `exec_mode`.
	sequential = False

	def __init__(self, X_train, y_train, X_val, y_val, objectives, exec_mode):
		"""Store the data splits and register one binary variable per training instance.

		``exec_mode == "sequential"`` evaluates objectives one after another; any other
		value evaluates them through joblib with ``n_jobs=-1``.
		"""
		self.exec_mode = exec_mode

		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]

		self.objectives = objectives

		super().__init__(
			n_var=self.n_instances,
			n_obj=len(objectives),
			n_constr=0,
			xl=0,
			xu=1,
			type_var=np.bool_,
		)

	def _evaluate(self, x, out, *args, **kwargs):
		"""Evaluate every objective on population ``x`` and store the result in ``out["F"]``.

		Each objective yields one value per individual; the values are stacked column-wise
		so that ``out["F"]`` has one row per individual and one column per objective.
		"""
		if self.exec_mode == "sequential":
			metrics = [self.eval_objective((objective, x)) for objective in self.objectives]
		else:
			metrics = Parallel(n_jobs=-1)(delayed(self.eval_objective)((objective, x)) for objective in self.objectives)

		out["F"] = np.column_stack(metrics)

	def eval_objective(self, pack):
		"""Evaluate one objective for the whole population.

		``pack`` is an ``(objective, x)`` tuple (a single argument so the call can be handed
		to joblib). Returns one value per row of ``x``.
		"""
		objective, x = pack
		# Dispatch on the callable's repr, which contains its qualified name.
		objective_repr = repr(objective)

		if "calculate_num_examples" in objective_repr:
			return GenericOptimizer.calculate_num_examples(x)

		if "calculate_IR" in objective_repr:
			return [GenericOptimizer.calculate_IR(self.y_train[instance]) for instance in x]

		return [
			objective(
				self.X_train[instance],
				self.y_train[instance],
				self.X_val,
				self.y_val,
				GenericOptimizer.n_neighbours
			)
			for instance in x
		]

	@classmethod
	def calculate_IR(cls, y):
		"""Imbalance ratio of the binary labels ``y``: majority count / minority count.

		The value is >= 1 and equals 1 for perfectly balanced labels. When class 0 or
		class 1 is absent the ratio is undefined and ``inf`` is returned, so that such a
		selection is the worst possible under minimisation.
		"""
		labels = np.asarray(y).ravel()
		n_class0 = np.count_nonzero(labels == 0)
		n_class1 = np.count_nonzero(labels == 1)

		minority, majority = sorted((n_class0, n_class1))
		if minority == 0:
			return float("inf")
		return majority / minority

	@classmethod
	def filter_by_class(cls, x, y, label):
		"""Return the rows of ``x`` and ``y`` whose label equals ``label``."""
		# Use the labels passed in (not a notebook-level global) to build the row mask.
		mask = np.asarray(y).ravel() == label
		return x[mask], y[mask]

	@classmethod
	def _knn_loss(cls, x_train, y_train, x_val, y_val, n, score_fn):
		"""Fit an ``n``-neighbour KNN on the given training rows and return ``1 - score``.

		``score_fn(y_true, y_pred)`` is a scikit-learn style metric evaluated on the
		validation split. With fewer than ``n`` training rows the classifier cannot be
		queried, so the worst loss (1) is returned.
		"""
		if x_train.shape[0] < n:
			return 1

		optimization_knn = KNeighborsClassifier(n_neighbors=n)
		optimization_knn.fit(x_train, y_train)
		y_pred = optimization_knn.predict(x_val)
		return 1 - score_fn(y_val, y_pred)

	@classmethod
	def calculate_overall_error(cls, x_train, y_train, x_val, y_val, n):
		"""Validation error rate (1 - accuracy) of a KNN fitted on the given training rows."""
		return cls._knn_loss(x_train, y_train, x_val, y_val, n, accuracy_score)

	@classmethod
	def calculate_class0_error(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_error`` with the training rows restricted to class 0."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_error(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_error(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_error`` with the training rows restricted to class 1."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_error(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_overall_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		"""1 - F1 score on the validation split of a KNN fitted on the given training rows."""
		return cls._knn_loss(x_train, y_train, x_val, y_val, n, f1_score)

	@classmethod
	def calculate_class0_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_f1`` with the training rows restricted to class 0."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_inverse_f1(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_f1`` with the training rows restricted to class 1."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_inverse_f1(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_overall_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		"""1 - precision on the validation split of a KNN fitted on the given training rows."""
		return cls._knn_loss(x_train, y_train, x_val, y_val, n, precision_score)

	@classmethod
	def calculate_class0_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_precision`` with the training rows restricted to class 0."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_inverse_precision(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_precision(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_precision`` with the training rows restricted to class 1.

		Despite the name this returns 1 - precision, like its class-0 counterpart.
		"""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_inverse_precision(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_overall_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		"""1 - recall on the validation split of a KNN fitted on the given training rows."""
		return cls._knn_loss(x_train, y_train, x_val, y_val, n, recall_score)

	@classmethod
	def calculate_class0_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_recall`` with the training rows restricted to class 0."""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 0)
		return cls.calculate_overall_inverse_recall(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_class1_recall(cls, x_train, y_train, x_val, y_val, n):
		"""``calculate_overall_inverse_recall`` with the training rows restricted to class 1.

		Despite the name this returns 1 - recall, like its class-0 counterpart.
		"""
		class_x_train, class_y_train = cls.filter_by_class(x_train, y_train, 1)
		return cls.calculate_overall_inverse_recall(class_x_train, class_y_train, x_val, y_val, n)

	@classmethod
	def calculate_num_examples(cls, instances):
		"""Number of selected instances per individual (row sums of the population matrix)."""
		return np.sum(instances, axis=1)

	@classmethod
	def calculate_optimal_instance(cls, x_train, y_train, x_val, y_val, result, n):
		"""Pick the non-dominated solution whose KNN has the best validation accuracy.

		Returns ``(best_instance_idx, x_selected, y_selected)`` where ``best_instance_idx``
		is the position of the winner among the de-duplicated non-dominated solutions.
		Candidates that select fewer than ``n`` instances are skipped. If no candidate is
		usable, an empty selection is returned with index 0.
		"""
		fronts = np.asarray(NonDominatedSorting().do(result.F, only_non_dominated_front=True))
		_, pareto_indicies = np.unique(result.F[fronts], axis=0, return_index=True)
		# `pareto_indicies` are positions inside result.F[fronts]; translate them back to
		# rows of result.F / result.X before looking up the corresponding solutions.
		candidate_rows = fronts[pareto_indicies]

		best_instance_idx = 0
		# Start below any reachable accuracy so the first usable candidate is always taken.
		best_acc = -1.0
		best_instance = np.zeros(x_train.shape[0], dtype=bool)
		for idx, instance in enumerate(result.X[candidate_rows]):
			x_filtered, y_filtered = x_train[instance], y_train[instance]
			if x_filtered.shape[0] < n:
				continue

			knn = KNeighborsClassifier(n_neighbors=n)
			knn.fit(x_filtered, y_filtered)
			y_pred = knn.predict(x_val)
			acc = accuracy_score(y_val, y_pred)

			if acc > best_acc:
				best_acc = acc
				best_instance_idx = idx
				best_instance = instance

		return best_instance_idx, x_train[best_instance], y_train[best_instance]

# Smoke run of GenericOptimizer: optimise on one dataset folder and print the test error
# of the selected training subset. The trailing `break` stops after the first directory
# entry returned by os.listdir, so only a single dataset is processed.
# NOTE(review): parse_dataset is not defined in this notebook — presumably provided by
# `from utils import *`; confirm.
for folder in os.listdir('Datasets'):
	X, y, X_train, X_val, X_test, y_train, y_val, y_test, name = parse_dataset(
		os.path.join('Datasets', folder, f"{folder}.csv"), 
		folder
	)
	
	# Objectives to minimise; each is evaluated per individual by GenericOptimizer.eval_objective.
	objectives = [
		GenericOptimizer.calculate_overall_error,
		# GenericOptimizer.calculate_num_examples,
		GenericOptimizer.calculate_overall_inverse_f1
	]

	# "sequential" makes _evaluate loop over the objectives instead of using joblib.
	problem = GenericOptimizer(
		X_train, 
		y_train, 
		X_val, 
		y_val,
		objectives,
		"sequential"
	)

	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=BinaryRandomSampling(), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True)
	
	# The number of generations reuses population_size (100), i.e. 100 generations of 100 individuals.
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size), 
		# verbose=True
	)

	# Choose the solution with the best validation accuracy among the returned solutions.
	best_instance_idx, x_train_best_instance, y_train_best_instance = GenericOptimizer.calculate_optimal_instance(X_train, y_train, X_val, y_val, result, GenericOptimizer.n_neighbours)

	# Report 1 - accuracy on the held-out test split for a KNN fitted on the selected subset.
	print(GenericOptimizer.calculate_overall_error(x_train_best_instance, y_train_best_instance, X_test, y_test, GenericOptimizer.n_neighbours))
	break


0.11560693641618502


In [5]:
def calculate_average_optimized_metrics(data):
	"""Average baseline and optimised metrics over all saved repetitions of one run.

	``data`` is a ``(runs, run)`` tuple (a single argument so the call can be handed to
	joblib). ``run`` is a key of ``runs`` made of ``"__"``-separated fields, read here as
	data option, initial population, optimisation name and dataset (with a ``.pickle``
	suffix). ``runs[run]`` is an iterable of saved dictionaries holding ``'Data'`` (the six
	train/val/test arrays) and ``'Result'`` (the optimisation result).

	Returns one table row: the four identifying fields, the five averaged metrics and the
	difference between optimised and baseline test accuracy.
	"""
	runs, run = data

	name_parts = run.split("__")
	data_option = name_parts[0]
	init_pop = name_parts[1]
	optimization = name_parts[2]
	dataset = name_parts[3].replace(".pickle", "")

	metric_names = [
		"Baseline Test Accuracy",
		"Baseline IR",
		"Optimized Test Accuracy",
		"Optimized IR",
		"Reduction Rate %",
	]

	per_repetition = []
	for saved in runs[run]:
		X_train, X_val, X_test, y_train, y_val, y_test = saved['Data']

		# Baseline: metrics of the untouched training set.
		baseline_counts, _, _, baseline_acc = assess_baseline_metrics(X_train, y_train, X_test, y_test)
		baseline_ir = max(baseline_counts) / min(baseline_counts)

		# Optimised: the subset picked from the saved optimisation result.
		_, _, x_selected, y_selected = select_optimal_instance(X_train, y_train, X_val, y_val, saved['Result'])
		selected_counts = pd.DataFrame(y_selected).value_counts()
		optimized_ir = max(selected_counts) / min(selected_counts)

		knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
		knn.fit(x_selected, y_selected)
		optimized_acc = accuracy_score(y_test, knn.predict(X_test))

		# Share of training instances removed by the selection, in percent.
		n_original = X_train.shape[0]
		reduction_pct = (n_original - x_selected.shape[0]) / n_original*100

		per_repetition.append([baseline_acc, baseline_ir, optimized_acc, optimized_ir, reduction_pct])

	mean_metrics = pd.DataFrame(per_repetition, columns=metric_names).mean(numeric_only=True)

	row = [dataset, data_option, init_pop, optimization]
	row.extend(mean_metrics[metric] for metric in metric_names)
	row.append(mean_metrics['Optimized Test Accuracy'] - mean_metrics['Baseline Test Accuracy'])
	return row

# Column names of the summary table; the order matches the row returned by
# calculate_average_optimized_metrics.
header = [
	"Dataset",
	"Over Sample?",
	"Biased Initialization?",
	"Optimization",
	"Baseline Test Accuracy",
	"Baseline IR",
	"Optimized Test Accuracy",
	"Optimized IR",
	"Reduction Rate %",
	"Optimized Test Acc Diff"
]

# for run in runs:
# 	rows = [calculate_average_optimized_metrics((runs, run))]
	
# NOTE(review): `runs` is not defined in any cell visible here — presumably a dict of loaded
# experiment pickles keyed by run name that was left in the kernel; this cell fails on a fresh kernel.
# One joblib task per run key; every task returns a single table row.
rows = Parallel(n_jobs=-1)(delayed(calculate_average_optimized_metrics)((runs, run)) for run in runs)
table = pd.DataFrame(rows, columns=header)
table.to_csv("final_results_FIXED.csv", index=False)



In [None]:
# Multi-step experiment: for every repetition / sampling option / initial population /
# dataset, run a 3-objective optimisation, seed a second 2-objective optimisation with
# the per-objective best solutions of the first one, and pickle the final result.
#
# NOTE(review): `tqdm` is used here although its import is commented out in the first
# cell — presumably it arrives through `from utils import *`; confirm.
n_repetitions = 10
data_options = ['over_sample', 'regular_sample']
init_pops = ['rand', 'bias']
dataset_folders = os.listdir('Datasets')
optimization = "MultiStep"

# Size the progress bar from the actual grid instead of a hard-coded total.
total_runs = n_repetitions * len(data_options) * len(init_pops) * len(dataset_folders)

with tqdm(total=total_runs) as pbar:
	# `iteration` rather than `iter`, so the builtin is not shadowed for the rest of the session.
	for iteration in range(n_repetitions):
		for data_option in data_options:
			for init_pop in init_pops:
				for folder in dataset_folders:
					save_name = f"Experiments//{iteration}__{data_option}__{init_pop}__{optimization}__{folder}.pickle"

					X, y, X_train, X_val, X_test, y_train, y_val, y_test, name = parse_dataset(
						os.path.join('Datasets', folder, f"{folder}.csv"),
						folder,
						over_sample=(data_option == "over_sample")
					)

					initial_population = BinaryRandomSampling() if init_pop == "rand" else BiasedBinarySampling(y_train, 0.5, 0.7)
					result = execute_optimization(
						X_train, y_train, X_val, y_val,
						InstanceSelectionProblem_3_Obj,
						initial_population,
						pop_size=100
					)

					fronts = np.asarray(NonDominatedSorting().do(result.F, only_non_dominated_front=True))
					_, pareto_indicies = np.unique(result.F[fronts], axis=0, return_index=True)
					# `pareto_indicies` are positions inside result.F[fronts]; map them back to
					# rows of result.F / result.X before using them as indices.
					pareto_rows = fronts[pareto_indicies]

					# For every objective, the row of result.X holding its best (minimal) value.
					min_indicies = pareto_rows[np.argmin(result.F[pareto_rows], axis=0)]
					result_final = execute_optimization(
						X_train, y_train, X_val, y_val,
						InstanceSelectionProblem_2_Obj_MinMaxAcc,
						InheritedSampling(result.X[min_indicies], mutation_prob=0.0, num_rows_inherited=len(min_indicies))
					)

					save_var = {
						"iter": iteration,
						"Sampler": data_option,
						"Population": init_pop,
						"Optimization": optimization,
						"Dataset": folder,
						"Result": result_final,
						"Data": (X_train, X_val, X_test, y_train, y_val, y_test)
					}

					with open(save_name, "wb") as fh:
						pickle.dump(save_var, fh, protocol=pickle.HIGHEST_PROTOCOL)

					# Tick only after the run has actually been saved.
					pbar.update(1)

 18%|█▊        | 74/400 [3:17:01<14:27:59, 159.75s/it]


KeyboardInterrupt: 

In [None]:

def execute_multi_step(folder):
	"""Run the two-stage ("MultiStep") optimisation on one dataset and return a result row.

	Stage 1 optimises ``InstanceSelectionProblem_3_Obj`` from a random binary population.
	Stage 2 optimises ``InstanceSelectionProblem_2_Obj_MinMaxAcc`` starting from the
	stage-1 solutions that are best on each individual objective.

	Parameters
	----------
	folder : str
		Name of the dataset directory inside ``Datasets`` (the CSV shares that name).

	Returns
	-------
	list
		``[dataset, data option, initial population, optimisation, baseline test accuracy,
		baseline IR, optimised test accuracy, optimised IR, reduction rate %, accuracy diff]``,
		the same column order as the summary table built elsewhere in this notebook.
	"""
	X, y, X_train, X_val, X_test, y_train, y_val, y_test, name = parse_dataset(
		os.path.join('Datasets', folder, f"{folder}.csv"),
		folder,
		over_sample=False
	)

	# Baseline metrics of the untouched training set.
	counts, _, _, baseline_test_acc = assess_baseline_metrics(X_train, y_train, X_test, y_test)
	baseline_IR = max(counts) / min(counts)

	# Stage 1: three-objective optimisation from a random start.
	result = execute_optimization(
		X_train, y_train, X_val, y_val,
		InstanceSelectionProblem_3_Obj,
		BinaryRandomSampling()
	)
	fronts = np.asarray(NonDominatedSorting().do(result.F, only_non_dominated_front=True))
	_, pareto_indicies = np.unique(result.F[fronts], axis=0, return_index=True)
	# `pareto_indicies` are positions inside result.F[fronts]; map them back to rows of
	# result.F / result.X before using them as indices.
	pareto_rows = fronts[pareto_indicies]

	# Stage 2: seed with the stage-1 solution that is best on each objective.
	min_indicies = pareto_rows[np.argmin(result.F[pareto_rows], axis=0)]
	result_final = execute_optimization(
		X_train, y_train, X_val, y_val,
		InstanceSelectionProblem_2_Obj_MinMaxAcc,
		InheritedSampling(result.X[min_indicies], mutation_prob=0.0, num_rows_inherited=len(min_indicies))
	)
	_, _, x_filtered, y_filtered = select_optimal_instance(X_train, y_train, X_val, y_val, result_final)

	optimization_knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
	optimization_knn.fit(x_filtered, y_filtered)
	y_pred = optimization_knn.predict(X_test)
	optimized_test_acc = accuracy_score(y_test, y_pred)

	optimized_counts = pd.DataFrame(y_filtered).value_counts()
	optimized_IR = max(optimized_counts) / min(optimized_counts)

	reduction_rate = (X_train.shape[0] - x_filtered.shape[0]) / X_train.shape[0]*100

	# The identifying fields describe the configuration hard-wired above:
	# no over-sampling and a random (unbiased) initial population.
	row = [
		folder,
		"regular_sample",
		"rand",
		"MultiStep",
		baseline_test_acc,
		baseline_IR,
		optimized_test_acc,
		optimized_IR,
		reduction_rate,
		optimized_test_acc - baseline_test_acc
	]

	return row

# Column labels for the three test accuracies returned by do_parallel; only used by the
# commented-out lines below.
names = ['australia-3Obj-Biased-regular-Sample', 'Round 2', 'MinMaj cold start']
# NOTE(review): do_parallel is defined in a later cell, so this cell only works after that
# cell has been executed. The `break` limits the run to the first entry of os.listdir.
for folder in os.listdir('Datasets'):
	do_parallel(folder)
	# results = Parallel(n_jobs=-1)(delayed(do_parallel)(folder) for _ in range(10))
	# pd.DataFrame(results, columns=names).to_csv("save_1.csv", index=False)
	break

In [9]:
def _knn_test_accuracy(x_fit, y_fit, X_test, y_test):
	"""Test accuracy of a KNN (N_NEIGHBOURS neighbours) fitted on the given training subset."""
	knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
	knn.fit(x_fit, y_fit)
	return accuracy_score(y_test, knn.predict(X_test))


def do_parallel(folder):
	"""Compare three optimisation strategies on one dataset.

	1. "Round 1": ``InstanceSelectionProblem_2_Obj`` from a class-biased initial population.
	2. "Round 2": ``InstanceSelectionProblem_2_Obj_MinMaxAcc`` seeded with the round-1
	   solutions that are best on each objective.
	3. "Cold start": ``InstanceSelectionProblem_2_Obj_MinMaxAcc`` from a random population
	   of size 200.

	The best validation score reported by ``select_optimal_instance`` is printed for each
	strategy.

	Parameters
	----------
	folder : str
		Name of the dataset directory inside ``Datasets`` (the CSV shares that name).

	Returns
	-------
	list
		``[r1_test_acc, r2_test_acc, r3_test_acc]`` — KNN test accuracy of the subset
		selected by each strategy, in the order above.
	"""
	X, y, X_train, X_val, X_test, y_train, y_val, y_test, name = parse_dataset(
		os.path.join('Datasets', folder, f"{folder}.csv"),
		folder,
		over_sample=False
	)

	# --- Round 1: biased initial population --------------------------------------
	result = execute_optimization(
		X_train, y_train, X_val, y_val,
		InstanceSelectionProblem_2_Obj,
		BiasedBinarySampling(y_train, 0.5, 0.7)
	)
	best_acc_1, best_instance_idx, x_filtered, y_filtered = select_optimal_instance(X_train, y_train, X_val, y_val, result)
	print(f"Round 1: {best_acc_1}")
	r1_test_acc = _knn_test_accuracy(x_filtered, y_filtered, X_test, y_test)

	# --- Round 2: warm start from the per-objective best round-1 solutions -------
	fronts = np.asarray(NonDominatedSorting().do(result.F, only_non_dominated_front=True))
	# argmin yields positions inside result.F[fronts]; translate them back to rows of
	# result.X before selecting the parent solutions.
	min_indicies = fronts[np.argmin(result.F[fronts], axis=0)]

	result_final = execute_optimization(
		X_train, y_train, X_val, y_val,
		InstanceSelectionProblem_2_Obj_MinMaxAcc,
		InheritedSampling(result.X[min_indicies], mutation_prob=0.0, num_rows_inherited=len(min_indicies))
	)
	best_acc_2, best_instance_idx, x_filtered, y_filtered = select_optimal_instance(X_train, y_train, X_val, y_val, result_final)
	print(f"Round 2: {best_acc_2}")
	r2_test_acc = _knn_test_accuracy(x_filtered, y_filtered, X_test, y_test)

	# --- Cold start: same problem as round 2, random population of 200 -----------
	result = execute_optimization(
		X_train, y_train, X_val, y_val,
		InstanceSelectionProblem_2_Obj_MinMaxAcc,
		BinaryRandomSampling(),
		pop_size=200
	)
	best_acc_3, best_instance_idx, x_filtered, y_filtered = select_optimal_instance(X_train, y_train, X_val, y_val, result)
	print(f"Cold start MinMajAcc: {best_acc_3}\n")
	r3_test_acc = _knn_test_accuracy(x_filtered, y_filtered, X_test, y_test)

	return [r1_test_acc, r2_test_acc, r3_test_acc]

# Column labels for the three test accuracies returned by do_parallel.
names = ['australia-3Obj-Biased-regular-Sample', 'Round 2', 'MinMaj cold start']
# Repeat do_parallel ten times in parallel for one dataset and save one row per repetition.
# The `break` limits the run to the first entry returned by os.listdir.
for folder in os.listdir('Datasets'):
	results = Parallel(n_jobs=-1)(delayed(do_parallel)(folder) for _ in range(10))
	pd.DataFrame(results, columns=names).to_csv("save_2.csv", index=False)
	break