In [7]:
from pymoo.core.problem import Problem
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.core.sampling import Sampling
from pymoo.operators.crossover.hux import HUX
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.optimize import minimize
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os

from utils import *

# Notebook-wide switches and constants.
display_pca = False
N_NEIGHBOURS = 5      # k used by every KNeighborsClassifier in this notebook
RANDOM_SEED = None    # passed as random_state / seed; None leaves runs unseeded

# Column order of the results table.
all_headers = [
	"Label",
	"Num Features",
	"Num Examples",
	"Training Size",
	"Minority Class",
	"Base - Test C0 Acc",
	"Base - Test C1 Acc",
	"Base - Test Acc",
	"Base - Val C0 Acc",
	"Base - Val C1 Acc",
	"Base - Val Acc",
	"Optimized - Test C0 Acc",
	"Optimized - Test C1 Acc",
	"Optimized - Test Acc",
	"Num Examples for Test Best",
	"Optimized - Val C0 Acc",
	"Optimized - Val C1 Acc",
	"Optimized - Val Acc",
	"Num Examples for Val Best",
	"Baseline IR",
	"Optimized IR",
]

# Placeholder row: every results column filled with the '---' marker.
blank_row = dict.fromkeys(all_headers, '---')

In [18]:
class BiasedBinarySampling(Sampling):
	"""Binary sampler whose inclusion probability depends on each variable's class label.

	Variable ``i`` of every sampled individual is set to True with probability
	``major_prob`` when ``labels[i]`` belongs to the more frequent class and
	``minor_prob`` when it belongs to the less frequent class. Only the label
	values 0 and 1 are recognised; variables with any other label are always
	False.

	Parameters
	----------
	labels : array-like
		One label per decision variable. ``_do`` requires its length to equal
		``problem.n_var``.
	major_prob : float
		Inclusion probability for variables of the majority class.
	minor_prob : float
		Inclusion probability for variables of the minority class.
	"""

	def __init__(self, labels, major_prob, minor_prob):

		self.labels = labels

		# Count directly with numpy: a missing class simply counts as zero
		# instead of raising KeyError on a value_counts() lookup.
		flat_labels = np.asarray(labels).ravel()
		class_0_count = np.sum(flat_labels == 0)
		class_1_count = np.sum(flat_labels == 1)

		if class_0_count > class_1_count:
			self.c0_thresh = major_prob
			self.c1_thresh = minor_prob
		else:
			# Ties are treated as "class 1 is the majority".
			self.c0_thresh = minor_prob
			self.c1_thresh = major_prob

		super().__init__()

	def _do(self, problem, n_samples, **kwargs):
		"""Return an ``(n_samples, problem.n_var)`` boolean population matrix."""

		flat_labels = np.asarray(self.labels).ravel()
		rands = np.random.random((n_samples, problem.n_var))

		# One threshold per column; unrecognised labels get 0.0, which can never
		# exceed a draw from [0, 1), so those columns stay False.
		thresholds = np.select(
			[flat_labels == 0, flat_labels == 1],
			[self.c0_thresh, self.c1_thresh],
			default=0.0,
		)

		# Broadcasts the per-column thresholds across all sampled rows.
		return rands < thresholds
	
class InstanceSelectionProblem_2_Obj(Problem):
	"""Two-objective instance selection: minimise subset size and validation error.

	Each decision vector has one entry per training example and is used as a
	mask into ``X_train`` / ``y_train`` (assumes boolean entries — TODO confirm
	against the sampler and operators in use).
	"""

	def __init__(self, X_train, y_train, X_val, y_val):
		self.X_train, self.y_train = X_train, y_train
		self.X_val, self.y_val = X_val, y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]

		problem_settings = dict(
			n_var=self.n_instances,
			n_obj=2,
			n_constr=0,
			xl=0,
			xu=1,
			type_var=np.bool_,
		)
		super().__init__(**problem_settings)

	def _evaluate(self, x, out, *args, **kwargs):
		"""Fill ``out["F"]`` with one (subset size, validation error) row per candidate in ``x``."""
		# Objective 1: how many training examples each candidate keeps.
		subset_sizes = np.sum(x, axis=1)

		# Objective 2: 1 - validation accuracy, one joblib task per candidate.
		evaluate_candidate = delayed(self.train_model)
		validation_errors = Parallel(n_jobs=-1)(evaluate_candidate(candidate) for candidate in x)

		out["F"] = np.column_stack([subset_sizes, validation_errors])

	def train_model(self, instance):
		"""Return 1 - validation accuracy of a k-NN fitted on the examples selected by ``instance``.

		Returns 1 when fewer than ``N_NEIGHBOURS`` examples are selected.
		"""
		selected_X = self.X_train[instance]
		selected_y = self.y_train[instance]

		# Too few examples to fit k-NN with k = N_NEIGHBOURS: worst possible error.
		if selected_X.shape[0] < N_NEIGHBOURS:
			return 1

		knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
		knn.fit(selected_X, selected_y)

		validation_accuracy = accuracy_score(self.y_val, knn.predict(self.X_val))
		return 1-validation_accuracy

class InstanceSelectionProblem_3_Obj(Problem):
	"""Three-objective instance selection problem.

	Objectives (all minimised), one row per candidate:
	  1. number of selected training examples,
	  2. 1 - validation accuracy of a k-NN fitted on the selected examples,
	  3. imbalance ratio (larger class count / smaller class count) of the
	     selected examples.

	Each decision vector has one entry per training example and is used as a
	mask into ``X_train`` / ``y_train`` (assumes boolean entries and 0/1 encoded
	labels — TODO confirm against the caller).
	"""

	def __init__(self, X_train, y_train, X_val, y_val):
		
		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]
		
		super().__init__(
			n_var=self.n_instances,
			n_obj=3,               
			n_constr=0,            
			xl=0,                  
			xu=1,                  
			type_var=np.bool_,     
		)

	def _evaluate(self, x, out, *args, **kwargs):
		"""Fill ``out["F"]`` with one (size, validation error, imbalance ratio) row per candidate."""
		# Calculate number of examples in each instance
		f1 = np.sum(x, axis=1)

		# Calculate inverse accuracy (model fitting is the expensive part, so parallelise it)
		f2 = Parallel(n_jobs=-1)(delayed(self.train_model)(instance) for instance in x)

		# The imbalance ratio is a couple of sums per candidate; a plain loop
		# avoids the joblib dispatch overhead.
		f3 = [self.calculate_instance_IR(instance) for instance in x]
		
		out["F"] = np.column_stack([f1, f2, f3])

	def calculate_instance_IR(self, instance):
		"""Return the imbalance ratio of the training examples selected by ``instance``.

		When the selection is empty or contains a single class the ratio is
		undefined; ``float(self.n_instances)`` is returned instead. That value is
		finite and strictly larger than any ratio a two-class subset can reach
		(at most ``n_instances - 1``).
		"""
		num_selected = np.sum(instance)
		num_1_class = np.sum(self.y_train[instance])
		# Count class 0 among the *selected* examples only; unselected rows
		# must not be attributed to class 0.
		num_0_class = num_selected - num_1_class

		minority_count = min(num_0_class, num_1_class)
		if minority_count <= 0:
			return float(self.n_instances)

		IR = max(num_0_class, num_1_class) / minority_count
		return IR

	def train_model(self, instance):
		"""Return 1 - validation accuracy of a k-NN fitted on the selected examples.

		Returns 1 when fewer than ``N_NEIGHBOURS`` examples are selected.
		"""
		x_train_filtered, y_train_filtered = self.X_train[instance], self.y_train[instance]
		
		num_included_instances = x_train_filtered.shape[0]

		if num_included_instances >= N_NEIGHBOURS:
			optimization_knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
			optimization_knn.fit(x_train_filtered, y_train_filtered)

			y_pred = optimization_knn.predict(self.X_val)
			acc = accuracy_score(self.y_val, y_pred)
			return 1-acc
		else:
			# Too few examples to fit k-NN with k = N_NEIGHBOURS: worst possible error.
			return 1

class InstanceSelectionProblem_2_Obj_MinMaxAcc(Problem):
	"""Two-objective instance selection: minimise class-0 and class-1 validation error.

	Each decision vector has one entry per training example and is used as a
	mask into ``X_train`` / ``y_train`` (assumes boolean entries — TODO confirm
	against the sampler and operators in use).
	"""

	def __init__(self, X_train, y_train, X_val, y_val):
		self.X_train, self.y_train = X_train, y_train
		self.X_val, self.y_val = X_val, y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]

		problem_settings = dict(
			n_var=self.n_instances,
			n_obj=2,
			n_constr=0,
			xl=0,
			xu=1,
			type_var=np.bool_,
		)
		super().__init__(**problem_settings)

	def _evaluate(self, x, out, *args, **kwargs):
		"""Fill ``out["F"]`` with one (class-0 error, class-1 error) row per candidate in ``x``."""
		evaluate_candidate = delayed(self.train_model)
		per_candidate = Parallel(n_jobs=-1)(evaluate_candidate(candidate) for candidate in x)

		class_0_errors = [errors[0] for errors in per_candidate]
		class_1_errors = [errors[1] for errors in per_candidate]
		out["F"] = np.column_stack([class_0_errors, class_1_errors])

	def train_model(self, instance):
		"""Return ``(class-0 error, class-1 error)`` on the validation set.

		A k-NN is fitted on the training examples selected by ``instance`` and
		scored separately on the validation rows labelled 0 and 1. Returns
		``(1, 1)`` when fewer than ``N_NEIGHBOURS`` examples are selected.
		"""
		selected_X = self.X_train[instance]
		selected_y = self.y_train[instance]

		# Too few examples to fit k-NN with k = N_NEIGHBOURS: worst error on both classes.
		if selected_X.shape[0] < N_NEIGHBOURS:
			return (1, 1)

		knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
		knn.fit(selected_X, selected_y)

		per_class_error = []
		for class_label in (0, 1):
			rows = np.where(self.y_val==class_label)
			predictions = knn.predict(self.X_val[rows])
			per_class_error.append(1-accuracy_score(self.y_val[rows], predictions))

		return tuple(per_class_error)

def parse_dataset(path, name, over_sample=False):
	"""Load a CSV dataset and return its raw and split forms.

	The file is read with a ``', '`` delimiter first; if that leaves no
	``Class`` column (KeyError) it is re-read with a plain ``','`` delimiter.
	Labels are integer-encoded with ``LabelEncoder`` and the data is split by
	``split_and_scale_datasets`` using ``RANDOM_SEED``.

	When ``over_sample`` is true, every training row of the less frequent class
	(as reported by ``set_summary``) is appended to the training set once more.

	Returns
	-------
	list
		``[X, y, X_train, X_val, X_test, y_train, y_val, y_test, name]`` where
		``X`` / ``y`` are the unencoded features and ``Class`` column.
	"""

	def split_features_and_target(frame):
		# Raises KeyError when the frame has no 'Class' column.
		return frame.drop(columns='Class'), frame['Class']

	try:
		X, y = split_features_and_target(pd.read_csv(path, delimiter=', ', engine='python'))
	except KeyError:
		X, y = split_features_and_target(pd.read_csv(path, delimiter=','))

	# Generate train, validation, and test sets
	y_encoded = LabelEncoder().fit_transform(y)
	X_train, X_val, X_test, y_train, y_val, y_test = split_and_scale_datasets(
		X, y_encoded, random_state=RANDOM_SEED
	)

	if over_sample:
		class_0_count, class_1_count, _ = set_summary(y_train, "y_train", False)
		# Ties are treated as "class 0 is the minority".
		minority_label = 1 if class_0_count > class_1_count else 0
		minority_rows = np.where(y_train == minority_label)

		X_train = np.concatenate((X_train, X_train[minority_rows]), axis=0)
		y_train = np.concatenate((y_train, y_train[minority_rows]), axis=0)

	return [X, y, X_train, X_val, X_test, y_train, y_val, y_test, name]

def class_based_accuracy(model, x, y):
	"""Return ``(class_0_acc, class_1_acc, overall_accuracy)`` of ``model`` on ``(x, y)``.

	``model.predict`` is called once on all of ``x`` and the predictions are
	sliced per class, which assumes the model predicts each row independently
	of the other rows in the batch (true for the k-NN classifiers used in this
	notebook).

	Parameters
	----------
	model : object with a ``predict(x)`` method
	x : array-like of samples
	y : array-like of labels; rows labelled 0 and 1 are scored per class.

	Returns
	-------
	tuple
		Per-class accuracies for labels 0 and 1 and the accuracy over all rows.
		A per-class accuracy is ``nan`` when ``y`` contains no row of that class.
	"""
	y = np.asarray(y)

	# Single prediction pass; the per-class scores reuse these predictions.
	predictions = np.asarray(model.predict(x))

	def accuracy_for(class_label):
		rows = y == class_label
		if not rows.any():
			# No samples of this class: accuracy is undefined rather than an error.
			return float('nan')
		return accuracy_score(y[rows], predictions[rows])

	class_0_acc = accuracy_for(0)
	class_1_acc = accuracy_for(1)
	overall_accuracy = accuracy_score(y, predictions)

	return class_0_acc, class_1_acc, overall_accuracy

def assess_baseline_metrics(X_train, y_train, X_test, y_test):
	"""Fit a k-NN on the full training set and score it on the test set.

	Returns
	-------
	tuple
		``(counts, class_0_acc, class_1_acc, overall_acc)`` where ``counts`` is
		the ``value_counts()`` of ``y_train`` and the accuracies come from
		``class_based_accuracy`` on ``(X_test, y_test)``.
	"""
	# Baseline: classifier trained on every training example.
	full_data_knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS).fit(X_train, y_train)
	per_class_and_overall = class_based_accuracy(full_data_knn, X_test, y_test)

	label_counts = pd.DataFrame(y_train).value_counts()

	return (label_counts, *per_class_and_overall)

def execute_optimization(X_train, y_train, X_val, y_val, problem_defn, sampler, pop_size=100, n_gen=100):
	"""Run NSGA-II on an instance-selection problem and return pymoo's result.

	Parameters
	----------
	X_train, y_train, X_val, y_val
		Forwarded unchanged to ``problem_defn``.
	problem_defn : callable
		Called as ``problem_defn(X_train, y_train, X_val, y_val)`` to build the
		problem that is optimised.
	sampler
		Sampling operator used for the initial population.
	pop_size : int, default 100
		NSGA-II population size.
	n_gen : int, default 100
		Number of generations after which the run terminates.

	Returns
	-------
	The object returned by ``pymoo.optimize.minimize``.
	"""
	problem = problem_defn(X_train, y_train, X_val, y_val)

	# HUX crossover and bit-flip mutation are operators for binary variables.
	algorithm = NSGA2(
		pop_size=pop_size, 
		sampling=sampler, 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True)
	
	return minimize(problem, algorithm, ('n_gen', n_gen), verbose=False, seed=RANDOM_SEED)

def select_optimal_instance(X_train, y_train, X_val, y_val, result):
	"""Pick the Pareto-optimal training subset with the best validation accuracy.

	The non-dominated solutions in ``result`` are de-duplicated by objective
	vector; for each remaining solution a k-NN is fitted on the training rows
	it selects and scored on ``(X_val, y_val)``. Solutions selecting fewer than
	``N_NEIGHBOURS`` rows cannot be fitted and score 0.

	Parameters
	----------
	X_train, y_train : arrays indexed by each solution's boolean mask.
	X_val, y_val : validation data used for scoring.
	result : object exposing ``F`` (objective values) and ``X`` (decision
		vectors) with matching row order.

	Returns
	-------
	tuple
		``(best_acc, best_instance_idx, x_filtered, y_filtered)`` where
		``best_instance_idx`` is the position of the winner among the
		de-duplicated Pareto candidates and ``x_filtered`` / ``y_filtered`` are
		the training rows the winner selects. Ties keep the earliest candidate.

	Raises
	------
	ValueError
		If ``result`` contains no candidate solutions.
	"""

	fronts = np.asarray(NonDominatedSorting().do(result.F, only_non_dominated_front=True))
	_, pareto_indicies = np.unique(result.F[fronts], axis=0, return_index=True)

	# `pareto_indicies` index into result.F[fronts]; map them back to rows of
	# result.F / result.X before looking up the decision vectors.
	candidate_rows = fronts[pareto_indicies]
	if candidate_rows.size == 0:
		raise ValueError("result contains no Pareto-optimal instances to select from")

	best_instance_idx = 0
	best_acc = -1
	best_x_filtered, best_y_filtered = None, None
	for idx, row in enumerate(candidate_rows):
		instance = np.asarray(result.X[row], dtype=bool)
		x_filtered, y_filtered = X_train[instance], y_train[instance]
		if x_filtered.shape[0] < N_NEIGHBOURS: 
			# Too few examples to fit k-NN: worst accuracy, so it can never
			# beat a subset that could actually be evaluated.
			acc = 0
		else:
			knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS)
			knn.fit(x_filtered, y_filtered)
			y_pred = knn.predict(X_val)
			acc = accuracy_score(y_val, y_pred)
		
		if acc > best_acc:
			best_acc = acc
			best_instance_idx = idx
			# Remember the winning subset itself, not whichever was iterated last.
			best_x_filtered, best_y_filtered = x_filtered, y_filtered
	
	return best_acc, best_instance_idx, best_x_filtered, best_y_filtered


In [15]:
# Accumulator read by the aggregation cell; nothing in this cell appends to it.
iter_results = []

# When True only the first (iteration, sampler, population, optimization,
# dataset) combination is run — a quick end-to-end check. Set to False to run
# the full experiment grid.
RUN_FIRST_COMBINATION_ONLY = True
N_ITERATIONS = 10

problem_defn_mapping = {
	"MinMajAcc": InstanceSelectionProblem_2_Obj_MinMaxAcc,
	"2Obj": InstanceSelectionProblem_2_Obj,
	"3Obj": InstanceSelectionProblem_3_Obj
}

# Create the output directory up front so a finished optimization is never
# lost to a missing folder.
os.makedirs("Experiments", exist_ok=True)

# Same visiting order as nesting the five loops in this order.
experiment_grid = [
	(iteration, data_option, init_pop, optimization, folder)
	for iteration in range(N_ITERATIONS)
	for data_option in ['over_sample', 'regular_sample']
	for init_pop in ['rand', 'bias']
	for optimization in ['MinMajAcc', '2Obj', '3Obj']
	for folder in os.listdir('Datasets')
]

for iteration, data_option, init_pop, optimization, folder in experiment_grid:

	try:
		X, y, X_train, X_val, X_test, y_train, y_val, y_test, name = parse_dataset(
			os.path.join('Datasets', folder, f"{folder}.csv"), 
			folder, 
			over_sample=(data_option == "over_sample")
		)

		initial_population = BinaryRandomSampling() if init_pop == "rand" else BiasedBinarySampling(y_train, 0.5, 0.7)

		result = execute_optimization(
			X_train, y_train, X_val, y_val, 
			problem_defn_mapping[optimization], 
			initial_population
		)

		save_var = {
			"iter": iteration,
			"Sampler": data_option,
			"Population": init_pop,
			"Optimization": optimization,
			"Dataset": folder,
			"Result": result,
			"Data": (X_train, X_val, X_test, y_train, y_val, y_test)
		}

		with open(f"Experiments//{iteration}__{data_option}__{init_pop}__{optimization}__{folder}.pickle", "wb") as fh:
			pickle.dump(save_var, fh, protocol=pickle.HIGHEST_PROTOCOL)

	except Exception as e:
		# Best effort: report the failing combination and carry on with the next one.
		print(f"Error, {e} - {iteration} - {data_option} - {init_pop} - {optimization} - {folder}")

	if RUN_FIRST_COMBINATION_ONLY:
		break



In [20]:
# Score every saved experiment: print the best validation accuracy found among
# its Pareto-optimal training subsets.
# Sorted for a reproducible order; anything that is not an experiment pickle
# (e.g. editor or OS artefacts in the folder) is skipped.
for file in sorted(os.listdir("Experiments")):

	if not file.endswith(".pickle"):
		continue

	# NOTE: unpickling executes code embedded in the file — only load
	# experiment files produced by this notebook.
	with open(f'Experiments//{file}', 'rb') as fh:
		save_var = pickle.load(fh)

	X_train, X_val, X_test, y_train, y_val, y_test = save_var['Data']

	best_acc, best_instance_idx, x_filtered, y_filtered = select_optimal_instance(X_train, y_train, X_val, y_val, save_var['Result'])
	
	print(best_acc)

0.936046511627907


In [5]:
# Dry run: print the identifier of every experiment combination without running it.
iter_results = []
# `iteration` rather than `iter`, so the builtin is not shadowed in the kernel namespace.
for iteration in range(10):
	for data_option in ['over_sample', 'regular_sample']:
		for init_pop in ['rand', 'bias']:
			for optimization in ['MinMajAcc']:#, '2Obj', '3Obj']:
				for folder in os.listdir('Datasets'):
					over_sample = data_option != "regular_sample"
					run_id = f"{data_option}-{init_pop}-{optimization}-{folder}"
					print(f"Iter {iteration} - {run_id}")
				

Iter 0 - over_sample-rand-MinMajAcc-australian
Iter 0 - over_sample-rand-MinMajAcc-bupa
Iter 0 - over_sample-rand-MinMajAcc-glass1
Iter 0 - over_sample-rand-MinMajAcc-magic
Iter 0 - over_sample-rand-MinMajAcc-phoneme
Iter 0 - over_sample-rand-MinMajAcc-pima
Iter 0 - over_sample-rand-MinMajAcc-segment0
Iter 0 - over_sample-rand-MinMajAcc-sonar
Iter 0 - over_sample-rand-MinMajAcc-vehicle0
Iter 0 - over_sample-rand-MinMajAcc-yeast1
Iter 0 - over_sample-bias-MinMajAcc-australian
Iter 0 - over_sample-bias-MinMajAcc-bupa
Iter 0 - over_sample-bias-MinMajAcc-glass1
Iter 0 - over_sample-bias-MinMajAcc-magic
Iter 0 - over_sample-bias-MinMajAcc-phoneme
Iter 0 - over_sample-bias-MinMajAcc-pima
Iter 0 - over_sample-bias-MinMajAcc-segment0
Iter 0 - over_sample-bias-MinMajAcc-sonar
Iter 0 - over_sample-bias-MinMajAcc-vehicle0
Iter 0 - over_sample-bias-MinMajAcc-yeast1
Iter 0 - regular_sample-rand-MinMajAcc-australian
Iter 0 - regular_sample-rand-MinMajAcc-bupa
Iter 0 - regular_sample-rand-MinMajAcc-g

In [9]:
# Average every numeric metric per 'Label' across the collected result rows
# and write the table to output.csv.
summations = {}
sample_counts = {}
for result in iter_results:

	label = result['Label']
	label_sums = summations.setdefault(label, {})
	label_counts = sample_counts.setdefault(label, {})

	for key, value in result.items():
		if key == 'Label':
			continue
		# Accept numpy scalars as well as Python numbers; non-numeric
		# placeholders such as '---' are left out of the average.
		if isinstance(value, (int, float, np.integer, np.floating)):
			label_sums[key] = label_sums.get(key, 0) + value
			label_counts[key] = label_counts.get(key, 0) + 1

averages = []
for label in summations:
	temp = {}
	for key in summations[label]:
		# Divide by the number of rows that actually contributed a value,
		# not by an assumed run count.
		temp[key] = summations[label][key] / sample_counts[label][key]

	temp["Label"] = label
	averages.append(temp)

pd.DataFrame(averages).to_csv("output.csv")