In [None]:
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize

from generic_optimizer import GenericOptimizer, prepare_splits
from novel_sampling import BiasedBinarySampling, over_sample

from generic_optimizer import *

from pymoo.core.mutation import Mutation

from scipy.stats import ranksums

from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os
import re

from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
import torch.nn as nn

from sklearn.preprocessing import LabelEncoder

In [None]:
def get_name(objective):

	pattern = r'\.([a-zA-Z_][a-zA-Z0-9_]*)\s'
	return re.search(pattern, str(objective)).group(1)

def step_1(folder):
	x, y, splits = prepare_splits(os.path.join('../Datasets', folder, f"{folder}.csv"))

	train_idx, test_idx, validation_idx = None, None, None
	for train_idx, test_idx, validation_idx in splits:
		train_idx, test_idx, validation_idx = train_idx, test_idx, validation_idx
		break

	x_train, y_train = x[train_idx], y[train_idx]
	x_validation, y_validation = x[validation_idx], y[validation_idx]
	x_test, y_test = x[test_idx], y[test_idx]

	splits = {
		"X Train": x_train,
		"Y Train": y_train,
		"X Validation": x_validation,
		"Y Validation": y_validation,
		"X Test": x_test,
		"Y Test": y_test,
	}

	return splits

In [2]:
def step_2(x_train, y_train, x_validation, y_validation, primary_objective, secondary_objectives):
	def func(objectives):

		objectives.append(primary_objective)
		problem = GenericOptimizer(
			x_train, 
			y_train, 
			x_validation, 
			y_validation,
			objectives,
			"Sequential"
		)

		initial_population = BinaryRandomSampling()

		algorithm = NSGA2(
			pop_size=GenericOptimizer.population_size, 
			sampling=initial_population, 
			crossover=HUX(), 
			mutation=BitflipMutation(), 
			eliminate_duplicates=True)

		result = minimize(
			problem, 
			algorithm, 
			('n_gen', GenericOptimizer.population_size)
		)

		return result.X
	
	return Parallel(n_jobs=-1)(delayed(func)(objectives) for objectives in secondary_objectives)

In [3]:
def step_3(populations):
	all_outcomes = []
	for population in populations:
		for individual in population:
			individual = list(individual)
			if individual not in all_outcomes:
				all_outcomes.append(individual)

	return np.array(all_outcomes)

In [4]:
def step_4(all_outcomes):
	x = []
	y = []

	while len(x) < 500000:
		
		for y_true in all_outcomes:
			x_noised = []
			for idx, probability in enumerate(np.random.uniform(0.1, 1.0, y_true.shape[0])):
				if probability < 0.85:
					x_noised.append(y_true[idx])				
				else:
					x_noised.append(0 if y_true[idx] == 1 else 1)

			x.append(np.array(x_noised, dtype=np.float32))
			y.append(np.array(y_true, dtype=np.float32))

	return np.array(x), np.array(y)

In [5]:
def step_5(X_train, y_train):
	class CustomDataset(Dataset):
		def __init__(self, x_train, y_train):
			self.x_train = x_train
			self.y_train = y_train
		def __len__(self):
			return self.x_train.shape[0]
		def __getitem__(self, ind):
			x = self.x_train[ind]
			y = self.y_train[ind]
			return x, y

	train_set = CustomDataset(X_train, y_train)
	input_dim = X_train.shape[1]
	batch_size = 215
	train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	class MLP(nn.Module):
		def __init__(self):
			super(MLP, self).__init__()
			self.linear1 = nn.Linear(input_dim, input_dim//2)
			self.relu1 = nn.ReLU()
			self.linear2 = nn.Linear(input_dim//2, input_dim//3)
			self.relu2 = nn.ReLU()
			self.linear3 = nn.Linear(input_dim//3, input_dim)

		def forward(self, x):
			x = self.linear1(x)
			x = self.relu1(x)
			x = self.linear2(x)
			x = self.relu2(x)
			x = self.linear3(x)
			return x
		
	model = MLP().to(device)
	optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
	criterion = nn.BCEWithLogitsLoss()

	epochs = 50

	model.train()
	for epoch in range(epochs):
		losses = []
		for batch_num, input_data in enumerate(train_loader):
			optimizer.zero_grad()
			x, y = input_data
			x, y = x.to(device).float(), y.to(device)

			output = model(x)
			loss = criterion(output, y)
			loss.backward()
			losses.append(loss.item())
			optimizer.step()

	return model

In [None]:
class CustomDataset(Dataset):
	def __init__(self, x_train, y_train):
		self.x_train = x_train
		self.y_train = y_train
	def __len__(self):
		return self.x_train.shape[0]
	def __getitem__(self, ind):
		x = self.x_train[ind]
		y = self.y_train[ind]
		return x, y

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def step_6(model, x_train, y_train, x_validation, y_validation):
	class CustomMutation(Mutation):
		def __init__(self, model):
			super().__init__()
			self.model = model

		def _do(self, problem, X, **kwargs):

			int_x = np.array(X, dtype=np.float32)
			dataset = CustomDataset(int_x, int_x)
			loader = DataLoader(dataset, batch_size=X.shape[0], shuffle=False)

			model.eval()
			with torch.no_grad():
				for data, _ in loader:
					data = data.to(device)
					outputs = model(data)
					predictions = (outputs > 0.5).bool()

			return np.array(predictions)
		
	objectives = [GenericOptimizer.calculate_overall_error, GenericOptimizer.calculate_num_examples]

	problem = GenericOptimizer(
		x_train, 
		y_train, 
		x_validation, 
		y_validation,
		objectives,
		"Sequential"
	)

	initial_population = BinaryRandomSampling()

	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=initial_population, 
		crossover=HUX(), 
		mutation=CustomMutation(model), 
		eliminate_duplicates=True)

	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size)
	)

	return result

In [None]:
def benchmark(result, x_train, y_train, x_validation, y_validation, x_test, y_test):
	validation_fitness = result.F
	optimization_population = result.X

	if len(optimization_population.shape) == 1:
		optimization_population = np.array([list(optimization_population)])

	validation_pareto_front, best_x_train, best_y_train = GenericOptimizer.calculate_optimal_instance(
		x_train,
		y_train,
		x_validation,
		y_validation,
		validation_fitness,
		optimization_population,
		GenericOptimizer.n_neighbours
	)

	reduction_rate = 1 - (best_x_train.shape[0] / x_train.shape[0])

	optimized_test_err = GenericOptimizer.calculate_overall_error(
		best_x_train,
		best_y_train,
		x_test,
		y_test,
		GenericOptimizer.n_neighbours
	)
	optimized_validation_err = GenericOptimizer.calculate_overall_error(
		best_x_train,
		best_y_train,
		x_validation,
		y_validation,
		GenericOptimizer.n_neighbours
	)
	baseline_validation_err = GenericOptimizer.calculate_overall_error(
		x_train,
		y_train,
		x_validation,
		y_validation,
		GenericOptimizer.n_neighbours
	)

	baseline_test_err = GenericOptimizer.calculate_overall_error(
		x_train,
		y_train,
		x_test,
		y_test,
		GenericOptimizer.n_neighbours
	)

	return baseline_validation_err, optimized_validation_err, baseline_test_err, optimized_test_err, reduction_rate

In [None]:
baseline_validation = []
baseline_test = []
optimized_test = []
optimized_validation = []

for _ in range(31):
		
	folder = os.listdir('../Datasets')[0]
	splits = step_1(folder)
	print(f"Step 1 complete...")

	primary_objective = GenericOptimizer.calculate_overall_error
	secondary_objectives = [
		[GenericOptimizer.calculate_num_examples],
		[GenericOptimizer.calculate_IR],
		[GenericOptimizer.calculate_class0_error],
		[GenericOptimizer.calculate_class0_inverse_f1],
		[GenericOptimizer.calculate_class0_inverse_precision],
		[GenericOptimizer.calculate_class0_inverse_recall],
		[GenericOptimizer.calculate_class1_error],
		[GenericOptimizer.calculate_class1_inverse_f1],
		[GenericOptimizer.calculate_class1_inverse_precision],
		[GenericOptimizer.calculate_class1_inverse_recall],
		[GenericOptimizer.calculate_overall_inverse_f1],
		[GenericOptimizer.calculate_overall_inverse_precision],
		[GenericOptimizer.calculate_overall_inverse_recall],
		[GenericOptimizer.calculate_class0_inverse_precision, GenericOptimizer.calculate_class1_inverse_precision],
		[GenericOptimizer.calculate_class0_inverse_recall, GenericOptimizer.calculate_class1_inverse_recall],
		[GenericOptimizer.calculate_class0_inverse_f1, GenericOptimizer.calculate_class1_inverse_f1],
		[GenericOptimizer.calculate_class0_error, GenericOptimizer.calculate_class1_error],
	]
	populations = step_2(
		splits["X Train"], 
		splits["Y Train"], 
		splits["X Validation"], 
		splits["Y Validation"], 
		primary_objective, 
		secondary_objectives
	)
	print(f"Step 2 complete...")

	all_outcomes = step_3(populations)
	print(f"Step 3 complete...")

	x, y = step_4(all_outcomes)
	print(f"Step 4 complete...")

	model = step_5(x, y)
	print(f"Step 5 complete...")

	result = step_6(
		model, 
		splits["X Train"], 
		splits["Y Train"], 
		splits["X Validation"], 
		splits["Y Validation"], 
	)
	print(f"Step 6 complete...")

	baseline_validation_err, optimized_validation_err, baseline_test_err, optimized_test_err, reduction_rate = benchmark(
		result, 
		splits["X Train"], 
		splits["Y Train"], 
		splits["X Test"], 
		splits["Y Test"], 
		splits["X Validation"], 
		splits["Y Validation"], 
	)
	print(f"Run complete...\n")
	
	baseline_test.append(baseline_test_err)
	baseline_validation.append(baseline_validation_err)
	optimized_test.append(optimized_test_err)
	optimized_validation.append(optimized_validation_err)

with open('iters.pickle', 'wb') as fh:
	pickle.dump((baseline_test, baseline_validation, optimized_test, optimized_validation), fh)


Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...

Compiled modules for significant speedup can not be used!
https://pymoo.org/installation.html#installation

from pymoo.config import Config



  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...

Step 1 complete...
Step 2 complete...
Step 3 complete...
Step 4 complete...
Step 5 complete...


  return np.array(predictions)


Step 6 complete...
Run complete...



In [25]:
validation_acc = np.subtract(1, optimized_validation )
test_acc = np.subtract(1, optimized_test)
baseline_acc = np.subtract(1, baselines)

print("Baseline")
print(np.mean(baseline_acc))
print(pd.DataFrame(baseline_acc).describe())

print("\n---\nvalidation_acc")

print(f"Pval: {ranksums(baseline_acc, validation_acc).pvalue}")
print(f"Mean: {np.mean(validation_acc)}")
print(pd.DataFrame(validation_acc).describe())

print("\n---\ntest_acc")

print(f"Pval: {ranksums(baseline_acc, test_acc).pvalue}")
print(f"Mean: {np.mean(test_acc)}")
print(pd.DataFrame(test_acc).describe())

Baseline
0.6440360090022507
               0
count  31.000000
mean    0.644036
std     0.025016
min     0.604651
25%     0.630814
50%     0.639535
75%     0.656977
max     0.703488

---
validation_acc
Pval: 0.00039897442666908734
Mean: 0.6658586611970914
               0
count  31.000000
mean    0.665859
std     0.030602
min     0.578035
25%     0.650289
50%     0.664740
75%     0.687861
max     0.710983

---
test_acc
Pval: 1.3353534438911939e-11
Mean: 0.8225806451612904
               0
count  31.000000
mean    0.822581
std     0.045231
min     0.738372
25%     0.793605
50%     0.825581
75%     0.857558
max     0.918605


In [13]:
print(baseline_test_err)
print(optimized_validation_err)
print(optimized_test_err)

0.33139534883720934
0.3179190751445087
0.12209302325581395
