In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, balanced_accuracy_score

from pymoo.operators.mutation.bitflip import BitflipMutation, Mutation
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling, Sampling
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.indicators.hv import Hypervolume
from pymoo.core.problem import Problem
from pymoo.optimize import minimize

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import torch

from scipy.stats import ranksums

from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
import pandas as pd
import numpy as np
import pickle
import os
import re

import matplotlib.pyplot as plt

with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)

data_keys = list(data_mapper.keys())

In [48]:
class ClassSensitiveOptimizer(Problem):
	population_size = 100
	n_neighbours = 5
	sequential = False
	def __init__(self, X_train, y_train, X_val, y_val):
		self.mutation_history = {}
		self.generation_number = 0

		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]
		
		super().__init__(
			n_var=self.n_instances,
			n_obj=2,               
			n_constr=0,            
			xl=0,                  
			xu=1,                  
			type_var=np.bool_,     
		)

	def _evaluate(self, x, out, *args, **kwargs):
		
		metrics = []
		for label in np.unique(self.y_train):
			label_idx = np.where(self.y_train==label)

			filtered_x_train = self.X_train[label_idx]
			filtered_y_train = self.y_train[label_idx]

			f_values = []
			for instance in x:
				error = 1
				if np.sum(instance) >= ClassSensitiveOptimizer.n_neighbours:
					model = KNeighborsClassifier(
						n_neighbors=ClassSensitiveOptimizer.n_neighbours
					)
					model.fit(
						filtered_x_train, 
						filtered_y_train
					)

					y_pred = model.predict(self.X_val)
					error = 1 - accuracy_score(self.y_val, y_pred)

				f_values.append(error)
			metrics.append(f_values)

		out["F"] = np.column_stack(metrics)

class StandardOptimizer(Problem):
	population_size = 100
	n_neighbours = 5
	sequential = False
	def __init__(self, X_train, y_train, X_val, y_val):
		self.mutation_history = {}
		self.generation_number = 0

		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]
		
		super().__init__(
			n_var=self.n_instances,
			n_obj=2,               
			n_constr=0,            
			xl=0,                  
			xu=1,                  
			type_var=np.bool_,     
		)

	def _evaluate(self, x, out, *args, **kwargs):
		
		errors = []
		num_samples = []
		for instance in x:
			error = 1
			if np.sum(instance) >= ClassSensitiveOptimizer.n_neighbours:
				model = KNeighborsClassifier(
					n_neighbors=ClassSensitiveOptimizer.n_neighbours
				)
				model.fit(
					self.X_train[instance], 
					self.y_train[instance]
				)

				y_pred = model.predict(self.X_val)
				error = 1 - accuracy_score(self.y_val, y_pred)
			errors.append(error)
			num_samples.append(np.sum(instance))

		out["F"] = np.column_stack([errors, num_samples])
		
class DiverseCustomSampling(Sampling):
	def __init__(self):
		super().__init__()

	def _do(self, problem, n_samples, **kwargs):

		target_inclusions = np.random.randint(
			problem.n_var // 3,
			problem.n_var,
			n_samples
		)
		init_pops = []
		for target in target_inclusions:
			array = np.array([1]*target + [0]*(problem.n_var - target))
			np.random.shuffle(array)
			init_pops.append(array)
		init_pops = np.array(init_pops, dtype=np.bool)
		return init_pops
	
def optimize(x_train, y_train, x_validation, y_validation):
	problem = ClassSensitiveOptimizer(
		x_train,
		y_train,
		x_validation,
		y_validation,
	)
	algorithm = NSGA2(
		pop_size=ClassSensitiveOptimizer.population_size, 
		sampling=DiverseCustomSampling(),
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True,
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', ClassSensitiveOptimizer.population_size), # <--- maybe increase
		save_history=False
	)
	return result

def select_ideal(result, x_train, y_train, x_compare, y_compare):
	fitness = []
	for instance in result.X:
		error = 1
		if np.sum(instance) >= ClassSensitiveOptimizer.n_neighbours:
			model = KNeighborsClassifier(
				n_neighbors=ClassSensitiveOptimizer.n_neighbours
			)
			model.fit(
				x_train[instance], 
				y_train[instance]
			)

			y_pred = model.predict(x_compare)
			error = 1 - accuracy_score(y_compare, y_pred)
		fitness.append(error)

	return result.X[np.argmin(fitness)]

class ConditionalVAE(nn.Module):
	def __init__(self, input_dim, label_dim, hidden_dim, latent_dim):
		super(ConditionalVAE, self).__init__()
		self.fc1 = nn.Linear(input_dim + label_dim, hidden_dim)
		self.fc_mu = nn.Linear(hidden_dim, latent_dim)
		self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
		self.fc2 = nn.Linear(latent_dim + label_dim, hidden_dim)
		self.fc3 = nn.Linear(hidden_dim, input_dim)

	def encode(self, x, y):
		# Concatenate input and label
		x = torch.cat([x, y], dim=1)
		h = torch.relu(self.fc1(x))
		return self.fc_mu(h), self.fc_logvar(h)

	def reparameterize(self, mu, logvar):
		std = torch.exp(0.5 * logvar)
		eps = torch.randn_like(std)
		return mu + eps * std

	def decode(self, z, y):
		# Concatenate latent vector and label
		z = torch.cat([z, y], dim=1)
		h = torch.relu(self.fc2(z))
		return self.fc3(h)

	def forward(self, x, y):
		mu, logvar = self.encode(x, y)
		z = self.reparameterize(mu, logvar)
		return self.decode(z, y), mu, logvar

def vae_loss(recon_x, x, mu, logvar):
	recon_loss = nn.MSELoss()(recon_x, x)
	kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
	return recon_loss + kl_div

class CustomDataset(Dataset):
	def __init__(self, x_synthetic, x_true):
		self.x = x_synthetic
		self.y = x_true
	def __len__(self):
		return self.x.shape[0]
	def __getitem__(self, ind):
		x = self.x[ind]
		y = self.y[ind]
		return x, y

def train(training_x, training_y, cvae, lr, epochs, batch_size):
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	train_set = CustomDataset(torch.from_numpy(training_x), torch.from_numpy(training_y))
	train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
	optimizer = optim.Adam(cvae.parameters(), lr=lr)
	
	for epoch in range(epochs):
		cvae.train()
		total_loss = 0
		for batch in train_loader:
			x_batch = batch[0].to(device).float()
			y_batch = batch[1].to(device).float().unsqueeze(1)
			
			recon, mu, logvar = cvae(x_batch, y_batch)
			loss = vae_loss(recon, x_batch, mu, logvar)
			
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()
			
			total_loss += loss.item()

		# print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")

	return cvae

def generate_synthetic_examples(x_samples, y_samples, sample_variance, cvae, num_samples=None):
	if num_samples is None:
		num_samples = len(x_samples)

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	cvae.eval()

	synthetic_features = []

	while len(synthetic_features) < num_samples:
		with torch.no_grad():    
			x = torch.from_numpy(x_samples).to(device).float()
			y = torch.from_numpy(y_samples).to(device).float().unsqueeze(1)
			mu, logvar = cvae.encode(x, y)
			z = cvae.reparameterize(mu, logvar)
		minority_latents = z.cpu().numpy()

		minority_latents[:,0] += np.random.normal(-sample_variance[0]/2, sample_variance[0]/2, len(minority_latents))
		minority_latents[:,1] += np.random.normal(-sample_variance[1]/2, sample_variance[1]/2, len(minority_latents))

		with torch.no_grad():    
			z = torch.from_numpy(minority_latents).to(device).float()
			label_dim = torch.from_numpy(y_samples).to(device).float().unsqueeze(1)
			synthetic_minority_samples = cvae.decode(z, label_dim)
	
		for sample in synthetic_minority_samples.cpu().numpy():
			if len(synthetic_features) < num_samples:
				synthetic_features.append(sample)
			else:
				break
			

	return np.array(synthetic_features)

In [3]:
data_key = data_keys[0]
x_train = data_mapper[data_key]['x_train'] 
y_train = data_mapper[data_key]['y_train']
x_validation = data_mapper[data_key]['x_validation'] 
y_validation = data_mapper[data_key]['y_validation']
x_test = data_mapper[data_key]['x_test']
y_test = data_mapper[data_key]['y_test']
print(data_key)

0_abalone-20_vs_8-9-10


# Step 1

Define the samples used to pre-train the conditional variational autoencoder.

In [4]:
cold_start_x = x_train
cold_start_y = y_train
input_dim = cold_start_x[0].shape[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cvae = ConditionalVAE(input_dim, 1, input_dim//2, 2).to(device)

# Step 2
Pre-train the *CVAE* upon the entire training set (majority + minority).

In [5]:
cvae = train(
    cold_start_x, cold_start_y, 
    cvae,
    lr=1e-3,
    epochs=200,
    batch_size=20
)

Epoch 1, Loss: 2.8874
Epoch 2, Loss: 2.1066
Epoch 3, Loss: 1.7562
Epoch 4, Loss: 1.5561
Epoch 5, Loss: 1.4392
Epoch 6, Loss: 1.3312
Epoch 7, Loss: 1.2551
Epoch 8, Loss: 1.2116
Epoch 9, Loss: 1.1847
Epoch 10, Loss: 1.1542
Epoch 11, Loss: 1.1342
Epoch 12, Loss: 1.1181
Epoch 13, Loss: 1.1073
Epoch 14, Loss: 1.0976
Epoch 15, Loss: 1.0895
Epoch 16, Loss: 1.0850
Epoch 17, Loss: 1.0776
Epoch 18, Loss: 1.0721
Epoch 19, Loss: 1.0691
Epoch 20, Loss: 1.0674
Epoch 21, Loss: 1.0611
Epoch 22, Loss: 1.0582
Epoch 23, Loss: 1.0550
Epoch 24, Loss: 1.0541
Epoch 25, Loss: 1.0537
Epoch 26, Loss: 1.0493
Epoch 27, Loss: 1.0480
Epoch 28, Loss: 1.0470
Epoch 29, Loss: 1.0453
Epoch 30, Loss: 1.0440
Epoch 31, Loss: 1.0547
Epoch 32, Loss: 1.0424
Epoch 33, Loss: 1.0411
Epoch 34, Loss: 1.0406
Epoch 35, Loss: 1.0399
Epoch 36, Loss: 1.0382
Epoch 37, Loss: 1.0389
Epoch 38, Loss: 1.0374
Epoch 39, Loss: 1.0382
Epoch 40, Loss: 1.0365
Epoch 41, Loss: 1.0365
Epoch 42, Loss: 1.0350
Epoch 43, Loss: 1.0349
Epoch 44, Loss: 1.03

# Step 3 
Calculate the variance statistic of the entire cold start training set.

In [None]:
with torch.no_grad():    
	x = torch.from_numpy(cold_start_x).to(device).float()
	y = torch.from_numpy(cold_start_y).to(device).float().unsqueeze(1)
	mu, logvar = cvae.encode(x, y)
	z = cvae.reparameterize(mu, logvar)
	total_latents = z.cpu().numpy()

	variance = np.var(total_latents, axis=0)

# Step 4
Create new synthetic samples using the minority class latents, plus noise randomly added according to the global variance statistic.

In [None]:
minority_label = pd.DataFrame(cold_start_y).value_counts().argmin()
minority_indices = np.where(cold_start_y==minority_label)[0]
minority_features = cold_start_x[minority_indices]
minority_labels = cold_start_y[minority_indices]

synthetic_minority_features = generate_synthetic_examples(
    minority_features, 
    minority_labels, 
    variance, 
    cvae,
    num_samples=30
)

synthetic_minority_features.shape


(30, 7)

# Step 5 
Execute optimization with the synthetic examples plus the original training set

In [24]:
new_x_train = np.concatenate((x_train, synthetic_minority_features), axis=0)
new_y_train = np.concatenate((y_train, [minority_labels[0]] * len(synthetic_minority_features)), axis=0)

result = optimize(
    new_x_train,
    new_y_train,
    x_validation,
    y_validation
)


Compiled modules for significant speedup can not be used!
https://pymoo.org/installation.html#installation

from pymoo.config import Config



# Step 6
Extract any synthetic samples that were found to be ideal with respect to overall error after class sensitive optimization.

In [40]:
all_samples = []
for instance in result.X:
	for sample in new_x_train[instance]:
		for stored_sample in all_samples:
			if np.all(sample == stored_sample):
				break
		else:
			all_samples.append(sample)

validated_synthetic_examples = []
for sample in all_samples:
    for synthetic_sample in synthetic_minority_features:
        if np.all(sample == synthetic_sample):
            validated_synthetic_examples.append(synthetic_sample)
            break

# Retrain the CVAE to fit the synthetic examples aswell.

In [44]:
new_x_train = np.concatenate((x_train, validated_synthetic_examples), axis=0)
new_y_train = np.concatenate((y_train, [minority_labels[0]] * len(validated_synthetic_examples)), axis=0)

cvae = ConditionalVAE(input_dim, 1, input_dim//2, 2).to(device)

cvae = train(
    new_x_train,
    new_y_train,
    cvae,
    lr=1e-3,
    epochs=200,
    batch_size=20
)
    

Epoch 1, Loss: 5.1674
Epoch 2, Loss: 3.8390
Epoch 3, Loss: 2.9918
Epoch 4, Loss: 2.2786
Epoch 5, Loss: 1.6920
Epoch 6, Loss: 1.3328
Epoch 7, Loss: 1.1677
Epoch 8, Loss: 1.0890
Epoch 9, Loss: 1.0643
Epoch 10, Loss: 1.0593
Epoch 11, Loss: 1.0377
Epoch 12, Loss: 1.0272
Epoch 13, Loss: 1.0237
Epoch 14, Loss: 1.0265
Epoch 15, Loss: 1.0166
Epoch 16, Loss: 1.0149
Epoch 17, Loss: 1.0296
Epoch 18, Loss: 1.0063
Epoch 19, Loss: 1.0121
Epoch 20, Loss: 1.0070
Epoch 21, Loss: 1.0035
Epoch 22, Loss: 1.0181
Epoch 23, Loss: 1.0038
Epoch 24, Loss: 1.0080
Epoch 25, Loss: 1.0007
Epoch 26, Loss: 0.9993
Epoch 27, Loss: 1.0042
Epoch 28, Loss: 1.0062
Epoch 29, Loss: 0.9959
Epoch 30, Loss: 1.0065
Epoch 31, Loss: 1.0046
Epoch 32, Loss: 1.1825
Epoch 33, Loss: 0.9951
Epoch 34, Loss: 1.0005
Epoch 35, Loss: 1.0102
Epoch 36, Loss: 1.0174
Epoch 37, Loss: 1.0039
Epoch 38, Loss: 1.0050
Epoch 39, Loss: 1.0006
Epoch 40, Loss: 1.0174
Epoch 41, Loss: 1.0003
Epoch 42, Loss: 0.9985
Epoch 43, Loss: 0.9940
Epoch 44, Loss: 0.99

In [None]:
for _ in range(3):
	print(f"\nLoop {_}")

	minority_label = pd.DataFrame(new_y_train).value_counts().argmin()
	minority_indices = np.where(new_y_train==minority_label)[0]
	minority_features = new_x_train[minority_indices]
	minority_labels = new_y_train[minority_indices]

	synthetic_minority_features = generate_synthetic_examples(
		minority_features, 
		minority_labels, 
		variance, 
		cvae
	)

	print(f"\tGenerated {synthetic_minority_features.shape} synthetic features")

	new_x_train = np.concatenate((new_x_train, synthetic_minority_features), axis=0)
	new_y_train = np.concatenate((new_y_train, [minority_labels[0]] * len(synthetic_minority_features)), axis=0)

	result = optimize(
		new_x_train,
		new_y_train,
		x_validation,
		y_validation
	)

	print(f"\tExecuted optimization upon synthetic features")

	ideal_validation = select_ideal(
		result,
		new_x_train,
		new_y_train,
		x_validation,
		y_validation	
	)

	model = KNeighborsClassifier(
		n_neighbors=ClassSensitiveOptimizer.n_neighbours
	)
	model.fit(
		new_x_train[ideal_validation], 
		new_y_train[ideal_validation]
	)

	y_pred = model.predict(x_test)
	acc = accuracy_score(y_test, y_pred)

	print(f"\t - Latest optimized test accuracy = {round(acc, 4)}")

	all_samples = []
	for instance in result.X:
		for sample in new_x_train[instance]:
			for stored_sample in all_samples:
				if np.all(sample == stored_sample):
					break
			else:
				all_samples.append(sample)

	validated_synthetic_examples = []
	for sample in all_samples:
		for synthetic_sample in synthetic_minority_features:
			if np.all(sample == synthetic_sample):
				validated_synthetic_examples.append(synthetic_sample)
				break

	print(f"\tValidated performance of synthetic features... {len(validated_synthetic_examples)} examples remain")

	new_x_train = np.concatenate((x_train, validated_synthetic_examples), axis=0)
	new_y_train = np.concatenate((y_train, [minority_labels[0]] * len(validated_synthetic_examples)), axis=0)

	cvae = ConditionalVAE(input_dim, 1, input_dim//2, 2).to(device)

	cvae = train(
		new_x_train,
		new_y_train,
		cvae,
		lr=1e-3,
		epochs=200,
		batch_size=20
	)

	print(f"\tRetrained CVAE to use new synthetic examples")	


Loop 0
	Generated (42, 7) synthetic features
	Executed optimization upon synthetic features
	 - Latest optimized test accuracy = 0.9729
	Validated performance of synthetic features
	Retrained CVAE to use new synthetic examples

Loop 1
	Generated (14, 7) synthetic features
	Executed optimization upon synthetic features
	 - Latest optimized test accuracy = 0.9833
	Validated performance of synthetic features
	Retrained CVAE to use new synthetic examples

Loop 2
	Generated (27, 7) synthetic features
	Executed optimization upon synthetic features
	 - Latest optimized test accuracy = 0.9812
	Validated performance of synthetic features
	Retrained CVAE to use new synthetic examples


In [24]:
validation_F = []
for instance in result.X:
	error = 1
	if np.sum(instance) >= ClassSensitiveOptimizer.n_neighbours:
		model = KNeighborsClassifier(
			n_neighbors=ClassSensitiveOptimizer.n_neighbours
		)
		model.fit(
			new_x_train[instance], 
			new_y_train[instance]
		)

		y_pred = model.predict(x_validation)
		error = 1 - accuracy_score(y_validation, y_pred)
	validation_F.append(error)

ideal_validation = result.X[np.argmin(validation_F)]

model = KNeighborsClassifier(
	n_neighbors=ClassSensitiveOptimizer.n_neighbours
)
model.fit(
	new_x_train[ideal_validation], 
	new_y_train[ideal_validation]
)

y_pred = model.predict(x_validation)
error = 1 - accuracy_score(y_validation, y_pred)

print(1-error)

0.9895615866388309


In [None]:
# plot_x = x_train
# plot_y = y_train

# cvae.eval()
# with torch.no_grad():    
#     x = torch.from_numpy(plot_x).to(device).float()
#     y = torch.from_numpy(plot_y).to(device).float().unsqueeze(1)
#     mu, logvar = cvae.encode(x, y)
#     z = cvae.reparameterize(mu, logvar)

# z_np = z.cpu().numpy()

# # Define colors for the two classes
# colors = ['red', 'blue']
# label_names = ['Class 0', 'Class 1']

# # Create scatter plot
# plt.figure(figsize=(8, 6))
# for i, label in enumerate(np.unique(plot_y)):
#     idxs = plot_y == label
#     plt.scatter(z_np[idxs, 0], z_np[idxs, 1], c=colors[i], label=label_names[i], alpha=0.7, edgecolors='k')

# plt.xlabel('Latent Dimension 1')
# plt.ylabel('Latent Dimension 2')
# plt.legend()
# plt.tight_layout()
# plt.show()
