In [None]:
# Project-local modules stay first on purpose: the wildcard import below has to
# run before the explicit imports so that it can never shadow one of them.
from model.model import ConditionalVAE
from model.dataset import TabularDataset
from model.utils.visualization import PCA_plot, PCA_plot_rare_on_top
from model.utils.optimization import *

# Standard library
import os
import pickle
from collections import Counter
from itertools import product

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import (
	SMOTE,
	ADASYN,
	BorderlineSMOTE,
)
from matplotlib.cm import get_cmap
from scipy.stats import ranksums
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def mask_features(x, min_mask: int = 1, max_mask: int = 3):
	"""Return a copy of ``x`` in which a few random columns of every row are zeroed.

	For each row an integer ``k`` is drawn from ``[min_mask, max_mask]``
	(both ends included) and the first ``k`` entries of a random permutation
	of the column positions are set to 0 in that row. The input tensor itself
	is not modified.

	Args:
		x: tensor indexed as ``x[row, column]``.
		min_mask: smallest number of columns blanked per row.
		max_mask: largest number of columns blanked per row.

	Returns:
		A clone of ``x`` with the selected entries replaced by 0.
	"""
	masked = x.clone()
	for row in range(masked.size(0)):
		n_hidden = int(torch.randint(low=min_mask, high=max_mask + 1, size=(1,)))
		shuffled_columns = torch.randperm(masked.size(1))
		masked[row, shuffled_columns[:n_hidden]] = 0
	return masked

def _fit_cvae_stage(cvae, loader, epochs, lr, beta, denoise=False, do_print=False):
	"""Run one training stage of the conditional VAE over ``loader``.

	A fresh Adam optimiser is created for the stage. The loss of a batch is the
	summed squared reconstruction error plus ``beta`` times the KL term, divided
	by the batch size.

	Args:
		cvae: model called as ``cvae(inputs, labels)`` that returns
			``(recon, mu, logvar)``.
		loader: iterable yielding ``(encode_in, decode_comp, label)`` batches.
		epochs: number of passes over ``loader``.
		lr: learning rate handed to Adam.
		beta: weight of the KL term.
		denoise: if False the model reconstructs its own input; if True the
			input goes through ``mask_features`` first and the reconstruction
			target is ``decode_comp``.
		do_print: print the mean batch loss every 5th epoch and after the last one.
	"""
	cvae.train()
	opt = optim.Adam(cvae.parameters(), lr=lr)
	summed_mse = nn.MSELoss(reduction='sum')
	for epoch in range(1, epochs + 1):
		running = 0
		for encode_in, decode_comp, label in loader:
			xb = encode_in.float().to(device)
			if denoise:
				model_in = mask_features(xb)
				target = decode_comp.float().to(device)
			else:
				model_in = target = xb
			label = label.float().to(device)

			recon, mu, logvar = cvae(model_in, label)
			recon_loss = summed_mse(recon, target)
			kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			loss = (recon_loss + beta * kl_div) / xb.size(0)   # per-batch average

			opt.zero_grad()
			loss.backward()
			opt.step()

			running += loss.item()

		if (epoch % 5 == 0 or epoch == epochs) and do_print:
			print(f"Epoch {epoch:03d} | loss: {running / len(loader):.4f}")


def execute(x_train, y_train, x_validation, y_validation, do_print=False):
	"""Generate minority-class samples with a conditional VAE and keep the best-scoring subset.

	Steps, in order:
	  1. Train and validation data are concatenated and oversampled with SMOTE.
	  2. A ``ConditionalVAE`` is trained for 900 epochs to reconstruct the SMOTE
	     samples, then for 400 epochs to map a feature-masked sample onto one of
	     its same-label nearest neighbours.
	  3. A regular grid spanning ``[min - var, max + var]`` of every feature (step
	     ``2 * var``) is encoded and then decoded conditioned on the minority
	     label, which yields the synthetic rows.
	  4. NSGA-II is run on the pool "train + validation + synthetic"; each row of
	     its ``result.X`` is used as a row selector on that pool, and the
	     selection whose 5-NN classifier reaches the highest ROC AUC on the
	     validation data is returned.

	Args:
		x_train: training features, one row per sample.
		y_train: training labels; the least frequent value is treated as the
			minority class.
		x_validation: validation features.
		y_validation: validation labels.
		do_print: print training losses and the size of the synthetic grid.

	Returns:
		``(best_x, best_y)``: features and labels of the selected subset. Both
		are ``None`` when the search yields no selections to evaluate.

	Raises:
		RuntimeError: if the optimiser reports no solution at all.
	"""
	x_prior = np.concatenate((x_train, x_validation), axis=0)
	y_prior = np.concatenate((y_train, y_validation), axis=0)

	x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)

	# Pick the minority class by its label value. (Taking the arg-min position
	# of a count-sorted value_counts() only matches the label by coincidence.)
	classes, class_counts = np.unique(np.asarray(y_train), return_counts=True)
	minority_label = classes[np.argmin(class_counts)]

	num_features = x_prior.shape[1]

	nearest_neighbours = NearestNeighbors(n_neighbors=5, metric="euclidean").fit(x_smote)
	_, neighbour_idx = nearest_neighbours.kneighbors(x_smote)
	# The query set is the fitted set, so the first column is dropped as the
	# sample's own match, leaving 4 neighbours per row.
	neighbour_idx = neighbour_idx[:, 1:]

	# Build (sample, neighbour) training pairs restricted to neighbours that
	# share the sample's label. np.nonzero walks the boolean matrix row by row,
	# i.e. in the same order as a nested "for sample / for neighbour" loop.
	y_flat = np.ravel(y_smote)
	same_label = y_flat[neighbour_idx] == y_flat[:, None]
	anchor_rows, neighbour_cols = np.nonzero(same_label)
	input_set = x_smote[anchor_rows]
	recon_set = x_smote[neighbour_idx[anchor_rows, neighbour_cols]]
	labels = y_smote[anchor_rows]

	h1 = num_features + (num_features//2)
	h2 = num_features * 2
	latent_dim = 20

	cvae = ConditionalVAE(
		input_dim=num_features,
		h1=h1,
		h2=h2,
		latent_dim=latent_dim).to(device)

	batch_size = 32
	lr = 1e-3
	beta = 0.8

	# Stage 1: plain reconstruction of the SMOTE-resampled data.
	loader = DataLoader(TabularDataset(x_smote, x_smote, y_smote), batch_size=batch_size, shuffle=True)
	_fit_cvae_stage(cvae, loader, epochs=900, lr=lr, beta=beta, denoise=False, do_print=do_print)

	# Stage 2: masked sample in, same-label neighbour out.
	loader = DataLoader(TabularDataset(input_set, recon_set, labels), batch_size=batch_size, shuffle=True)
	_fit_cvae_stage(cvae, loader, epochs=400, lr=lr, beta=beta, denoise=True, do_print=do_print)

	feature_variance = np.var(x_prior, axis=0)
	feature_mins = np.min(x_prior, axis=0)
	feature_maxs = np.max(x_prior, axis=0)
	feature_grids = []
	for feature_min, feature_max, var in zip(feature_mins, feature_maxs, feature_variance):
		lo = feature_min - var             # lower bound  (min - variance)
		hi = feature_max + var             # upper bound  (max + variance)
		step = var * 2 or 1e-8             # avoid step == 0 if var == 0
		feature_grids.append(np.arange(lo, hi + step, step))

	# Cartesian product of the per-feature grids. The number of rows is the
	# product of the grid lengths, so it grows quickly with the feature count.
	synthetic_X = np.fromiter(
		(val for combo in product(*feature_grids) for val in combo),
		dtype=float
	).reshape(-1, len(feature_grids))
	n_synthetic = synthetic_X.shape[0]

	if do_print:
		print(f"{n_synthetic:,} synthetic rows × {synthetic_X.shape[1]} features")

	cvae.eval()
	with torch.no_grad():
		xb = torch.tensor(synthetic_X).float().to(device)
		# Labels live on the same device as the latent codes and use the float
		# dtype that the training loops feed to the model.
		# NOTE(review): assumes decode() accepts the labels the way forward() does - confirm.
		test_labels = torch.full((n_synthetic,), float(minority_label), dtype=torch.float32, device=device)
		mu, logvar = cvae.encode(xb)
		z = cvae.reparameterize(mu, logvar)
		# Bring the result back to host memory as a NumPy array so it can be
		# concatenated with the NumPy inputs regardless of the device in use.
		synthetic_X_DECODE = cvae.decode(z, test_labels).cpu().numpy()

	synthetic_Y = np.full(n_synthetic, minority_label)

	# NOTE(review): the pool contains the validation rows themselves and the
	# selection below is scored on those same rows - confirm this is intended.
	candidate_x = np.concatenate((x_prior, synthetic_X_DECODE), axis=0)
	candidate_y = np.concatenate((y_prior, synthetic_Y), axis=0)

	problem = NSGA_II_Filter(
		candidate_x,
		candidate_y,
		x_validation, y_validation,
	)
	algorithm = NSGA2(
		pop_size=500,
		sampling=DiverseSampling(),
		crossover=HUX(),
		mutation=BitflipMutation(),
		eliminate_duplicates=True
	)
	result = minimize(
		problem,
		algorithm,
		('n_gen', 10),
		save_history=False,
	)

	if result.X is None:
		raise RuntimeError("NSGA-II returned no solution to select from")
	# A single solution may come back as a 1-D vector; treat it as one selection.
	selections = np.atleast_2d(result.X)

	max_validation_auc = -1
	best_x = None
	best_y = None

	for selection in selections:
		filtered_x = candidate_x[selection]
		filtered_y = candidate_y[selection]

		model = KNeighborsClassifier(n_neighbors=5)
		model.fit(filtered_x, filtered_y)
		y_pred = model.predict(x_validation)
		auc = roc_auc_score(y_validation, y_pred)

		if auc > max_validation_auc:
			best_x = filtered_x
			best_y = filtered_y
			max_validation_auc = auc

	return best_x, best_y


In [None]:
# Load the pickled `data_mapper` object (indexed by the keys below in later
# cells) and the splits table.
# NOTE: pickle.load can execute arbitrary code - only open a data.pickle you trust.
with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)
splits = pd.read_csv('data_splits.csv')

# Iterating a DataFrame yields its column labels, so this produces one
# "<index>_<column label>" key for each index 0..30 of every column.
data_keys = [
	f"{idx}_{split_name}"
	for split_name in splits
	for idx in range(31)
]

In [26]:
# Collect the saved result of every "<idx>_<dataset>" key that has a file under
# results_new/; keys without a file are skipped.
# NOTE: pickle.load can execute arbitrary code - only load result files you trust.
results = {}
for dataset in splits:
	for idx in range(31):
		data_key = f"{idx}_{dataset}"
		result_path = f'results_new/{data_key}.pkl'

		if not os.path.exists(result_path):
			continue

		with open(result_path, 'rb') as fh:
			results[data_key] = pickle.load(fh)


In [27]:
# ROC AUC on x_test/y_test of a 5-NN classifier fitted on each saved result of
# the chosen dataset.
# Other dataset names that were tried here:
#   "abalone-17_vs_7-8-9-10"
#   "abalone-20_vs_8-9-10"
dataset_name = "abalone-21_vs_8"

cvae = []
for idx in range(31):
	data_key = f"{idx}_{dataset_name}"

	# Keys without a loaded result are left out, so this list can end up
	# shorter than 31 entries.
	if data_key not in results:
		continue

	x_test = data_mapper[data_key]['x_test']
	y_test = data_mapper[data_key]['y_test']
	x, y = results[data_key]

	model = KNeighborsClassifier(n_neighbors=5)
	model.fit(x, y)
	y_pred = model.predict(x_test)
	cvae.append(roc_auc_score(y_test, y_pred))


In [28]:

# Reference scores for the same dataset: a 5-NN classifier fitted on
# train + validation ("baselines") and on a SMOTE-resampled copy of that data
# ("smoted"), both scored with ROC AUC on x_test/y_test.
baselines = []
smoted = []
for idx in range(31):
	data_key = f"{idx}_{dataset_name}"
	x_train = data_mapper[data_key]['x_train']
	y_train = data_mapper[data_key]['y_train']

	x_validation = data_mapper[data_key]['x_validation']
	y_validation = data_mapper[data_key]['y_validation']

	x_test = data_mapper[data_key]['x_test']
	y_test = data_mapper[data_key]['y_test']

	x_prior = np.concatenate((x_train, x_validation), axis=0)
	y_prior = np.concatenate((y_train, y_validation), axis=0)

	model = KNeighborsClassifier(n_neighbors=5)
	model.fit(x_prior, y_prior)
	y_pred = model.predict(x_test)
	baselines.append(roc_auc_score(y_test, y_pred))

	try:
		x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)

		model = KNeighborsClassifier(n_neighbors=5)
		model.fit(x_smote, y_smote)
		y_pred = model.predict(x_test)
		smoted.append(roc_auc_score(y_test, y_pred))
	except Exception as err:
		# Best effort: a failing split still gets an entry so the list keeps one
		# value per index, but the failure is reported instead of being hidden.
		# Catching Exception (not a bare except) lets Ctrl-C through.
		# NOTE(review): a 0 placeholder pulls the mean and the rank test down -
		# consider dropping failed splits instead.
		print(f"SMOTE evaluation failed for {data_key}: {err!r}; recording 0")
		smoted.append(0)

In [29]:
# Pairwise rank-sum p-values followed by the mean AUC of each method
# (B = baselines, S = smoted, C = cvae).
auc_by_method = {"B": baselines, "S": smoted, "C": cvae}

for left, right in (("B", "S"), ("B", "C"), ("S", "C")):
	p_value = ranksums(auc_by_method[left], auc_by_method[right]).pvalue
	print(f"\t> {left} vs {right} pval {p_value}")

for tag in ("S", "B", "C"):
	print(f"\t> {tag}: {np.mean(auc_by_method[tag])}")

print("")

	> B vs S pval 0.0074745082907966
	> B vs C pval 0.008297147842779923
	> S vs C pval 0.9775368152989685
	> S: 0.8466606088141754
	> B: 0.6988111464485841
	> C: 0.8424579736483416

