In [1]:
from model.model import ConditionalVAE
from model.dataset import TabularDataset
from model.utils.visualization import PCA_plot, PCA_plot_rare_on_top
from model.utils.optimization import *
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from collections import Counter
from matplotlib.cm import get_cmap
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score

from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
from itertools import product
from imblearn.over_sampling import (
	SMOTE,
	ADASYN,
	BorderlineSMOTE,
)

# Run the model and tensors on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def mask_features(x, min_mask: int = 1, max_mask: int = 3):
	"""Return a copy of `x` with a random subset of features zeroed in every row.

	For each row an integer k is drawn uniformly from [min_mask, max_mask] and
	k distinct, uniformly chosen columns of that row are set to 0. When k
	exceeds the number of columns the whole row is zeroed.

	Args:
		x: 2-D tensor indexed as (row, feature). It is not modified.
		min_mask: smallest number of features to zero per row (inclusive).
		max_mask: largest number of features to zero per row (inclusive).

	Returns:
		A new tensor with the same shape, dtype and device as `x`.

	Raises:
		ValueError: if `min_mask` is negative or greater than `max_mask`.
	"""
	if min_mask < 0 or max_mask < min_mask:
		raise ValueError(
			f"expected 0 <= min_mask <= max_mask, got min_mask={min_mask}, max_mask={max_mask}"
		)

	n_rows, n_cols = x.shape

	# One k per row, drawn on x's device so no host round-trip is needed.
	per_row_k = torch.randint(min_mask, max_mask + 1, (n_rows, 1), device=x.device)

	# argsort of argsort turns i.i.d. noise into a uniformly random permutation
	# rank (0 .. n_cols-1) per row; the k lowest ranks are k distinct random columns.
	ranks = torch.rand(n_rows, n_cols, device=x.device).argsort(dim=1).argsort(dim=1)

	# masked_fill is out-of-place, so the caller's tensor is left untouched.
	return x.masked_fill(ranks < per_row_k, 0)

def _train_cvae(cvae, loader, epochs, lr, beta, neighbour_phase=False):
	"""Run one Adam training phase of `cvae` over `loader`, printing the loss every 5 epochs.

	With `neighbour_phase=False` the model is fed each batch unchanged and is
	trained to reconstruct that same batch. With `neighbour_phase=True` the input
	is passed through `mask_features` first and the reconstruction target is the
	second tensor yielded by the loader.
	"""
	# Referenced through `torch.nn` so the loss does not depend on the notebook-level
	# alias `nn`, which is easy to shadow from another cell.
	sum_squared_error = torch.nn.MSELoss(reduction='sum')

	cvae.train()
	opt = optim.Adam(cvae.parameters(), lr=lr)
	for epoch in range(1, epochs + 1):
		running = 0.0
		for encode_in, decode_comp, label in loader:
			xb = encode_in.float().to(device)
			label = label.float().to(device)
			if neighbour_phase:
				model_in = mask_features(xb)
				target = decode_comp.float().to(device)
			else:
				model_in, target = xb, xb

			recon, mu, logvar = cvae(model_in, label)
			recon_loss = sum_squared_error(recon, target)
			kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			loss = (recon_loss + beta * kl_div) / xb.size(0)   # per-batch average

			opt.zero_grad()
			loss.backward()
			opt.step()

			running += loss.item()

		if epoch % 5 == 0 or epoch == epochs:
			print(f"Epoch {epoch:03d} | loss: {running / len(loader):.4f}")


def _variance_spaced_grid(x):
	"""Return the Cartesian product of per-feature grids spanning `x`.

	Each feature's grid runs from (min - variance) to roughly (max + variance) in
	steps of 2 * variance; a zero-variance feature falls back to a 1e-8 step.
	The number of rows is the product of the per-feature grid sizes, so it grows
	very quickly with the number of features.
	"""
	feature_variance = np.var(x, axis=0)
	feature_mins = np.min(x, axis=0)
	feature_maxs = np.max(x, axis=0)

	feature_grids = []
	for lo_raw, hi_raw, var in zip(feature_mins, feature_maxs, feature_variance):
		step = var * 2 or 1e-8             # avoid step == 0 if var == 0
		feature_grids.append(np.arange(lo_raw - var, hi_raw + var + step, step))

	return np.fromiter(
		(val for combo in product(*feature_grids) for val in combo),
		dtype=float
	).reshape(-1, len(feature_grids))


def execute(
	x_train,
	y_train,
	x_validation,
	y_validation,
	*,
	pretrain_epochs=900,
	finetune_epochs=400,
	batch_size=32,
	lr=1e-3,
	beta=0.8,
):
	"""Build a resampled training set from CVAE-decoded candidates filtered by NSGA-II.

	Steps, in order:
	1. Concatenate train and validation data and balance it with SMOTE.
	2. Train a ConditionalVAE to reconstruct the SMOTE samples, then continue
	   training it to map a feature-masked sample onto one of its same-label
	   nearest neighbours.
	3. Push a variance-spaced grid of points through the encoder and decode them
	   with the minority-class label to obtain synthetic candidates.
	4. Let NSGA-II (problem `NSGA_II_Filter`) propose boolean row masks over
	   real + synthetic rows, and keep the mask whose 5-NN classifier scores best
	   on the validation set.

	Args:
		x_train, y_train: training features and labels (array-like, 2-D / 1-D;
			labels must be numeric because they are cast to float for the model).
		x_validation, y_validation: validation features and labels.
		pretrain_epochs: epochs for the plain reconstruction phase.
		finetune_epochs: epochs for the masked-input / neighbour-target phase.
		batch_size, lr, beta: DataLoader batch size, Adam learning rate and the
			weight of the KL term in the loss.

	Returns:
		(best_x, best_y): the selected rows and labels. Both are None if the
		optimiser returns an empty set of solutions.

	Raises:
		RuntimeError: if the optimiser reports no solution at all.

	NOTE(review): the validation rows are part of the candidate pool and are also
	used to score each mask, so the selection score is optimistic. The score is
	`roc_auc_score` computed on hard class predictions rather than on scores.
	"""
	x_prior = np.concatenate((x_train, x_validation), axis=0)
	y_prior = np.concatenate((y_train, y_validation), axis=0)

	x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)
	x_smote = np.asarray(x_smote)
	y_smote = np.asarray(y_smote)

	# Rarest class label in the training split. (An argmin over value_counts()
	# would give a position in the sorted counts, not the label itself.)
	classes, class_counts = np.unique(np.asarray(y_train), return_counts=True)
	minority_label = classes[np.argmin(class_counts)]

	num_features = x_prior.shape[1]

	# 5 neighbours per SMOTE sample; the first column is dropped because, when the
	# set is queried against itself, it is normally the sample itself.
	nearest_neighbours = NearestNeighbors(n_neighbors=5, metric="euclidean").fit(x_smote)
	_, idx = nearest_neighbours.kneighbors(x_smote)
	neighbour_idx = idx[:, 1:]           # shape: (n_samples, k)

	# (sample, neighbour) pairs restricted to neighbours that share the sample's
	# label. np.nonzero walks row-major, i.e. sample by sample, neighbour by neighbour.
	same_label = y_smote[neighbour_idx] == y_smote[:, None]
	rows, cols = np.nonzero(same_label)
	input_set = x_smote[rows]
	recon_set = x_smote[neighbour_idx[rows, cols]]
	labels = y_smote[rows]

	h1 = 15
	h2 = 18
	latent_dim = 20

	cvae = ConditionalVAE(
		input_dim=num_features,
		h1=h1,
		h2=h2,
		latent_dim=latent_dim).to(device)

	# Phase 1: plain reconstruction of the SMOTE samples.
	data = TabularDataset(x_smote, x_smote, y_smote)
	loader = DataLoader(data, batch_size=batch_size, shuffle=True)
	_train_cvae(cvae, loader, pretrain_epochs, lr, beta, neighbour_phase=False)

	# Phase 2: masked sample in, same-label neighbour out (fresh optimiser state).
	data = TabularDataset(input_set, recon_set, labels)
	loader = DataLoader(data, batch_size=batch_size, shuffle=True)
	_train_cvae(cvae, loader, finetune_epochs, lr, beta, neighbour_phase=True)

	synthetic_X = _variance_spaced_grid(x_prior)
	n_synthetic = synthetic_X.shape[0]

	print(f"{n_synthetic:,} synthetic rows × {synthetic_X.shape[1]} features")

	# Condition labels use the same dtype and device as during training.
	decode_labels = torch.full((n_synthetic,), float(minority_label), device=device)

	cvae.eval()
	with torch.no_grad():
		xb = torch.tensor(synthetic_X).float().to(device)
		mu, logvar = cvae.encode(xb)
		z = cvae.reparameterize(mu, logvar)
		decoded = cvae.decode(z, decode_labels)

	# Back to host NumPy arrays before mixing with the real data; np.concatenate
	# cannot consume tensors that live on a CUDA device.
	synthetic_X_DECODE = decoded.cpu().numpy()
	synthetic_Y = np.full(n_synthetic, minority_label)

	candidate_x = np.concatenate((x_prior, synthetic_X_DECODE), axis=0)
	candidate_y = np.concatenate((y_prior, synthetic_Y), axis=0)

	problem = NSGA_II_Filter(
		candidate_x,
		candidate_y,
		x_validation, y_validation,
	)
	algorithm = NSGA2(
		pop_size=500,
		sampling=DiverseSampling(),
		crossover=HUX(),
		mutation=BitflipMutation(),
		eliminate_duplicates=True
	)
	result = minimize(
		problem,
		algorithm,
		('n_gen', 10),
		save_history=False,
	)

	if result.X is None:
		raise RuntimeError("NSGA-II returned no solution; cannot select a resampled set")

	# A single solution may come back as a 1-D vector; always iterate over rows of masks.
	solution_masks = np.atleast_2d(np.asarray(result.X)).astype(bool)

	max_validation_auc = -1
	best_x = None
	best_y = None

	for keep_mask in solution_masks:
		filtered_x = candidate_x[keep_mask]
		filtered_y = candidate_y[keep_mask]

		model = KNeighborsClassifier(n_neighbors=5)
		model.fit(filtered_x, filtered_y)
		y_pred = model.predict(x_validation)
		auc = roc_auc_score(y_validation, y_pred)

		if auc > max_validation_auc:
			best_x = filtered_x
			best_y = filtered_y
			max_validation_auc = auc

	return best_x, best_y


In [2]:
# Load the pickled mapping of datasets and the CSV whose column names identify the splits.
# NOTE: unpickling can execute arbitrary code, so only open a data.pickle you trust.
with open('data.pickle', 'rb') as pickle_file:
	data_mapper = pickle.load(pickle_file)
splits = pd.read_csv('data_splits.csv')

# Keys have the form "<index>_<split column>". Iterating a DataFrame yields its
# column labels, and 31 indices are generated for each of them.
data_keys = [
	f"{idx}_{split_name}"
	for split_name in splits
	for idx in range(31)
]

In [4]:
# Work with the first generated key and unpack its train / validation / test arrays.
data_key = data_keys[0]
selected_split = data_mapper[data_key]
x_train, y_train = selected_split['x_train'], selected_split['y_train']
x_validation, y_validation = selected_split['x_validation'], selected_split['y_validation']
x_test, y_test = selected_split['x_test'], selected_split['y_test']

In [5]:
# Run the full resampling pipeline on the train and validation parts of the selected split.
pipeline_inputs = [
    data_mapper[data_key][part]
    for part in ('x_train', 'y_train', 'x_validation', 'y_validation')
]
resample_x, resample_y = execute(*pipeline_inputs)

Epoch 005 | loss: 3.2177
Epoch 010 | loss: 2.4644
Epoch 015 | loss: 2.2331
Epoch 020 | loss: 2.1827
Epoch 025 | loss: 2.1687
Epoch 030 | loss: 2.1440
Epoch 035 | loss: 2.1615
Epoch 040 | loss: 2.1321
Epoch 045 | loss: 2.1441
Epoch 050 | loss: 2.1305
Epoch 055 | loss: 2.1418
Epoch 060 | loss: 2.1394
Epoch 065 | loss: 2.1153
Epoch 070 | loss: 2.1057
Epoch 075 | loss: 2.1238
Epoch 080 | loss: 2.1180
Epoch 085 | loss: 2.1062
Epoch 090 | loss: 2.1372
Epoch 095 | loss: 2.1440
Epoch 100 | loss: 2.1122
Epoch 105 | loss: 2.1220
Epoch 110 | loss: 2.1129
Epoch 115 | loss: 2.1419
Epoch 120 | loss: 2.1025
Epoch 125 | loss: 2.0955
Epoch 130 | loss: 2.1043
Epoch 135 | loss: 2.1195
Epoch 140 | loss: 2.0898
Epoch 145 | loss: 2.0879
Epoch 150 | loss: 2.0590
Epoch 155 | loss: 2.0117
Epoch 160 | loss: 1.9935
Epoch 165 | loss: 1.9635
Epoch 170 | loss: 1.9527
Epoch 175 | loss: 1.9543
Epoch 180 | loss: 1.9586
Epoch 185 | loss: 1.9258
Epoch 190 | loss: 1.9489
Epoch 195 | loss: 1.9284
Epoch 200 | loss: 1.9608


In [42]:
import numpy as np
from itertools import product
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# --- inputs --------------------------------------------------------------
# Built here from the split loaded earlier in the notebook, so this cell runs
# top-to-bottom on a fresh kernel instead of relying on a later cell.
x_prior = np.concatenate((x_train, x_validation), axis=0)

# --- grid construction ---------------------------------------------------
# Per feature: from (min - variance) to about (max + variance), step 2 * variance.
feature_variance = np.var(x_prior, axis=0)
feature_mins     = np.min(x_prior, axis=0)
feature_maxs     = np.max(x_prior, axis=0)

feature_grids = []
for idx, var in enumerate(feature_variance):
    lo   = feature_mins[idx] - var
    hi   = feature_maxs[idx] + var
    step = var * 2 or 1e-8       # avoid 0 step if var==0
    grid = np.arange(lo, hi + step, step)
    feature_grids.append(grid)

# Cartesian product → all combinations
synthetic_X = np.fromiter(
    (val for combo in product(*feature_grids) for val in combo),
    dtype=float
).reshape(-1, len(feature_grids))

# --- prune points too close to existing samples ------------------------
# Radius is measured in z-scored feature space, i.e. in units of standard deviations.
radius_std = 2

# 1) scale to unit variance so distance is comparable across features
scaler = StandardScaler().fit(x_prior)
Xz = scaler.transform(x_prior)
Sz = scaler.transform(synthetic_X)

# 2) remove synthetic points that have any real sample within the radius.
# The index is deliberately NOT called `nn`: that name is the notebook's alias
# for `torch.nn`, and rebinding it breaks every later use of `nn.<module>`.
neighbor_index = NearestNeighbors(algorithm="auto").fit(Xz)
# radius_neighbors returns, for each query row, indices of neighbors within 'radius'
neighbors_within = neighbor_index.radius_neighbors(Sz, radius=radius_std, return_distance=False)

# keep a synthetic point only when no real sample falls inside its radius
keep_mask = np.fromiter((len(ix) == 0 for ix in neighbors_within),
                        dtype=bool, count=Sz.shape[0])
synthetic_X_pruned = synthetic_X[keep_mask]

print(f"Kept {synthetic_X_pruned.shape[0]} / {synthetic_X.shape[0]} synthetic points.")


Kept 503810 / 504000 synthetic points.


In [6]:

# Baseline: 5-NN fitted on the untouched train + validation data, scored on the test split.
# The SMOTE-balanced copy is built here as well.
prior_split = data_mapper[data_key]
x_prior = np.concatenate((prior_split['x_train'], prior_split['x_validation']), axis=0)
y_prior = np.concatenate((prior_split['y_train'], prior_split['y_validation']), axis=0)
x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_prior, y_prior)
y_pred = model.predict(x_test)

# (ROC AUC on hard predictions, accuracy, balanced accuracy)
tuple(
    metric(y_test, y_pred)
    for metric in (roc_auc_score, accuracy_score, balanced_accuracy_score)
)

(np.float64(0.5357142857142857),
 0.9777397260273972,
 np.float64(0.5357142857142857))

In [8]:
# Same 5-NN evaluation on the test split, this time fitted on the SMOTE-balanced data.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_smote, y_smote)
y_pred = model.predict(x_test)
auc, acc, bal_acc = (
    metric(y_test, y_pred)
    for metric in (roc_auc_score, accuracy_score, balanced_accuracy_score)
)
print(auc, acc, bal_acc)

0.7497493734335839 0.9195205479452054 0.7497493734335839


In [7]:
# Same 5-NN evaluation on the test split, fitted on the set returned by execute().
model = KNeighborsClassifier(n_neighbors=5)
model.fit(resample_x, resample_y)
y_pred = model.predict(x_test)
auc, acc, bal_acc = (
    metric(y_test, y_pred)
    for metric in (roc_auc_score, accuracy_score, balanced_accuracy_score)
)
print(auc, acc, bal_acc)

0.76203007518797 0.9434931506849316 0.7620300751879698
