In [1]:
from model.model import ConditionalVAE
from model.dataset import TabularDataset
from model.utils.visualization import PCA_plot, PCA_plot_rare_on_top
from model.utils.optimization import *
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from collections import Counter
from matplotlib.cm import get_cmap
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score

from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
from itertools import product
from imblearn.over_sampling import (
	SMOTE,
	ADASYN,
	BorderlineSMOTE,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def mask_features(x, min_mask: int = 1, max_mask: int = 3):
	x_masked = x.clone()
	for i in range(x.size(0)):
		k = torch.randint(min_mask, max_mask + 1, (1,)).item()
		idx = torch.randperm(x.size(1))[:k]
		x_masked[i, idx] = 0
	return x_masked

def execute(x_train, y_train, x_validation, y_validation, do_print=False):
	x_prior = np.concatenate((x_train, x_validation), axis=0)
	y_prior = np.concatenate((y_train, y_validation), axis=0)

	x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)

	minority_label = pd.DataFrame(y_train).value_counts().argmin()
	minority_indices = np.where(y_train==minority_label)[0]
	minority_features = x_train[minority_indices]
	minority_labels = y_train[minority_indices]

	num_features = x_prior[0].shape[0]

	nearest_neighbours = NearestNeighbors(n_neighbors=5, metric="euclidean").fit(x_smote)
	dist, idx = nearest_neighbours.kneighbors(x_smote)

	dist = dist[:, 1:]          # shape: (n_samples, k)
	idx  = idx[:, 1:]           # shape: (n_samples, k)

	knn_features = [x_smote[row_idx] for row_idx in idx]
	knn_labels = [y_smote[row_idx] for row_idx in idx]

	input_set = []
	recon_set = []
	labels = []
	for s_idx, sample in enumerate(x_smote):
		
		for n_idx, neighbouring_sample in enumerate(knn_features[s_idx]):
			if y_smote[s_idx] == knn_labels[s_idx][n_idx]:
				input_set.append(sample)
				recon_set.append(neighbouring_sample)
				labels.append(y_smote[s_idx])

	h1 = num_features + (num_features//2)
	h2 = num_features * 2
	latent_dim = 20

	cvae = ConditionalVAE(
		input_dim=num_features, 
		h1=h1,
		h2 = h2,
		latent_dim=latent_dim).to(device)

	epochs = 900
	batch_size = 32
	lr = 1e-3
	beta = 0.8

	data = TabularDataset(x_smote, x_smote, y_smote)
	loader = DataLoader(data, batch_size=batch_size, shuffle=True)

	cvae.train()
	opt = optim.Adam(cvae.parameters(), lr=lr)
	for epoch in range(1, epochs + 1):
		running = 0
		for encode_in, decode_comp, label in loader:
			xb = encode_in.float().to(device)
			yb = decode_comp.float().to(device)
			label = label.float().to(device)

			# xb_mask = mask_features(xb)
			
			recon, mu, logvar = cvae(xb, label)
			# recon_loss = nn.MSELoss()(recon, xb)
			# kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			# loss = recon_loss + (kl_div*beta)
			recon_loss = nn.MSELoss(reduction='sum')(recon, xb)
			kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			loss = (recon_loss + beta * kl_div) / xb.size(0)   # per-batch average

			opt.zero_grad()
			loss.backward()
			opt.step()

			running += loss.item()
		
		if (epoch % 5 == 0 or epoch == epochs) and do_print:
			print(f"Epoch {epoch:03d} | loss: {running / len(loader):.4f}")

	epochs = 400

	data = TabularDataset(
		np.array(input_set), 
		np.array(recon_set), 
		np.array(labels)
	)
	loader = DataLoader(data, batch_size=batch_size, shuffle=True)

	cvae.train()
	opt = optim.Adam(cvae.parameters(), lr=lr)
	for epoch in range(1, epochs + 1):
		running = 0
		for encode_in, decode_comp, label in loader:
			xb = encode_in.float().to(device)
			xb_masked = mask_features(xb)
			yb = decode_comp.float().to(device)
			label = label.float().to(device)
			# xb_mask = mask_features(xb)

			recon, mu, logvar = cvae(xb_masked, label)
			# recon_loss = nn.MSELoss()(recon, xb)
			# kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			# loss = recon_loss + (kl_div*beta)
			recon_loss = nn.MSELoss(reduction='sum')(recon, yb)
			kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			loss = (recon_loss + beta * kl_div) / xb.size(0)   # per-batch average

			opt.zero_grad()
			loss.backward()
			opt.step()

			running += loss.item()

		if (epoch % 5 == 0 or epoch == epochs) and do_print:
			print(f"Epoch {epoch:03d} | loss: {running / len(loader):.4f}")

	feature_variance = np.var(x_prior, axis=0)
	feature_mins = np.min(x_prior, axis=0)
	feature_maxs = np.max(x_prior, axis=0)
	feature_grids = []
	for idx, var in enumerate(feature_variance):
		lo  = feature_mins[idx] - var      # lower bound  (min – variance)
		hi  = feature_maxs[idx] + var      # upper bound  (max + variance)
		step = var * 2 or 1e-8             # avoid step == 0 if var == 0
		grid = np.arange(lo, hi + step, step)
		feature_grids.append(grid)

	# --- Cartesian product  -------------------------------------------------
	# itertools.product is lazy ⇒ less memory than meshgrid on huge spaces
	synthetic_X = np.fromiter(
		(val for combo in product(*feature_grids) for val in combo),
		dtype=float
	).reshape(-1, len(feature_grids))

	if do_print:
		print(f"{synthetic_X.shape[0]:,} synthetic rows × {synthetic_X.shape[1]} features")

	test_labels = torch.tensor([minority_label] * synthetic_X.shape[0])

	cvae.eval()
	with torch.no_grad():
		xb = torch.tensor(synthetic_X).float().to(device)
		mu, logvar = cvae.encode(xb)
		z = cvae.reparameterize(mu, logvar)
		synthetic_X_DECODE = cvae.decode(z, test_labels)  

	synthetic_Y = torch.tensor([minority_label] * synthetic_X.shape[0])

	problem = NSGA_II_Filter(
		np.concatenate((x_prior, synthetic_X_DECODE), axis=0), 
		np.concatenate((y_prior, synthetic_Y), axis=0), 
		x_validation, y_validation,
	)
	algorithm = NSGA2(
		pop_size=500, 
		sampling=DiverseSampling(), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', 10),
		save_history=False,
	)

	candidate_x = np.concatenate((x_prior, synthetic_X_DECODE), axis=0)
	candidate_y = np.concatenate((y_prior, synthetic_Y), axis=0)

	max_validation_auc = -1
	best_x = None
	best_y = None

	for x in result.X:
		filtered_x = candidate_x[x]
		filtered_y = candidate_y[x]
		
		model = KNeighborsClassifier(n_neighbors=5)
		model.fit(filtered_x, filtered_y)
		y_pred = model.predict(x_validation)
		auc = roc_auc_score(y_validation, y_pred)

		if auc > max_validation_auc:
			best_x = filtered_x
			best_y = filtered_y
			max_validation_auc = auc

	return best_x, best_y


In [2]:
with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)
splits = pd.read_csv('data_splits.csv')

data_keys = []
for split_name in splits:
	for idx in range(31):
		data_keys.append(f"{idx}_{split_name}")

In [4]:
data_key = data_keys[0]
x_train = data_mapper[data_key]['x_train'] 
y_train = data_mapper[data_key]['y_train']
x_validation = data_mapper[data_key]['x_validation'] 
y_validation = data_mapper[data_key]['y_validation']
x_test = data_mapper[data_key]['x_test'] 
y_test = data_mapper[data_key]['y_test']

In [4]:
splits

Unnamed: 0,abalone-17_vs_7-8-9-10,abalone-20_vs_8-9-10,abalone-21_vs_8,abalone-3_vs_11,abalone19,abalone9-18,cleveland-0_vs_4,ecoli-0-1-4-7_vs_2-3-5-6,ecoli-0-1-4-7_vs_5-6,ecoli-0-2-6-7_vs_3-5,ecoli-0-4-6_vs_5,ecoli-0-6-7_vs_3-5,ecoli4,glass-0-6_vs_5,winequality-red-4,winequality-red-8_vs_6-7,winequality-white-9_vs_4,wisconsin,yeast-0-2-5-7-9_vs_3-6-8,yeast-2_vs_4
0,0_abalone-17_vs_7-8-9-10,0_abalone-20_vs_8-9-10,0_abalone-21_vs_8,0_abalone-3_vs_11,0_abalone19,0_abalone9-18,0_cleveland-0_vs_4,0_ecoli-0-1-4-7_vs_2-3-5-6,0_ecoli-0-1-4-7_vs_5-6,0_ecoli-0-2-6-7_vs_3-5,0_ecoli-0-4-6_vs_5,0_ecoli-0-6-7_vs_3-5,0_ecoli4,0_glass-0-6_vs_5,0_winequality-red-4,0_winequality-red-8_vs_6-7,0_winequality-white-9_vs_4,0_wisconsin,0_yeast-0-2-5-7-9_vs_3-6-8,0_yeast-2_vs_4
1,1_abalone-17_vs_7-8-9-10,1_abalone-20_vs_8-9-10,1_abalone-21_vs_8,1_abalone-3_vs_11,1_abalone19,1_abalone9-18,1_cleveland-0_vs_4,1_ecoli-0-1-4-7_vs_2-3-5-6,1_ecoli-0-1-4-7_vs_5-6,1_ecoli-0-2-6-7_vs_3-5,1_ecoli-0-4-6_vs_5,1_ecoli-0-6-7_vs_3-5,1_ecoli4,1_glass-0-6_vs_5,1_winequality-red-4,1_winequality-red-8_vs_6-7,1_winequality-white-9_vs_4,1_wisconsin,1_yeast-0-2-5-7-9_vs_3-6-8,1_yeast-2_vs_4
2,2_abalone-17_vs_7-8-9-10,2_abalone-20_vs_8-9-10,2_abalone-21_vs_8,2_abalone-3_vs_11,2_abalone19,2_abalone9-18,2_cleveland-0_vs_4,2_ecoli-0-1-4-7_vs_2-3-5-6,2_ecoli-0-1-4-7_vs_5-6,2_ecoli-0-2-6-7_vs_3-5,2_ecoli-0-4-6_vs_5,2_ecoli-0-6-7_vs_3-5,2_ecoli4,2_glass-0-6_vs_5,2_winequality-red-4,2_winequality-red-8_vs_6-7,2_winequality-white-9_vs_4,2_wisconsin,2_yeast-0-2-5-7-9_vs_3-6-8,2_yeast-2_vs_4
3,3_abalone-17_vs_7-8-9-10,3_abalone-20_vs_8-9-10,3_abalone-21_vs_8,3_abalone-3_vs_11,3_abalone19,3_abalone9-18,3_cleveland-0_vs_4,3_ecoli-0-1-4-7_vs_2-3-5-6,3_ecoli-0-1-4-7_vs_5-6,3_ecoli-0-2-6-7_vs_3-5,3_ecoli-0-4-6_vs_5,3_ecoli-0-6-7_vs_3-5,3_ecoli4,3_glass-0-6_vs_5,3_winequality-red-4,3_winequality-red-8_vs_6-7,3_winequality-white-9_vs_4,3_wisconsin,3_yeast-0-2-5-7-9_vs_3-6-8,3_yeast-2_vs_4
4,4_abalone-17_vs_7-8-9-10,4_abalone-20_vs_8-9-10,4_abalone-21_vs_8,4_abalone-3_vs_11,4_abalone19,4_abalone9-18,4_cleveland-0_vs_4,4_ecoli-0-1-4-7_vs_2-3-5-6,4_ecoli-0-1-4-7_vs_5-6,4_ecoli-0-2-6-7_vs_3-5,4_ecoli-0-4-6_vs_5,4_ecoli-0-6-7_vs_3-5,4_ecoli4,4_glass-0-6_vs_5,4_winequality-red-4,4_winequality-red-8_vs_6-7,4_winequality-white-9_vs_4,4_wisconsin,4_yeast-0-2-5-7-9_vs_3-6-8,4_yeast-2_vs_4
5,5_abalone-17_vs_7-8-9-10,5_abalone-20_vs_8-9-10,5_abalone-21_vs_8,5_abalone-3_vs_11,5_abalone19,5_abalone9-18,5_cleveland-0_vs_4,5_ecoli-0-1-4-7_vs_2-3-5-6,5_ecoli-0-1-4-7_vs_5-6,5_ecoli-0-2-6-7_vs_3-5,5_ecoli-0-4-6_vs_5,5_ecoli-0-6-7_vs_3-5,5_ecoli4,5_glass-0-6_vs_5,5_winequality-red-4,5_winequality-red-8_vs_6-7,5_winequality-white-9_vs_4,5_wisconsin,5_yeast-0-2-5-7-9_vs_3-6-8,5_yeast-2_vs_4
6,6_abalone-17_vs_7-8-9-10,6_abalone-20_vs_8-9-10,6_abalone-21_vs_8,6_abalone-3_vs_11,6_abalone19,6_abalone9-18,6_cleveland-0_vs_4,6_ecoli-0-1-4-7_vs_2-3-5-6,6_ecoli-0-1-4-7_vs_5-6,6_ecoli-0-2-6-7_vs_3-5,6_ecoli-0-4-6_vs_5,6_ecoli-0-6-7_vs_3-5,6_ecoli4,6_glass-0-6_vs_5,6_winequality-red-4,6_winequality-red-8_vs_6-7,6_winequality-white-9_vs_4,6_wisconsin,6_yeast-0-2-5-7-9_vs_3-6-8,6_yeast-2_vs_4
7,7_abalone-17_vs_7-8-9-10,7_abalone-20_vs_8-9-10,7_abalone-21_vs_8,7_abalone-3_vs_11,7_abalone19,7_abalone9-18,7_cleveland-0_vs_4,7_ecoli-0-1-4-7_vs_2-3-5-6,7_ecoli-0-1-4-7_vs_5-6,7_ecoli-0-2-6-7_vs_3-5,7_ecoli-0-4-6_vs_5,7_ecoli-0-6-7_vs_3-5,7_ecoli4,7_glass-0-6_vs_5,7_winequality-red-4,7_winequality-red-8_vs_6-7,7_winequality-white-9_vs_4,7_wisconsin,7_yeast-0-2-5-7-9_vs_3-6-8,7_yeast-2_vs_4
8,8_abalone-17_vs_7-8-9-10,8_abalone-20_vs_8-9-10,8_abalone-21_vs_8,8_abalone-3_vs_11,8_abalone19,8_abalone9-18,8_cleveland-0_vs_4,8_ecoli-0-1-4-7_vs_2-3-5-6,8_ecoli-0-1-4-7_vs_5-6,8_ecoli-0-2-6-7_vs_3-5,8_ecoli-0-4-6_vs_5,8_ecoli-0-6-7_vs_3-5,8_ecoli4,8_glass-0-6_vs_5,8_winequality-red-4,8_winequality-red-8_vs_6-7,8_winequality-white-9_vs_4,8_wisconsin,8_yeast-0-2-5-7-9_vs_3-6-8,8_yeast-2_vs_4
9,9_abalone-17_vs_7-8-9-10,9_abalone-20_vs_8-9-10,9_abalone-21_vs_8,9_abalone-3_vs_11,9_abalone19,9_abalone9-18,9_cleveland-0_vs_4,9_ecoli-0-1-4-7_vs_2-3-5-6,9_ecoli-0-1-4-7_vs_5-6,9_ecoli-0-2-6-7_vs_3-5,9_ecoli-0-4-6_vs_5,9_ecoli-0-6-7_vs_3-5,9_ecoli4,9_glass-0-6_vs_5,9_winequality-red-4,9_winequality-red-8_vs_6-7,9_winequality-white-9_vs_4,9_wisconsin,9_yeast-0-2-5-7-9_vs_3-6-8,9_yeast-2_vs_4


In [None]:
dataset_name = "ecoli-0-4-6_vs_5"

# results = {}
for dataset_name in splits:
	cvae_auc = []
	cvae_acc = []
	for idx in range(31):
		data_key = f"{idx}_{dataset_name}"
		if os.path.exists(f'results_new/{data_key}.pkl'): continue
		print(data_key)

		for _ in range(3):
			try:
				resample_x, resample_y = execute(
					data_mapper[data_key]['x_train'],
					data_mapper[data_key]['y_train'],
					data_mapper[data_key]['x_validation'],
					data_mapper[data_key]['y_validation']
				)
				break
			except:
				continue
				
		# model = KNeighborsClassifier(n_neighbors=5)
		# model.fit(resample_x, resample_y)
		# y_pred = model.predict(data_mapper[data_key]['x_test'])
		# cvae_auc.append(roc_auc_score(data_mapper[data_key]['y_test'], y_pred))	
		# cvae_acc.append(accuracy_score(data_mapper[data_key]['y_test'], y_pred))	
		with open(f'results_new/{data_key}.pkl', 'wb') as fh:
			pickle.dump((resample_x, resample_y), fh)

	# results[dataset_name] = (resample_x, resample_y)

1_cleveland-0_vs_4


NameError: name 'resample_x' is not defined

In [42]:
import numpy as np
from itertools import product
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# --- your grid construction (unchanged) --------------------------------
feature_variance = np.var(x_prior, axis=0)
feature_mins     = np.min(x_prior, axis=0)
feature_maxs     = np.max(x_prior, axis=0)

feature_grids = []
for idx, var in enumerate(feature_variance):
    lo   = feature_mins[idx] - var
    hi   = feature_maxs[idx] + var
    step = var * 2 or 1e-8       # avoid 0 step if var==0
    grid = np.arange(lo, hi + step, step)
    feature_grids.append(grid)

# Cartesian product → all combinations
synthetic_X = np.fromiter(
    (val for combo in product(*feature_grids) for val in combo),
    dtype=float
).reshape(-1, len(feature_grids))

# --- prune points too close to existing samples ------------------------
# Radius in *standard deviations* after z-scoring (e.g., 0.75 ~ within 0.75 SD)
radius_std = 2

# 1) scale to unit variance so distance is comparable across features
scaler = StandardScaler().fit(x_prior)
Xz = scaler.transform(x_prior)
Sz = scaler.transform(synthetic_X)

# 2) remove synthetic points that have any neighbor within the radius
nn = NearestNeighbors(algorithm="auto").fit(Xz)
# radius_neighbors returns, for each query row, indices of neighbors within 'radius'
neighbors_within = nn.radius_neighbors(Sz, radius=radius_std, return_distance=False)

keep_mask = np.fromiter((len(ix) == 0 for ix in neighbors_within),
                        dtype=bool, count=Sz.shape[0])
synthetic_X_pruned = synthetic_X[keep_mask]

print(f"Kept {synthetic_X_pruned.shape[0]} / {synthetic_X.shape[0]} synthetic points.")


Kept 503810 / 504000 synthetic points.


In [6]:

x_prior = np.concatenate((data_mapper[data_key]['x_train'], data_mapper[data_key]['x_validation']), axis=0)
y_prior = np.concatenate((data_mapper[data_key]['y_train'], data_mapper[data_key]['y_validation']), axis=0)    
x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_prior, y_prior)
y_pred = model.predict(x_test)
roc_auc_score(y_test, y_pred), accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)

(np.float64(0.5357142857142857),
 0.9777397260273972,
 np.float64(0.5357142857142857))

In [8]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_smote, y_smote)
y_pred = model.predict(x_test)
auc = roc_auc_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
bal_acc = balanced_accuracy_score(y_test, y_pred)
print(auc, acc, bal_acc)

0.7497493734335839 0.9195205479452054 0.7497493734335839


In [7]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(resample_x, resample_y)
y_pred = model.predict(x_test)
auc = roc_auc_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
bal_acc = balanced_accuracy_score(y_test, y_pred)
print(auc, acc, bal_acc)

0.76203007518797 0.9434931506849316 0.7620300751879698


In [20]:
dataset_name = "ecoli-0-4-6_vs_5"

baselines = []
smoted = []
for idx in range(31):
	data_key = f"{idx}_{dataset_name}"
	x_train = data_mapper[data_key]['x_train']
	y_train = data_mapper[data_key]['y_train']

	x_validation = data_mapper[data_key]['x_validation']
	y_validation = data_mapper[data_key]['y_validation']

	x_test = data_mapper[data_key]['x_test']
	y_test = data_mapper[data_key]['y_test']

	x_prior = np.concatenate((x_train, x_validation), axis=0)
	y_prior = np.concatenate((y_train, y_validation), axis=0)  

	model = KNeighborsClassifier(n_neighbors=5)
	model.fit(x_prior, y_prior)
	y_pred = model.predict(x_test)
	baselines.append(roc_auc_score(y_test, y_pred))

	try:
		x_smote, y_smote = SMOTE().fit_resample(x_prior, y_prior)


		model = KNeighborsClassifier(n_neighbors=5)
		model.fit(x_smote, y_smote)
		y_pred = model.predict(x_test)
		smoted.append(roc_auc_score(y_test, y_pred))
	except:
		smoted.append(0)

print(f"\t> B vs S pval {ranksums(baselines, smoted).pvalue}")
print(f"\t> B vs C pval {ranksums(baselines, cvae).pvalue}")
print(f"\t> S vs C pval {ranksums(smoted, cvae).pvalue}")
print(f"\t> S: {np.mean(smoted)}")
print(f"\t> B: {np.mean(baselines)}")
print(f"\t> C: {np.mean(cvae)}")
print("")

		


	> B vs S pval 0.8327479062272571
	> B vs C pval 0.45132752491681716
	> S vs C pval 0.4222761086468382
	> S: 0.9083450210378682
	> B: 0.9018934081346425
	> C: 0.8930575035063114

