In [1]:
from model.model import ConditionalVAE
from model.dataset import TabularDataset
from model.utils.visualization import PCA_plot, PCA_plot_rare_on_top
from model.utils.optimization import *
from model.utils.analysis import *
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from collections import Counter
from matplotlib.cm import get_cmap
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score

from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
from itertools import product
from imblearn.over_sampling import (
	SMOTE,
	ADASYN,
	RandomOverSampler,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def mask_features(x, min_mask: int = 1, max_mask: int = 3):
	x_masked = x.clone()
	for i in range(x.size(0)):
		k = torch.randint(min_mask, max_mask + 1, (1,)).item()
		idx = torch.randperm(x.size(1))[:k]
		x_masked[i, idx] = 0
	return x_masked


In [6]:
with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)
splits = pd.read_csv('data_splits.csv')

data_keys = []
for split_name in splits:
	for idx in range(31):
		data_keys.append(f"{idx}_{split_name}")

def KNN_analysis(x, y, x_test, y_test):
	model = KNeighborsClassifier(n_neighbors=5)
	model.fit(x, y)
	y_pred = model.predict(x_test)

	auc = roc_auc_score(y_test, y_pred)
	acc = accuracy_score(y_test, y_pred)

	counts = pd.DataFrame(y).value_counts()
	ir = counts.max()/counts.min()

	return ir, acc, auc


In [None]:
def train_model(X, Y):
	minority_label = get_minority_label(Y)
	num_features = X[0].shape[0]
	nearest_neighbours = NearestNeighbors(n_neighbors=5, metric="euclidean").fit(X)
	dist, idx = nearest_neighbours.kneighbors(X)
	dist = dist[:, 1:]
	idx  = idx[:, 1:]

	knn_features = [X[row_idx] for row_idx in idx]
	knn_labels = [X[row_idx] for row_idx in idx]

	input_set, recon_set, labels = [], [], []
	for s_idx, sample in enumerate(X):
		for n_idx, neighbouring_sample in enumerate(knn_features[s_idx]):
			if Y[s_idx] == knn_labels[s_idx][n_idx]:
				input_set.append(sample)
				recon_set.append(neighbouring_sample)
				labels.append(Y[s_idx])


	cvae = ConditionalVAE(
		input_dim=num_features, 
		h1=num_features + (num_features//2), 
		h2=num_features * 2, 
		latent_dim=20
		).to(device)

	epochs = 900
	batch_size = 32
	lr = 1e-3
	beta = 0.8

	data = TabularDataset(X, X, Y)
	loader = DataLoader(data, batch_size=batch_size, shuffle=True)

	cvae.train()
	opt = optim.Adam(cvae.parameters(), lr=lr)
	for epoch in range(1, epochs + 1):
		for encode_in, decode_comp, label in loader:
			xb = encode_in.float().to(device)
			yb = decode_comp.float().to(device)
			label = label.float().to(device)
			# xb_mask = mask_features(xb)
			recon, mu, logvar = cvae(xb, label)
			recon_loss = nn.MSELoss(reduction='sum')(recon, xb)
			kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			loss = (recon_loss + beta * kl_div) / xb.size(0)   # per-batch average

			opt.zero_grad()
			loss.backward()
			opt.step()

	epochs = 400

	data = TabularDataset(
		np.array(input_set), 
		np.array(recon_set), 
		np.array(labels)
	)
	loader = DataLoader(data, batch_size=batch_size, shuffle=True)

	cvae.train()
	opt = optim.Adam(cvae.parameters(), lr=lr)
	for epoch in range(1, epochs + 1):
		for encode_in, decode_comp, label in loader:
			xb = encode_in.float().to(device)
			xb_masked = mask_features(xb)
			yb = decode_comp.float().to(device)
			label = label.float().to(device)
			recon, mu, logvar = cvae(xb_masked, label)
			recon_loss = nn.MSELoss(reduction='sum')(recon, yb)
			kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
			loss = (recon_loss + beta * kl_div) / xb.size(0)   # per-batch average

			opt.zero_grad()
			loss.backward()
			opt.step()

	return cvae

for dataset_name in splits:
	if dataset_name in ['cleveland-0_vs_4', 'wisconsin']: continue
	if any(flag in dataset_name for flag in ['winequality', 'yeast']): continue
	
	for idx in range(31):
		data_key = f"{idx}_{dataset_name}"
		print(data_key)
		
		X = np.concatenate((data_mapper[data_key]['x_train'], data_mapper[data_key]['x_validation']), axis=0)
		Y = np.concatenate((data_mapper[data_key]['y_train'], data_mapper[data_key]['y_validation']), axis=0)
		
		# model = train_model(X, Y)
		# with open(f'model_types/base_{data_key}.pkl') as fh: pickle.dump(model, fh)

		ros_x, ros_y = RandomOverSampler().fit_resample(X, Y)	
		model = train_model(X, Y)
		with open(f'model_types/ros_{data_key}.pkl') as fh: pickle.dump(model, fh)

		smote_x, smote_y = SMOTE().fit_resample(X, Y)	
		model = train_model(X, Y)
		with open(f'model_types/smote_{data_key}.pkl') as fh: pickle.dump(model, fh)

		adasyn_x, adasyn_y = ADASYN().fit_resample(X, Y)	
		model = train_model(X, Y)
		with open(f'model_types/adasyn_{data_key}.pkl') as fh: pickle.dump(model, fh)


		# if os.path.exists(f'results_new/{data_key}.pkl'): continue
		
		for _ in range(3):
			try:
				pass
				# torch.save(model.state_dict(), f"models/{data_key}.mdl")
			except Exception as e:
				pass
		

0_abalone-17_vs_7-8-9-10
(np.float64(38.86363636363637), 0.9777397260273972, np.float64(0.5357142857142857))
(np.float64(1.0), 0.9434931506849316, np.float64(0.7271929824561404))
(np.float64(1.0), 0.910958904109589, np.float64(0.7453634085213032))
(np.float64(1.0058823529411764), 0.916095890410959, np.float64(0.7828320802005012))
