In [None]:
from imblearn.over_sampling import (
	SMOTE,
	ADASYN,
	BorderlineSMOTE,
)
from imblearn.combine import (
	SMOTETomek,
	SMOTEENN
)
from collections import defaultdict

from scipy.stats import ranksums


In [None]:
from joblib import Parallel, delayed
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

import wandb

with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)

In [3]:
splits = pd.read_csv('data_splits.csv')
splits.columns

Index(['abalone-17_vs_7-8-9-10', 'abalone-20_vs_8-9-10', 'abalone-21_vs_8',
       'abalone-3_vs_11', 'abalone19', 'abalone9-18', 'cleveland-0_vs_4',
       'ecoli-0-1-4-7_vs_2-3-5-6', 'ecoli-0-1-4-7_vs_5-6',
       'ecoli-0-2-6-7_vs_3-5', 'ecoli-0-4-6_vs_5', 'ecoli-0-6-7_vs_3-5',
       'ecoli4', 'glass-0-6_vs_5', 'winequality-red-4',
       'winequality-red-8_vs_6-7', 'winequality-white-9_vs_4', 'wisconsin',
       'yeast-0-2-5-7-9_vs_3-6-8', 'yeast-2_vs_4'],
      dtype='object')

In [4]:
# data_mapper.keys()
x_train = data_mapper['0_abalone-17_vs_7-8-9-10']["x_train"]
y_train = data_mapper['0_abalone-17_vs_7-8-9-10']["y_train"]

x_test = data_mapper['0_abalone-17_vs_7-8-9-10']["x_test"]
y_test = data_mapper['0_abalone-17_vs_7-8-9-10']["y_test"]

In [5]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
roc_auc_score(y_test, y_pred)

np.float64(0.5357142857142857)

In [20]:
competitor = {}
for split_name in splits:
	for idx in range(31):

		results = {}
		data_key = f"{idx}_{split_name}"

		x_train = data_mapper[data_key]["x_train"]
		y_train = data_mapper[data_key]["y_train"]

		x_validation = data_mapper[data_key]["x_validation"]
		y_validation = data_mapper[data_key]["y_validation"]

		x_comb = np.concatenate((x_train, x_validation), axis=0)
		y_comb = np.concatenate((y_train, y_validation), axis=0)
		
		x_test = data_mapper[data_key]["x_test"]
		y_test = data_mapper[data_key]["y_test"]
		# x_test = x_validation
		# y_test = y_validation

		try:
			smote = SMOTE(sampling_strategy='minority') 
			x, y = smote.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE'] = 0

		try:
			adasyn = ADASYN(sampling_strategy='minority')
			x, y = adasyn.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['ADASYN'] = roc_auc_score(y_test, y_pred)
		except:
			results['ADASYN'] = 0
		
		try:
			blsmote = BorderlineSMOTE(sampling_strategy='minority', kind='borderline-1')
			x, y = blsmote.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['Borderline SMOTE'] = roc_auc_score(y_test, y_pred)
		except:
			results['Borderline SMOTE'] = 0

		try:
			smt_tomek = SMOTETomek(sampling_strategy='auto')
			x, y = smt_tomek.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE Tomek'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE Tomek'] = 0
		
		try:
			smt_eenn = SMOTEENN(sampling_strategy='auto')
			x, y = smt_eenn.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE EENN'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE EENN'] = 0

		competitor[data_key] = results
competitor

{'0_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.712280701754386),
  'ADASYN': np.float64(0.7436090225563909),
  'Borderline SMOTE': np.float64(0.7567669172932331),
  'SMOTE Tomek': np.float64(0.7078947368421052),
  'SMOTE EENN': np.float64(0.7714285714285715)},
 '1_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7924812030075188),
  'ADASYN': np.float64(0.7550125313283209),
  'Borderline SMOTE': np.float64(0.7593984962406015),
  'SMOTE Tomek': np.float64(0.7933583959899749),
  'SMOTE EENN': np.float64(0.7802005012531328)},
 '2_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7114035087719298),
  'ADASYN': np.float64(0.7105263157894737),
  'Borderline SMOTE': np.float64(0.6513784461152883),
  'SMOTE Tomek': np.float64(0.7105263157894737),
  'SMOTE EENN': np.float64(0.7365914786967418)},
 '3_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.8525062656641603),
  'ADASYN': np.float64(0.8481203007518796),
  'Borderline SMOTE': np.float64(0.7497493734335839),
  'SMOTE Tomek': np.float64(0.8

In [None]:
pd.read_csv('long_run_2025-07-07-2obj.csv')

In [45]:

results = pd.read_csv('long_run_2025-07-07-2obj.csv')
# results = pd.read_csv('long_run_2025-07-08.csv')
# my_scheme = results['Optimized test AUC'].where(results['Dataset'] == 'abalone-20_vs_8-9-10')
my_scheme = results['Optimized test AUC'].where(results['Dataset'] == 'abalone-17_vs_7-8-9-10')
# my_scheme = results['Optimized validation AUC'].where(results['Dataset'] == 'abalone-20_vs_8-9-10')
# my_scheme = results['Ideal test AUC'].where(results['Dataset'] == 'abalone19')

# my_scheme = results['Optimized validation AUC']
# my_scheme = results['Optimized test AUC']
# my_scheme = results['Ideal test AUC']
# my_scheme = my_scheme.dropna()

In [47]:
values = defaultdict(list)
for data_key in competitor:
	# if "abalone19" in data_key:
	if "abalone-17_vs_7-8-9-10" in data_key:
	# if "abalone-20_vs_8-9-10" in data_key:
		
		for t in competitor[data_key]:
			values[t].append(competitor[data_key][t])

records = []
for t in values:
	
	record = {
		f"Competitor": t,
		f"Pval < 0.05": str(ranksums(values[t], my_scheme).pvalue < 0.05),
		f"Comp min": np.min(values[t]),
		f"Comp avg": np.mean(values[t]),
		f"Comp max": np.max(values[t]),
		f"cVAE min": np.min(my_scheme),
		f"cVAE avg": np.mean(my_scheme),
		f"cVAE max": np.max(my_scheme),
	}
	records.append(record)
pd.DataFrame.from_records(records)#.to_csv('comparison.csv', index=False)
	

Unnamed: 0,Competitor,Pval < 0.05,Comp min,Comp avg,Comp max,cVAE min,cVAE avg,cVAE max
0,SMOTE,True,0.674812,0.782064,0.895238,0.723434,0.843338,0.932456
1,ADASYN,True,0.676566,0.780706,0.895238,0.723434,0.843338,0.932456
2,Borderline SMOTE,True,0.64787,0.764924,0.87005,0.723434,0.843338,0.932456
3,SMOTE Tomek,True,0.640852,0.772451,0.895238,0.723434,0.843338,0.932456
4,SMOTE EENN,False,0.671303,0.815494,0.957018,0.723434,0.843338,0.932456


In [None]:
adasyn = ADASYN(sampling_strategy='minority')
x, y = adasyn.fit_resample(x_train, y_train)

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x, y)
y_pred = model.predict(x_test)
roc_auc_score(y_test, y_pred)


np.float64(0.6765664160401003)