In [2]:
from imblearn.over_sampling import (
	SMOTE,
	ADASYN,
	BorderlineSMOTE,
)
from imblearn.combine import (
	SMOTETomek,
	SMOTEENN
)
from collections import defaultdict

from scipy.stats import ranksums


In [3]:
from joblib import Parallel, delayed
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

import wandb

with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)

In [5]:
splits = pd.read_csv('data_splits.csv')
splits.columns

Index(['abalone-17_vs_7-8-9-10', 'abalone-20_vs_8-9-10', 'abalone-21_vs_8',
       'abalone-3_vs_11', 'abalone19', 'abalone9-18', 'cleveland-0_vs_4',
       'ecoli-0-1-4-7_vs_2-3-5-6', 'ecoli-0-1-4-7_vs_5-6',
       'ecoli-0-2-6-7_vs_3-5', 'ecoli-0-4-6_vs_5', 'ecoli-0-6-7_vs_3-5',
       'ecoli4', 'glass-0-6_vs_5', 'winequality-red-4',
       'winequality-red-8_vs_6-7', 'winequality-white-9_vs_4', 'wisconsin',
       'yeast-0-2-5-7-9_vs_3-6-8', 'yeast-2_vs_4'],
      dtype='object')

In [6]:
competitor = {}
for split_name in splits:
	for idx in range(31):

		results = {}
		data_key = f"{idx}_{split_name}"

		x_train = data_mapper[data_key]["x_train"]
		y_train = data_mapper[data_key]["y_train"]

		x_validation = data_mapper[data_key]["x_validation"]
		y_validation = data_mapper[data_key]["y_validation"]

		x_comb = np.concatenate((x_train, x_validation), axis=0)
		y_comb = np.concatenate((y_train, y_validation), axis=0)
		
		x_test = data_mapper[data_key]["x_test"]
		y_test = data_mapper[data_key]["y_test"]
		# x_test = x_validation
		# y_test = y_validation

		try:
			smote = SMOTE(sampling_strategy='minority') 
			x, y = smote.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE'] = 0

		try:
			adasyn = ADASYN(sampling_strategy='minority')
			x, y = adasyn.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['ADASYN'] = roc_auc_score(y_test, y_pred)
		except:
			results['ADASYN'] = 0
		
		try:
			blsmote = BorderlineSMOTE(sampling_strategy='minority', kind='borderline-1')
			x, y = blsmote.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['Borderline SMOTE'] = roc_auc_score(y_test, y_pred)
		except:
			results['Borderline SMOTE'] = 0

		try:
			smt_tomek = SMOTETomek(sampling_strategy='auto')
			x, y = smt_tomek.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE Tomek'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE Tomek'] = 0
		
		try:
			smt_eenn = SMOTEENN(sampling_strategy='auto')
			x, y = smt_eenn.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE EENN'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE EENN'] = 0

		competitor[data_key] = results

In [7]:
records = []
from collections import defaultdict
by_comp = defaultdict(list)

results = pd.read_csv('long_run_2025-07-11.csv')
for dataset in np.unique(results['Dataset']):
	my_scheme = results['Optimized test AUC'][results['Dataset'] == dataset]
	values = defaultdict(list)
	for data_key in competitor:
		if dataset in data_key:	
			for t in competitor[data_key]:
				values[t].append(competitor[data_key][t])

	for t in values:
		p = ranksums(values[t], my_scheme).pvalue

		outcome = 'tie' if p > 0.05 else 'win'
		outcome = outcome if np.mean(values[t]) < np.mean(my_scheme) else "lose"
		record = {
			f"Dataset": dataset,
			f"Competitor": t,
			"Outcome": outcome,
			f"Pval < 0.05": f"{str(p < 0.05)} - {round(p,3)}",
			f"Comp min": np.min(values[t]),
			f"Comp avg": np.mean(values[t]),
			f"Comp max": np.max(values[t]),
			f"cVAE min": np.min(my_scheme),
			f"cVAE avg": np.mean(my_scheme),
			f"cVAE max": np.max(my_scheme),
		}
		by_comp[t].append(outcome)

		records.append(record)
		
	records.append(record)
save = pd.DataFrame.from_records(records)#.to_csv('comparison.csv', index=False)

In [16]:
comparison = []
for t in by_comp:
    for val in pd.DataFrame(by_comp[t]).value_counts().items():
        comparison.append({
            "Competitor": t,
            "Outcome": val[0][0],
            "Count": val[1]
		})
pd.DataFrame.from_records(comparison).to_excel('wlt.xlsx', index=False)

In [53]:
save['Outcome'].value_counts()

Outcome
lose    38
tie     29
win     11
Name: count, dtype: int64