In [1]:
from imblearn.over_sampling import (
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
)
from imblearn.combine import (
    SMOTETomek,
    SMOTEENN
)

In [2]:
from joblib import Parallel, delayed
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

import wandb

with open('data.pickle', 'rb') as fh:
	data_mapper = pickle.load(fh)

In [3]:
splits = pd.read_csv('data_splits.csv')
splits.columns

Index(['abalone-17_vs_7-8-9-10', 'abalone-20_vs_8-9-10', 'abalone-21_vs_8',
       'abalone-3_vs_11', 'abalone19', 'abalone9-18', 'cleveland-0_vs_4',
       'ecoli-0-1-4-7_vs_2-3-5-6', 'ecoli-0-1-4-7_vs_5-6',
       'ecoli-0-2-6-7_vs_3-5', 'ecoli-0-4-6_vs_5', 'ecoli-0-6-7_vs_3-5',
       'ecoli4', 'glass-0-6_vs_5', 'winequality-red-4',
       'winequality-red-8_vs_6-7', 'winequality-white-9_vs_4', 'wisconsin',
       'yeast-0-2-5-7-9_vs_3-6-8', 'yeast-2_vs_4'],
      dtype='object')

In [4]:
# data_mapper.keys()
x_train = data_mapper['0_abalone-17_vs_7-8-9-10']["x_train"]
y_train = data_mapper['0_abalone-17_vs_7-8-9-10']["y_train"]

x_test = data_mapper['0_abalone-17_vs_7-8-9-10']["x_test"]
y_test = data_mapper['0_abalone-17_vs_7-8-9-10']["y_test"]

In [5]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
roc_auc_score(y_test, y_pred)

np.float64(0.5357142857142857)

In [6]:
competitor = {}
for split_name in splits:
	for idx in range(31):

		results = {}
		data_key = f"{idx}_{split_name}"

		x_train = data_mapper[data_key]["x_train"]
		y_train = data_mapper[data_key]["y_train"]

		x_validation = data_mapper[data_key]["x_validation"]
		y_validation = data_mapper[data_key]["y_validation"]

		x_comb = np.concatenate((x_train, x_validation), axis=0)
		y_comb = np.concatenate((y_train, y_validation), axis=0)
		
		x_test = data_mapper[data_key]["x_test"]
		y_test = data_mapper[data_key]["y_test"]

		try:
			smote = SMOTE(sampling_strategy='minority') 
			x, y = smote.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE'] = 0

		try:
			adasyn = ADASYN(sampling_strategy='minority')
			x, y = adasyn.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['ADASYN'] = roc_auc_score(y_test, y_pred)
		except:
			results['ADASYN'] = 0
		
		try:
			blsmote = BorderlineSMOTE(sampling_strategy='minority', kind='borderline-1')
			x, y = blsmote.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['Borderline SMOTE'] = roc_auc_score(y_test, y_pred)
		except:
			results['Borderline SMOTE'] = 0

		try:
			smt_tomek = SMOTETomek(sampling_strategy='auto')
			x, y = smt_tomek.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE Tomek'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE Tomek'] = 0
		
		try:
			smt_eenn = SMOTEENN(sampling_strategy='auto')
			x, y = smt_eenn.fit_resample(x_comb, y_comb)
			model = KNeighborsClassifier(n_neighbors=5)
			model.fit(x, y)
			y_pred = model.predict(x_test)
			results['SMOTE EENN'] = roc_auc_score(y_test, y_pred)
		except:
			results['SMOTE EENN'] = 0

		competitor[data_key] = results
competitor

{'0_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7078947368421052),
  'ADASYN': np.float64(0.7436090225563909),
  'Borderline SMOTE': np.float64(0.7567669172932331),
  'SMOTE Tomek': np.float64(0.7096491228070174),
  'SMOTE EENN': np.float64(0.7723057644110275)},
 '1_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7550125313283209),
  'ADASYN': np.float64(0.7550125313283209),
  'Borderline SMOTE': np.float64(0.7933583959899749),
  'SMOTE Tomek': np.float64(0.7898496240601504),
  'SMOTE EENN': np.float64(0.7837092731829574)},
 '2_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7105263157894737),
  'ADASYN': np.float64(0.713157894736842),
  'Borderline SMOTE': np.float64(0.6139097744360902),
  'SMOTE Tomek': np.float64(0.680952380952381),
  'SMOTE EENN': np.float64(0.7401002506265663)},
 '3_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.8159147869674186),
  'ADASYN': np.float64(0.8141604010025063),
  'Borderline SMOTE': np.float64(0.7506265664160401),
  'SMOTE Tomek': np.float64(0.85

In [9]:
from collections import defaultdict
values = defaultdict(list)
for data_key in competitor:
    if "abalone-17_vs_7-8-9-10" in data_key:
        # print(data_key)
        for t in competitor[data_key]:
            values[t].append(competitor[data_key][t])

for t in values:
    print(f"{t}: {np.mean(values[t])}")

SMOTE: 0.7763036623817607
ADASYN: 0.773255719945024
Borderline SMOTE: 0.7575875171800469
SMOTE Tomek: 0.7759358072600857
SMOTE EENN: 0.82125474977767


In [None]:
adasyn = ADASYN(sampling_strategy='minority')
x, y = adasyn.fit_resample(x_train, y_train)

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x, y)
y_pred = model.predict(x_test)
roc_auc_score(y_test, y_pred)


np.float64(0.6765664160401003)