In [47]:
from imblearn.over_sampling import (
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
)
from imblearn.combine import (
    SMOTETomek,
    SMOTEENN
)

In [12]:
from joblib import Parallel, delayed
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

import wandb

# Load the pre-computed folds. Later cells index `data_mapper` with
# "<run index>_<dataset name>" keys and read "x_train"/"y_train",
# "x_validation"/"y_validation" and "x_test"/"y_test" from each entry.
# NOTE: pickle.load can execute arbitrary code -- only point this at a
# file you produced yourself or otherwise trust.
with open('data.pickle', mode='rb') as pickle_file:
    data_mapper = pickle.load(pickle_file)

In [3]:
# Read the split table; its column labels are the dataset names that the
# benchmark loop combines with a run index to build `data_mapper` keys.
splits = pd.read_csv(filepath_or_buffer='data_splits.csv')
# Display the dataset names as the cell output.
splits.columns

Index(['abalone-17_vs_7-8-9-10', 'abalone-20_vs_8-9-10', 'abalone-21_vs_8',
       'abalone-3_vs_11', 'abalone19', 'abalone9-18', 'cleveland-0_vs_4',
       'ecoli-0-1-4-7_vs_2-3-5-6', 'ecoli-0-1-4-7_vs_5-6',
       'ecoli-0-2-6-7_vs_3-5', 'ecoli-0-4-6_vs_5', 'ecoli-0-6-7_vs_3-5',
       'ecoli4', 'glass-0-6_vs_5', 'winequality-red-4',
       'winequality-red-8_vs_6-7', 'winequality-white-9_vs_4', 'wisconsin',
       'yeast-0-2-5-7-9_vs_3-6-8', 'yeast-2_vs_4'],
      dtype='object')

In [13]:
# Pull run 0 of the first dataset out of the mapper for a quick sanity check.
baseline_fold = data_mapper['0_abalone-17_vs_7-8-9-10']

x_train, y_train = baseline_fold["x_train"], baseline_fold["y_train"]
x_test, y_test = baseline_fold["x_test"], baseline_fold["y_test"]

In [14]:
# Baseline without any resampling: 5-nearest-neighbours fitted on the raw
# training fold and scored on the test fold.
model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = model.predict(x_test)

# The score is computed from hard class predictions (the output of
# `predict`), not from probabilities or decision scores.
roc_auc_score(y_true=y_test, y_score=y_pred)

np.float64(0.5357142857142857)

In [53]:
# Benchmark five imbalanced-learn resamplers with a 5-NN classifier.
#
# For every dataset column in `splits` and every run index, the train and
# validation folds are concatenated, resampled, used to fit the classifier,
# and the classifier is scored on the test fold. Scores are collected in
# `competitor[f"{idx}_{split_name}"][method_name]`.

RANDOM_STATE = 42   # seeds the stochastic resamplers so a re-run reproduces the table
N_RUNS = 31         # number of run indices stored per dataset in `data_mapper`
FAILED_SCORE = 0    # value recorded when a resampler fails (kept from the original
                    # cell); NOTE: it is indistinguishable from a genuine score of 0,
                    # so check the printed messages before averaging results

# Result key -> zero-argument factory. Factories (rather than instances) give
# every fold a fresh resampler and keep construction inside the try block.
RESAMPLER_FACTORIES = {
    'SMOTE': lambda: SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE),
    'ADASYN': lambda: ADASYN(sampling_strategy='minority', random_state=RANDOM_STATE),
    'Borderline SMOTE': lambda: BorderlineSMOTE(
        sampling_strategy='minority', kind='borderline-1', random_state=RANDOM_STATE
    ),
    'SMOTE Tomek': lambda: SMOTETomek(sampling_strategy='auto', random_state=RANDOM_STATE),
    'SMOTE EENN': lambda: SMOTEENN(sampling_strategy='auto', random_state=RANDOM_STATE),
}


def score_resampler(make_resampler, x_fit, y_fit, x_eval, y_eval):
    """Resample the fitting data, train a 5-NN classifier and score it.

    Parameters
    ----------
    make_resampler : callable
        Zero-argument factory returning an object with ``fit_resample``.
    x_fit, y_fit
        Features and labels that are resampled and used to fit the classifier.
    x_eval, y_eval
        Features and labels the fitted classifier is scored on.

    Returns
    -------
    The value of ``roc_auc_score(y_eval, predictions)``.

    Raises
    ------
    Whatever the resampler, the classifier or the metric raises; the caller
    decides how to record a failure.
    """
    x_resampled, y_resampled = make_resampler().fit_resample(x_fit, y_fit)
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_resampled, y_resampled)
    # NOTE(review): the metric is fed hard class predictions, as in the original
    # cell; passing predict_proba scores instead would change every number.
    return roc_auc_score(y_eval, knn.predict(x_eval))


competitor = {}
for split_name in splits.columns:
    for idx in range(N_RUNS):
        data_key = f"{idx}_{split_name}"
        fold = data_mapper[data_key]

        # These stay module-level names on purpose: later cells read them.
        x_train, y_train = fold["x_train"], fold["y_train"]
        x_validation, y_validation = fold["x_validation"], fold["y_validation"]
        x_test, y_test = fold["x_test"], fold["y_test"]

        # The classifier is fitted on train + validation combined.
        x_comb = np.concatenate((x_train, x_validation), axis=0)
        y_comb = np.concatenate((y_train, y_validation), axis=0)

        results = {}
        for method_name, make_resampler in RESAMPLER_FACTORIES.items():
            try:
                results[method_name] = score_resampler(
                    make_resampler, x_comb, y_comb, x_test, y_test
                )
            except Exception as err:
                # Best effort: one failing resampler must not abort the whole
                # benchmark, but the failure is reported instead of hidden.
                # `Exception` (not a bare except) lets KeyboardInterrupt stop
                # the loop.
                print(
                    f"{data_key}: {method_name} failed "
                    f"({type(err).__name__}: {err}); recording {FAILED_SCORE}"
                )
                results[method_name] = FAILED_SCORE

        competitor[data_key] = results
competitor

{'0_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7810776942355889),
  'ADASYN': np.float64(0.7436090225563909),
  'Borderline SMOTE': np.float64(0.7550125313283209),
  'SMOTE Tomek': np.float64(0.7462406015037593),
  'SMOTE EENN': np.float64(0.7714285714285715)},
 '1_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7916040100250626),
  'ADASYN': np.float64(0.7880952380952381),
  'Borderline SMOTE': np.float64(0.7593984962406015),
  'SMOTE Tomek': np.float64(0.7916040100250626),
  'SMOTE EENN': np.float64(0.7793233082706766)},
 '2_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.7436090225563909),
  'ADASYN': np.float64(0.7471177944862155),
  'Borderline SMOTE': np.float64(0.6879699248120301),
  'SMOTE Tomek': np.float64(0.7479949874686717),
  'SMOTE EENN': np.float64(0.7392230576441103)},
 '3_abalone-17_vs_7-8-9-10': {'SMOTE': np.float64(0.8159147869674186),
  'ADASYN': np.float64(0.8498746867167919),
  'Borderline SMOTE': np.float64(0.7479949874686717),
  'SMOTE Tomek': np.float64(0.

In [None]:
# One-off check: oversample only the training fold with ADASYN, then fit and
# score a 5-NN classifier on whichever fold is currently bound to
# x_train / y_train / x_test / y_test.
adasyn = ADASYN(sampling_strategy='minority')
x, y = adasyn.fit_resample(x_train, y_train)

model = KNeighborsClassifier(n_neighbors=5).fit(x, y)
y_pred = model.predict(x_test)

# Scored on hard class predictions, like the other cells in this notebook.
roc_auc_score(y_true=y_test, y_score=y_pred)


np.float64(0.6765664160401003)