In [30]:
import datetime

from joblib import Parallel, delayed

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from smote_variants.oversampling import SMOTE
from common_datasets.binary_classification import get_filtered_data_loaders

In [31]:
import logging

# Raise the threshold of the 'smote_variants' logger so that only records at
# ERROR level or above are emitted by it.
logger = logging.getLogger(name='smote_variants')
logger.setLevel(level=logging.ERROR)

In [32]:
# Hyper-parameter grid: maps each classifier class to the list of keyword
# dictionaries it will be instantiated with.
classifiers = {
    DecisionTreeClassifier: [
        {'max_depth': depth, 'random_state': 5} for depth in (4, 6, 8)
    ],
    RandomForestClassifier: [
        {'max_depth': depth, 'random_state': 5} for depth in (4, 6, 8)
    ],
    KNeighborsClassifier: [
        {'n_neighbors': k} for k in (1, 3, 5, 7, 9)
    ],
    # Three kernel settings (library default, poly degree 2, poly degree 3),
    # each combined with the same three C values; kernel varies slowest.
    SVC: [
        {'C': c, 'probability': True, **kernel_args, 'random_state': 5}
        for kernel_args in (
            {},
            {'kernel': 'poly', 'degree': 2},
            {'kernel': 'poly', 'degree': 3},
        )
        for c in (0.001, 0.01, 0.1)
    ],
}

In [33]:
# Collect the dataset loaders to benchmark on. Each entry of `datasets` is
# later called with no arguments to obtain a dataset dict.
# NOTE(review): the exact meaning of each bound is defined by common_datasets;
# confirm against its documentation before changing values.
datasets = get_filtered_data_loaders(
    n_col_bounds=(2, 40),
    n_bounds=(10, 500),
    n_minority_bounds=(10, 500),
    n_from_phenotypes=1,
    n_smallest=20,
)

In [34]:
# Full grid of SMOTE configurations: 2 sampling modes x 3 proportions x
# 3 neighbour counts = 18 entries. The sampling mode varies slowest and
# n_neighbors fastest. Every entry gets its own fresh 'ss_params' dict.
smote_params = [
    {
        'n_neighbors': n_neighbors,
        'proportion': proportion,
        'random_state': 5,
        'ss_params': {'within_simplex_sampling': sampling},
    }
    for sampling in ('random', 'deterministic')
    for proportion in (0.5, 1.0, 1.5)
    for n_neighbors in (3, 5, 7)
]

In [35]:
# Scratch cell: rebinds `results` to a new empty list once per loader in
# `datasets`; nothing is appended to it here. If `datasets` is empty, the
# body never runs and `results` is left untouched.
# NOTE(review): looks like leftover scaffolding - confirm it can be deleted.
for data_loader in datasets:
        results = []

In [36]:
def job_generator(data_loader):
    """
    Lazily yield every (job, description) pair to evaluate for one dataset.

    The dataset is split with 5-fold stratified cross-validation repeated
    40 times. For each fold the features are standardized with statistics
    fitted on the training part only, the training part is oversampled once
    per entry of the module-level ``smote_params``, and one job is produced
    for every classifier/parameter combination in the module-level
    ``classifiers``.

    Args:
        data_loader (callable): zero-argument callable returning a dict with
            the keys 'name', 'data' and 'target'.

    Yields:
        tuple(dict, dict): ``job`` holds the oversampled training data, the
            scaled test data, the classifier class and its parameters;
            ``description`` holds the dataset name, fold index, oversampler
            parameters, classifier class name and classifier parameters.
            Datasets named 'iris0' or 'dermatology-6' yield nothing.
    """
    dataset = data_loader()
    print(datetime.datetime.now(), dataset['name'])
    if dataset['name'] in ['iris0', 'dermatology-6']:
        # This function is a generator, not a loop body: `continue` here is a
        # SyntaxError. A bare `return` ends the generator without yielding
        # anything, which is the intended "skip this dataset" behaviour.
        return
    X = dataset['data']
    y = dataset['target']

    validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=40, random_state=5)

    for fidx, (train, test) in enumerate(validator.split(X, y, y)):
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]

        # Fit the scaler on the training part only, then apply it to both
        # parts, so no test statistics enter the preprocessing.
        ss = StandardScaler()
        ss.fit(X_train)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)

        for sparam in smote_params:
            # Oversample once per SMOTE configuration; the result is shared
            # by all classifier jobs generated below for this configuration.
            oversampling = SMOTE(**sparam)
            X_samp, y_samp = oversampling.sample(X_train, y_train)

            for classifier, cparams in classifiers.items():
                for param in cparams:
                    job = {
                        'X_samp': X_samp,
                        'y_samp': y_samp,
                        'X_test': X_test,
                        'y_test': y_test,
                        'classifier': classifier,
                        'param': param
                    }
                    description = {
                        'name': dataset['name'],
                        'fold': fidx,
                        'sparam': sparam,
                        'classifier': classifier.__name__,
                        'cparam': param
                    }
                    yield job, description

In [37]:
def do_job(job, description):
    """
    Train one classifier configuration and score it with ROC AUC.

    Args:
        job (dict): must provide 'classifier' (a class to instantiate),
            'param' (its keyword arguments), 'X_samp'/'y_samp' (data to fit
            on) and 'X_test'/'y_test' (data to score on).
        description (dict): metadata identifying the job.

    Returns:
        dict: a new dict with the entries of ``description`` plus the
            key 'auc'.
    """
    model = job['classifier'](**job['param'])
    model.fit(job['X_samp'], job['y_samp'])

    # Column 1 of predict_proba is used as the ranking score.
    # NOTE(review): presumably the positive/minority class - confirm labels.
    scores = model.predict_proba(job['X_test'])[:, 1]

    return description | {'auc': roc_auc_score(job['y_test'], scores)}

In [40]:
# Evaluate every dataset with 3 parallel workers and write one CSV per
# dataset, named after the dataset.
for data_loader in datasets:
    dataset = data_loader()
    pending_jobs = (
        delayed(do_job)(job, description)
        for job, description in job_generator(data_loader)
    )
    results = pd.DataFrame.from_dict(Parallel(n_jobs=3)(pending_jobs))
    results.to_csv(f"{dataset['name']}.csv")

2023-12-14 22:50:59.240523 bupa
2023-12-14 23:21:28.186055 cleveland-0_vs_4
2023-12-14 23:53:18.753317 CM1
2023-12-15 00:54:19.832280 dermatology-6
2023-12-15 01:33:56.591067 ecoli1
2023-12-15 02:09:46.543042 glass0
2023-12-15 02:39:14.222835 haberman
2023-12-15 03:12:29.604080 hepatitis
2023-12-15 03:41:29.622140 ionosphere
2023-12-15 04:13:36.586725 iris0
2023-12-15 04:29:00.888280 led7digit-0-2-4-6-7-8-9_vs_1
2023-12-15 05:02:23.042133 monk-2
2023-12-15 05:38:21.101381 new_thyroid1
2023-12-15 06:05:34.747387 page-blocks-1-3_vs_4
2023-12-15 06:53:06.746426 saheart
2023-12-15 07:30:53.274009 shuttle-6_vs_2-3
2023-12-15 07:58:28.826586 yeast-1_vs_7


In [39]:
# Write the current `results` to "<dataset name>.csv".
# NOTE(review): relies on `results` and `dataset` being left in the kernel by
# another cell, so it only works after that cell has run.
results.to_csv(f"{dataset['name']}.csv")

In [None]:
# Take only the first (job, description) pair generated for the first data
# loader and evaluate it; the returned dict is shown as the cell output.
do_job(*next(job_generator(datasets[0])))

2023-12-14 21:42:48.609799 appendicitis


{'name': 'appendicitis',
 'fold': 0,
 'sparam': {'n_neighbors': 3,
  'proportion': 0.5,
  'random_state': 5,
  'ss_params': {'within_simplex_sampling': 'random'}},
 'classifier': 'DecisionTreeClassifier',
 'cparam': {'max_depth': 2, 'random_state': 5},
 'auc': 0.5823529411764707}

In [None]:
# Build a DataFrame from whatever `results` currently holds in the kernel.
# NOTE(review): `tmp` is not used by any other visible cell - confirm whether
# this cell is still needed.
tmp = pd.DataFrame.from_dict(results)

Unnamed: 0,name,fold,sparam,classifier,cparam,auc
0,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.582353
1,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.723529
2,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.764706
3,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.782353
4,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",RandomForestClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.976471
...,...,...,...,...,...,...
441,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.647059
442,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.661765
443,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.661765
444,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",RandomForestClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.647059
