In [8]:
import datetime

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from smote_variants.oversampling import SMOTE
from common_datasets.binary_classification import get_filtered_data_loaders
import common_datasets.binary_classification as binclas

In [2]:
import logging
# Restrict the 'smote_variants' logger to records at ERROR level or above.
logger = logging.getLogger(name='smote_variants')
logger.setLevel(level=logging.ERROR)

In [13]:
# Display the dimensions of the 'data' entry returned by the CM1 loader.
np.shape(binclas.load_cm1()['data'])

(498, 21)

In [3]:
# Maps each classifier class to the list of keyword-argument dicts it is
# instantiated with in the experiment loop (one model per dict).
classifiers = {
    DecisionTreeClassifier: [
        {'max_depth': depth, 'random_state': 5} for depth in range(2, 10, 2)
    ],
    RandomForestClassifier: [
        {'max_depth': depth, 'random_state': 5} for depth in range(2, 10, 2)
    ],
    KNeighborsClassifier: [
        {'n_neighbors': k} for k in range(1, 10, 2)
    ],
    # SVC: the default kernel first, then polynomial kernels of degree 2 and 3,
    # each combined with the same three values of C. probability=True is set
    # because the experiment loop calls predict_proba on every model.
    SVC: [
        {'C': c, 'probability': True, **kernel_kwargs, 'random_state': 5}
        for kernel_kwargs in (
            {},
            {'kernel': 'poly', 'degree': 2},
            {'kernel': 'poly', 'degree': 3},
        )
        for c in [0.001, 0.01, 0.1]
    ],
}

In [4]:
# Dataset loaders the experiment loop iterates over (each one is called with no
# arguments to obtain a dataset). The filter keywords are interpreted by
# common_datasets; judging by their names they bound the number of columns,
# records and minority samples -- TODO confirm against the library docs.
datasets = get_filtered_data_loaders(
    n_col_bounds=(2, 40),
    n_bounds=(10, 500),
    n_minority_bounds=(10, 500),
    n_from_phenotypes=1,
    n_smallest=20,
)

In [5]:
# Full grid of SMOTE keyword-argument dicts: 2 within-simplex sampling modes
# x 3 proportions x 3 neighbourhood sizes = 18 configurations, all with
# random_state=5. The loop nesting reproduces the ordering of the grid:
# sampling mode varies slowest, n_neighbors fastest.
smote_params = [
    {
        'n_neighbors': n_neighbors,
        'proportion': proportion,
        'random_state': 5,
        'ss_params': {'within_simplex_sampling': sampling_mode},
    }
    for sampling_mode in ('random', 'deterministic')
    for proportion in (0.5, 1.0, 1.5)
    for n_neighbors in (3, 5, 7)
]

In [6]:
# Benchmark every (SMOTE configuration, classifier configuration) pair on each
# dataset with repeated stratified cross-validation and write one CSV of
# per-fold AUC scores per dataset, named '<dataset name>.csv'.
for data_loader in datasets:
    # Records are collected per dataset: `results` is reset here, so it only
    # ever holds the rows of the dataset currently being processed.
    results = []
    dataset = data_loader()
    # One timestamped progress line per dataset.
    print(datetime.datetime.now(), dataset['name'])
    X = dataset['data']
    y = dataset['target']

    # 5 folds x 20 repeats = 100 train/test splits, seeded with random_state=5.
    validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=5)

    # Only X and y are passed: the stratified splitter does not use `groups`,
    # so supplying the labels a second time had no effect on the folds.
    for fidx, (train, test) in enumerate(validator.split(X, y)):
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]

        # The scaler is fitted on the training part only and then applied to
        # both parts, so no test-fold statistics enter the preprocessing.
        ss = StandardScaler()
        ss.fit(X_train)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)

        for sparam in smote_params:
            # Oversample the scaled training fold only; the test fold is
            # evaluated as-is.
            oversampling = SMOTE(**sparam)
            X_samp, y_samp = oversampling.sample(X_train, y_train)

            for classifier, cparams in classifiers.items():
                for param in cparams:
                    classifier_obj = classifier(**param)
                    classifier_obj.fit(X_samp, y_samp)
                    y_pred = classifier_obj.predict_proba(X_test)
                    # Column 1 of predict_proba is used as the score; this
                    # presumably corresponds to the positive (minority) class
                    # -- TODO confirm the label encoding of the datasets.
                    auc = roc_auc_score(y_test, y_pred[:, 1])
                    results.append({'name': dataset['name'],
                                    'fold': fidx,
                                    'sparam': sparam,
                                    'classifier': classifier.__name__,
                                    'cparam': param,
                                    'auc': auc})
    # `results` is a list of flat records, so the plain constructor is the
    # direct way to build the frame.
    data = pd.DataFrame(results)
    data.to_csv(f'{dataset["name"]}.csv')

2023-12-14 21:06:31.104964 appendicitis
2023-12-14 21:23:36.600311 bupa
2023-12-14 21:47:51.281804 cleveland-0_vs_4
2023-12-14 22:07:33.462380 CM1


KeyboardInterrupt: 

In [None]:
# Turn the records currently held in `results` into a DataFrame.
tmp = pd.DataFrame(results)

Unnamed: 0,name,fold,sparam,classifier,cparam,auc
0,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.582353
1,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.723529
2,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.764706
3,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.782353
4,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",RandomForestClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.976471
...,...,...,...,...,...,...
441,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.647059
442,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.661765
443,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.661765
444,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",RandomForestClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.647059
