In [37]:
import numpy as np
import pandas as pd
import pathlib
import sys
sys.path.append(str(pathlib.Path.cwd().parents[1]))
from handle_data import HandleData
from evaluate_oversampler_cross_val import cross_validation

# oversamplers
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, SVMSMOTE, ADASYN, BorderlineSMOTE
from sv_synthsonic import synthsonic
import smote_variants as sv

# classifiers
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
# The `Datasets` directory sits two levels above the current working directory.
# Both splits are read with their first CSV column used as the row index.
datafolder = pathlib.Path.cwd().parents[1].joinpath('Datasets')
train = pd.read_csv(datafolder / 'train.csv', index_col=0)
test = pd.read_csv(datafolder / 'test.csv', index_col=0)

In [58]:
# Display the dtype of every column in `train`.
# NOTE(review): the recorded output already lists Name/Sex/Ticket/Cabin as
# int64 although the encoding cell comes after this one, and the execution
# counts are not sequential — this looks like state left over from an earlier
# run; confirm the notebook reproduces under Restart Kernel -> Run All.
train.dtypes

Survived      int64
Pclass        int64
Name          int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Ticket        int64
Fare        float64
Cabin         int64
Embarked     object
dtype: object

In [59]:
# Cabin and Embarked are cast to strings before being passed to the project's
# label encoder (presumably so missing values become an ordinary label —
# TODO confirm against HandleData.label_encode), and to float afterwards.
train = train.astype({'Cabin': str, 'Embarked': str})
train = HandleData().label_encode(
    train,
    ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
)
train = train.astype({'Cabin': float, 'Embarked': float})

In [77]:
# Label used to tag this dataset when it is passed to the evaluation run.
title = 'Titanic'

# The first column of `train` is the target; all remaining columns are features.
y = train.iloc[:, 0].values
X = train.iloc[:, 1:].values

In [62]:
# Seed shared by every sampler that accepts a `random_state`.
random_state = 5

# The list is assembled in three segments; the resulting order is
# RandomOverSampler, SMOTE, SMOTENC, SVMSMOTE, ADASYN, BorderlineSMOTE,
# synthsonic, polynom_fit_SMOTE, Random_SMOTE, NoSMOTE.
# NOTE(review): SMOTENC is created with an empty `categorical_features` list,
# while the recorded results show categorical_features=[0, 2, 3, 4, 8] —
# confirm where that value is set.
oversamplers = [
    RandomOverSampler(random_state=random_state),
    SMOTE(random_state=random_state, n_jobs=-1),
    SMOTENC(categorical_features=[], random_state=random_state),
]
# These three imbalanced-learn samplers share an identical configuration.
oversamplers += [
    sampler_cls(random_state=random_state, n_jobs=-1)
    for sampler_cls in (SVMSMOTE, ADASYN, BorderlineSMOTE)
]
# synthsonic plus the smote_variants samplers; NoSMOTE takes no arguments here.
oversamplers += [
    synthsonic(distinct_threshold=20),
    sv.polynom_fit_SMOTE(random_state=random_state),
    sv.Random_SMOTE(random_state=random_state),
    sv.NoSMOTE(),
]

# Grid of proportion values handed to the cross-validation run.
proportions = np.array([0.2, 0.4, 0.6, 0.8, 1.0])

# Path of the results CSV, relative to the current working directory.
respath = pathlib.Path.cwd().joinpath(
    'CSV_results', 'Cross_validation', 'cross_validation_titanic.csv'
)

In [78]:
# Build the project-local cross-validation helper and hand it the feature
# matrix, target vector, dataset title, oversampler list and proportion grid.
# NOTE(review): presumably evaluates every oversampler at every proportion —
# confirm in evaluate_oversampler_cross_val. `respath` is not passed here, so
# verify where (or whether) the results CSV is written.
cv = cross_validation()
cv.loop_over_oversamplers(X,y,title, oversamplers, proportions)

oversampler:  60%|██████    | 6/10 [00:08<00:06,  1.68s/it]2021-07-30 12:15:07,081:INFO:synthsonic: Running sampling via ('synthsonic', "{'proportion': 0.8, 'distinct_threshold': 20, 'do_PCA': True, 'ordering': 'pca'}")
2021-07-30 12:15:07,081:INFO:synthsonic: Running sampling via ('synthsonic', "{'proportion': 0.8, 'distinct_threshold': 20, 'do_PCA': True, 'ordering': 'pca'}")
n_quantiles (500) is greater than the total number of samples (257). n_quantiles is set to num samples.

  0%|          | 0/28.0 [00:00<?, ?it/s][A
Building tree: 100%|██████████| 28/28.0 [00:00<00:00, 1622.53it/s]

  0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 1:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 0:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 0:  22%|██▏       | 2/9 [00:00<00:00,  9.69it/s][A
  "Found unknown state name. Trying to switch to using all state names as state numbers"

Generating for node: 5:  33%|███▎      | 3/9 [00:00<00:00,  6.80it/s

Generating for node: 6:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 8:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 8:  44%|████▍     | 4/9 [00:00<00:00, 30.77it/s][A
Generating for node: 7:  44%|████▍     | 4/9 [00:00<00:00, 30.77it/s][A
Generating for node: 2:  44%|████▍     | 4/9 [00:00<00:00, 30.77it/s][A
Generating for node: 4:  44%|████▍     | 4/9 [00:00<00:00, 30.77it/s][A
Generating for node: 3:  44%|████▍     | 4/9 [00:00<00:00, 30.77it/s][A
Generating for node: 3:  89%|████████▉ | 8/9 [00:00<00:00, 15.11it/s][A
Generating for node: 5: 100%|██████████| 9/9 [00:00<00:00, 15.42it/s][A
2021-07-30 12:15:20,262:INFO:synthsonic: Running sampling via ('synthsonic', "{'proportion': 0.8, 'distinct_threshold': 20, 'do_PCA': True, 'ordering': 'pca'}")
2021-07-30 12:15:20,262:INFO:synthsonic: Running sampling via ('synthsonic', "{'proportion': 0.8, 'distinct_threshold': 20, 'do_PCA': True, 'ordering': 'pca'}")
n_quantiles (500) is greater than the

Generating for node: 7:  78%|███████▊  | 7/9 [00:01<00:00,  3.54it/s][A
Generating for node: 5:  78%|███████▊  | 7/9 [00:01<00:00,  3.54it/s][A
Generating for node: 5:  89%|████████▉ | 8/9 [00:02<00:00,  3.57it/s][A
Generating for node: 2:  89%|████████▉ | 8/9 [00:02<00:00,  3.57it/s][A
Generating for node: 2: 100%|██████████| 9/9 [00:02<00:00,  3.94it/s][A

  0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 1:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 0:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 6:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 3:   0%|          | 0/9 [00:00<?, ?it/s][A
Generating for node: 3:  44%|████▍     | 4/9 [00:00<00:00, 39.80it/s][A
Generating for node: 4:  44%|████▍     | 4/9 [00:00<00:00, 39.80it/s][A
Generating for node: 8:  44%|████▍     | 4/9 [00:00<00:00, 39.80it/s][A
Generating for node: 7:  44%|████▍     | 4/9 [00:00<00:00, 39.80it/s][A
Generating for node: 5:  44%|████▍     | 

2021-07-30 12:15:42,864:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': 5}")
2021-07-30 12:15:42,864:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': 5}")
2021-07-30 12:15:42,978:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': 5}")
2021-07-30 12:15:42,978:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': 5}")
2021-07-30 12:15:43,095:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': 5}")
2021-07-30 12:15:43,095:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': 5}")
2021-07-30 12:15:43,211:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit

Unnamed: 0,dataset,oversampler,proportion,balanced_accuracy,balanced_accuracy_std,G_mean,G_mean_std,f1,f1_std,precision,precision_std,recall,recall_std,pr_auc,pr_auc_std,runtime,runtime_std
0,Titanic,"RandomOverSampler(random_state=5, sampling_str...",0.8,0.769459,0.014219,0.765249,0.013615,0.714238,0.017994,0.740948,0.03263,0.690048,0.013141,0.808083,0.018514,0.000803,1.8e-05
0,Titanic,"RandomOverSampler(random_state=5, sampling_str...",1.0,0.769823,0.015307,0.766715,0.014771,0.715514,0.019357,0.730519,0.032773,0.70171,0.013945,0.801127,0.026958,0.000789,2.6e-05
0,Titanic,"SMOTE(n_jobs=-1, random_state=5, sampling_stra...",0.8,0.764018,0.02059,0.761144,0.02071,0.708846,0.026237,0.715223,0.041033,0.704685,0.035508,0.789124,0.022605,0.00324,0.002082
0,Titanic,"SMOTE(n_jobs=-1, random_state=5, sampling_stra...",1.0,0.76871,0.015535,0.765105,0.014787,0.713844,0.019483,0.733772,0.036211,0.695862,0.015086,0.785233,0.020463,0.002027,4.7e-05
0,Titanic,"SMOTENC(categorical_features=[0, 2, 3, 4, 8], ...",0.8,0.763831,0.01575,0.760779,0.013828,0.708663,0.018883,0.720702,0.046339,0.698837,0.009287,0.795342,0.014956,0.045698,0.006304
0,Titanic,"SMOTENC(categorical_features=[0, 2, 3, 4, 8], ...",1.0,0.766975,0.019028,0.764513,0.017808,0.71316,0.022516,0.717971,0.047652,0.710534,0.017007,0.782599,0.01936,0.045819,0.000393
0,Titanic,"SVMSMOTE(n_jobs=-1, random_state=5, sampling_s...",0.8,0.773359,0.017938,0.77043,0.016514,0.720552,0.021616,0.733113,0.048552,0.710568,0.016475,0.800666,0.019064,0.226645,0.007007
0,Titanic,"SVMSMOTE(n_jobs=-1, random_state=5, sampling_s...",1.0,0.784563,0.024835,0.781796,0.025143,0.733928,0.031558,0.747341,0.040696,0.722093,0.035387,0.800008,0.012864,0.227878,0.002395
0,Titanic,"ADASYN(n_jobs=-1, random_state=5, sampling_str...",0.8,0.774408,0.020807,0.770786,0.019147,0.7214,0.025559,0.743995,0.052205,0.701744,0.006103,0.802566,0.024207,0.21222,0.0014
0,Titanic,"ADASYN(n_jobs=-1, random_state=5, sampling_str...",1.0,0.768944,0.017013,0.765659,0.015652,0.714737,0.020861,0.730191,0.045659,0.701778,0.017136,0.792359,0.012592,0.212584,0.002438
