In [1]:
import time
import pprint

import torch
import pandas as pd
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from tqdm.notebook import tqdm

from utils import *
from models.CreditcardDataset import CreditcardDataset
from models.OversamplingDataset import OversamplingDataset
from models.LorasDataset import LorasDataset
from models.Autoencoder import Autoencoder
from models.Classifier import Classifier
from models.HiddenReprClassifier import HiddenReprClassifier

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE
from pyloras import LORAS


DATA_PATH = "./data/"
pp = pprint.PrettyPrinter(indent=4)

In [2]:
BATCH_SIZE = 32

LEARNING_RATE_AUTOENC = 1e-3
LEARNING_RATE_CLASSIFIER = 1e-4

AUTOENC_EPOCHS = 20
CLASSIFIER_EPOCHS = 25

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda


In [3]:
train_df = pd.read_parquet(DATA_PATH+"train.parquet")
valid_df = pd.read_parquet(DATA_PATH+"valid.parquet")
test_df = pd.read_parquet(DATA_PATH+"test.parquet")

In [4]:
oversamplers = {
    "SMOTE":SMOTE(random_state=42, k_neighbors=29, n_jobs=-1),
    "ADASYN": ADASYN(random_state=42, n_neighbors=29, n_jobs=-1),
    "SVMSMOTE": SVMSMOTE(random_state=42, k_neighbors=29, n_jobs=-1)
}

In [5]:
train_datasets = {key: OversamplingDataset(train_df, oversamplers[key])for key in oversamplers.keys()}
valid_dataset = CreditcardDataset(valid_df)
test_dataset = CreditcardDataset(test_df)

train_datasets['None'] = CreditcardDataset(train_df)#baseline(no oversampling)
train_datasets['LORAS'] = LorasDataset(train_df)

In [6]:
autoenc_models = {key:Autoencoder().cuda() for key in train_datasets.keys()}
autoenc_optims = {key:torch.optim.Adam(autoenc_models[key].parameters(),
                                       lr=LEARNING_RATE_AUTOENC) for key in autoenc_models.keys()}

In [7]:
def calculate_valid_loss_autoenc(model, valid_dataset, loss_fn):
    dataloader = torch.utils.data.DataLoader(valid_dataset, 1, shuffle=False)
    loss_list = []

    for x, _ in dataloader:
        x = x.to(DEVICE)
        logits = model(x)
        loss = loss_fn(logits, x)
        loss_val = loss.item()
        loss_list.append(loss_val)

    return np.mean(loss_list)


def train_autoenc_model(model, optimizer, train_dataset, valid_dataset):
    model.train()
    loss_fn = torch.nn.MSELoss()

    for epoch in range(AUTOENC_EPOCHS):
        print("\n Start of epoch {}/{}...".format(epoch + 1, AUTOENC_EPOCHS))
        epoch_start_time = time.time()

        loss_list = []
        dataloader = torch.utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

        for x_orig, _ in dataloader:
            x_orig_gpu = x_orig.cpu().to(DEVICE)
            x_noisy = get_noised_data(x_orig)
            x_noisy = x_noisy.to(DEVICE)

            logits = model(x_noisy)
            loss = loss_fn(logits, x_orig_gpu)
#             loss_val = loss.item()
#             loss_list.append(loss_val)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

#         train_loss = np.mean(loss_list)
#         valid_loss = calculate_valid_loss_autoenc(model, valid_dataset, loss_fn)

        epoch_time = time.time() - epoch_start_time
        print(
            "\n Epoch {} complete in {:.2f}".format(epoch + 1, epoch_time))

In [8]:
for key in autoenc_models.keys():
    print("Training DAE with oversamlping method:", key)
    train_autoenc_model(autoenc_models[key], autoenc_optims[key], train_datasets[key], valid_dataset)
    print("==========================")

Training DAE with oversamlping method: SMOTE

 Start of epoch 1/20...

 Epoch 1 complete in 37.04

 Start of epoch 2/20...

 Epoch 2 complete in 37.58

 Start of epoch 3/20...

 Epoch 3 complete in 37.46

 Start of epoch 4/20...

 Epoch 4 complete in 36.99

 Start of epoch 5/20...

 Epoch 5 complete in 36.74

 Start of epoch 6/20...

 Epoch 6 complete in 37.47

 Start of epoch 7/20...

 Epoch 7 complete in 38.03

 Start of epoch 8/20...

 Epoch 8 complete in 37.50

 Start of epoch 9/20...

 Epoch 9 complete in 37.97

 Start of epoch 10/20...

 Epoch 10 complete in 37.84

 Start of epoch 11/20...

 Epoch 11 complete in 37.59

 Start of epoch 12/20...

 Epoch 12 complete in 37.68

 Start of epoch 13/20...

 Epoch 13 complete in 38.74

 Start of epoch 14/20...

 Epoch 14 complete in 37.46

 Start of epoch 15/20...

 Epoch 15 complete in 37.43

 Start of epoch 16/20...

 Epoch 16 complete in 37.43

 Start of epoch 17/20...

 Epoch 17 complete in 38.77

 Start of epoch 18/20...

 Epoch 18 c

In [9]:
classifier_models = {key:Classifier().cuda() for key in autoenc_models.keys()}
classifier_optims = {key:torch.optim.Adam(classifier_models[key].parameters(),
                                       lr=LEARNING_RATE_AUTOENC) for key in classifier_models.keys()}

In [10]:
def pass_through_autoenc_classifier(x, y, autoenc, classifier, loss_fn, passthrough_fnc):
    autoenc_out = passthrough_fnc(x)
    logits = classifier(autoenc_out)
    loss = loss_fn(logits, y)
    loss_val = loss.item()
    
    return loss, loss_val


def calculate_valid_loss_classifier(autoenc, model, valid_dataset, loss_fn, passthrough_fnc):

    dataloader = torch.utils.data.DataLoader(valid_dataset, 1, shuffle=False)
    loss_list = []

    for x, y in dataloader:
        x = x.to(DEVICE)
        y = y.type(torch.LongTensor).to(DEVICE)

        _, loss_val = pass_through_autoenc_classifier(x ,y, autoenc, model, loss_fn, passthrough_fnc)
        loss_list.append(loss_val)

    return np.mean(loss_list)


def train_classifier_model(autoenc_model, model, optimizer, train_dataset, valid_dataset, passthrough_fnc):
    model.train()

    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(CLASSIFIER_EPOCHS):
        print("\n Start of epoch {}/{}".format(epoch + 1, CLASSIFIER_EPOCHS))
        epoch_start_time = time.time()

        loss_list = []
        dataloader = torch.utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

        for x, y in dataloader:
            x = x.to(DEVICE)
            y = y.type(torch.LongTensor).to(DEVICE)

            loss, loss_val = pass_through_autoenc_classifier(x, y, autoenc_model, model, loss_fn, passthrough_fnc)
            loss_list.append(loss_val)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss = np.mean(loss_list)
#         valid_loss = calculate_valid_loss_classifier(autoenc_model, model, valid_dataset, loss_fn, passthrough_fnc)

        epoch_time = time.time() - epoch_start_time
        print(
            "\n Epoch {} complete in {:.2f}, train loss = {}, valid_loss = {}".format(epoch + 1, epoch_time, train_loss,
                                                                                      '?'))

In [11]:
for key in classifier_models.keys():
    print("Training classifier with oversamlping method:", key)
    train_classifier_model(autoenc_models[key], classifier_models[key], classifier_optims[key],
                          train_datasets[key], valid_dataset, autoenc_models[key].forward)

Training classifier with oversamlping method: SMOTE

 Start of epoch 1/25

 Epoch 1 complete in 36.15, train loss = 0.06077597077312617, valid_loss = ?

 Start of epoch 2/25

 Epoch 2 complete in 36.10, train loss = 0.0398899016909229, valid_loss = ?

 Start of epoch 3/25

 Epoch 3 complete in 36.09, train loss = 0.03528883981652329, valid_loss = ?

 Start of epoch 4/25

 Epoch 4 complete in 36.13, train loss = 0.033270876584353876, valid_loss = ?

 Start of epoch 5/25

 Epoch 5 complete in 36.05, train loss = 0.03143739296007118, valid_loss = ?

 Start of epoch 6/25

 Epoch 6 complete in 35.97, train loss = 0.029507105608741722, valid_loss = ?

 Start of epoch 7/25

 Epoch 7 complete in 35.68, train loss = 0.027667125031576024, valid_loss = ?

 Start of epoch 8/25

 Epoch 8 complete in 36.18, train loss = 0.026521356384298166, valid_loss = ?

 Start of epoch 9/25

 Epoch 9 complete in 36.07, train loss = 0.025792352658712565, valid_loss = ?

 Start of epoch 10/25

 Epoch 10 complete i


 Epoch 5 complete in 18.18, train loss = 0.0031117762414468817, valid_loss = ?

 Start of epoch 6/25

 Epoch 6 complete in 18.23, train loss = 0.003018954306251035, valid_loss = ?

 Start of epoch 7/25

 Epoch 7 complete in 18.18, train loss = 0.0030071986195802963, valid_loss = ?

 Start of epoch 8/25

 Epoch 8 complete in 18.17, train loss = 0.003009539748166411, valid_loss = ?

 Start of epoch 9/25

 Epoch 9 complete in 18.22, train loss = 0.002919702983515384, valid_loss = ?

 Start of epoch 10/25

 Epoch 10 complete in 18.15, train loss = 0.002882251771460444, valid_loss = ?

 Start of epoch 11/25

 Epoch 11 complete in 18.07, train loss = 0.0028652210792783433, valid_loss = ?

 Start of epoch 12/25

 Epoch 12 complete in 17.40, train loss = 0.002835739053687866, valid_loss = ?

 Start of epoch 13/25

 Epoch 13 complete in 18.09, train loss = 0.002823328433252966, valid_loss = ?

 Start of epoch 14/25

 Epoch 14 complete in 18.02, train loss = 0.0028163610784406722, valid_loss = 

In [12]:
def evaluate_predictions(auto_enc, model, dataset, passthrough_fnc):
    dataloader = torch.utils.data.DataLoader(dataset, 1, shuffle=False)

    predictions = []
    ground_truth = []
    for x, y in tqdm(dataloader):
        x = x.to(DEVICE)
        ground_truth.append(y.item())

        autoenc_out = passthrough_fnc(x)
        logits = model(autoenc_out)
        predictions.append(np.argmax(logits.detach().cpu().numpy()))
        
    return {
        'precision': precision_score(ground_truth, predictions),
        'recall': recall_score(ground_truth, predictions),
        'F1': f1_score(ground_truth, predictions),
        'balanced accuracy': balanced_accuracy_score(ground_truth, predictions)
    }

In [13]:
for key in classifier_models.keys():
    print("Results for model with oversampling algorthm:", key)
    pp.pprint(evaluate_predictions(autoenc_models[key], classifier_models[key],
                               test_dataset, autoenc_models[key].forward))

Results for model with oversampling algorthm: SMOTE


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.2591463414634146,
    'balanced accuracy': 0.9295144232425665,
    'precision': 0.15232974910394265,
    'recall': 0.8673469387755102}
Results for model with oversampling algorthm: ADASYN


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.21366459627329193,
    'balanced accuracy': 0.9333151134680095,
    'precision': 0.12164073550212164,
    'recall': 0.8775510204081632}
Results for model with oversampling algorthm: SVMSMOTE


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.7897435897435898,
    'balanced accuracy': 0.8926812846691856,
    'precision': 0.7938144329896907,
    'recall': 0.7857142857142857}
Results for model with oversampling algorthm: None


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.8191489361702127,
    'balanced accuracy': 0.8927428350349706,
    'precision': 0.8555555555555555,
    'recall': 0.7857142857142857}
Results for model with oversampling algorthm: LORAS


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.7853403141361257,
    'balanced accuracy': 0.8824947888553283,
    'precision': 0.8064516129032258,
    'recall': 0.7653061224489796}


In [14]:
hidden_classifier_models = {key:HiddenReprClassifier().cuda() for key in autoenc_models.keys()}
hidden_classifier_optims = {key:torch.optim.Adam(hidden_classifier_models[key].parameters(),
                                       lr=LEARNING_RATE_AUTOENC) for key in hidden_classifier_models.keys()}

In [15]:
for key in hidden_classifier_models.keys():
    print("Training hidden classifiers with oversamlping method:", key)
    train_classifier_model(autoenc_models[key], hidden_classifier_models[key], hidden_classifier_optims[key],
                          train_datasets[key], valid_dataset, autoenc_models[key].get_enc)

Training hidden classifiers with oversamlping method: SMOTE

 Start of epoch 1/25

 Epoch 1 complete in 32.64, train loss = 0.10833573541197057, valid_loss = ?

 Start of epoch 2/25

 Epoch 2 complete in 31.95, train loss = 0.09863112777757475, valid_loss = ?

 Start of epoch 3/25

 Epoch 3 complete in 32.29, train loss = 0.09591889886426012, valid_loss = ?

 Start of epoch 4/25

 Epoch 4 complete in 33.60, train loss = 0.0937302698819541, valid_loss = ?

 Start of epoch 5/25

 Epoch 5 complete in 33.59, train loss = 0.09250914201190417, valid_loss = ?

 Start of epoch 6/25

 Epoch 6 complete in 33.59, train loss = 0.09169315536259699, valid_loss = ?

 Start of epoch 7/25

 Epoch 7 complete in 33.49, train loss = 0.09118113176147633, valid_loss = ?

 Start of epoch 8/25

 Epoch 8 complete in 33.37, train loss = 0.09098566979420215, valid_loss = ?

 Start of epoch 9/25

 Epoch 9 complete in 33.47, train loss = 0.09067507905036389, valid_loss = ?

 Start of epoch 10/25

 Epoch 10 complet


 Epoch 5 complete in 16.05, train loss = 0.0037916131368301996, valid_loss = ?

 Start of epoch 6/25

 Epoch 6 complete in 15.87, train loss = 0.0037241510673292704, valid_loss = ?

 Start of epoch 7/25

 Epoch 7 complete in 15.88, train loss = 0.0036977889092045225, valid_loss = ?

 Start of epoch 8/25

 Epoch 8 complete in 15.96, train loss = 0.0036313081735810434, valid_loss = ?

 Start of epoch 9/25

 Epoch 9 complete in 15.92, train loss = 0.00356350007679697, valid_loss = ?

 Start of epoch 10/25

 Epoch 10 complete in 15.93, train loss = 0.0035677957279397505, valid_loss = ?

 Start of epoch 11/25

 Epoch 11 complete in 15.77, train loss = 0.003578845823126373, valid_loss = ?

 Start of epoch 12/25

 Epoch 12 complete in 15.67, train loss = 0.003561236699552974, valid_loss = ?

 Start of epoch 13/25

 Epoch 13 complete in 15.94, train loss = 0.003477544680678083, valid_loss = ?

 Start of epoch 14/25

 Epoch 14 complete in 16.02, train loss = 0.0035375586282327923, valid_loss =

In [16]:
for key in hidden_classifier_models.keys():
    print("Results for hidden_repr model with oversampling algorthm:", key)
    pp.pprint(evaluate_predictions(autoenc_models[key], hidden_classifier_models[key],
                               test_dataset, autoenc_models[key].get_enc))

Results for hidden_repr model with oversampling algorthm: SMOTE


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.20306965761511214,
    'balanced accuracy': 0.9329458112732993,
    'precision': 0.11481975967957277,
    'recall': 0.8775510204081632}
Results for hidden_repr model with oversampling algorthm: ADASYN


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.23337856173677068,
    'balanced accuracy': 0.9339130313070642,
    'precision': 0.13458528951486698,
    'recall': 0.8775510204081632}
Results for hidden_repr model with oversampling algorthm: SVMSMOTE


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.7096774193548386,
    'balanced accuracy': 0.8924878406624326,
    'precision': 0.6470588235294118,
    'recall': 0.7857142857142857}
Results for hidden_repr model with oversampling algorthm: None


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.782608695652174,
    'balanced accuracy': 0.8672238380439401,
    'precision': 0.8372093023255814,
    'recall': 0.7346938775510204}
Results for hidden_repr model with oversampling algorthm: LORAS


  0%|          | 0/56962 [00:00<?, ?it/s]

{   'F1': 0.29354207436399216,
    'balanced accuracy': 0.8796810578480125,
    'precision': 0.18159806295399517,
    'recall': 0.7653061224489796}


In [17]:
train_X, train_y = get_x_y(train_df)

valid_X, valid_y = get_x_y(valid_df)
test_X, test_y = get_x_y(test_df)

X = pd.concat([train_X, valid_X])
y = pd.concat([train_y, valid_y])

train_datasets = {key: oversamplers[key].fit_resample(X,y) for key in oversamplers.keys()}
train_datasets['LORAS'] = loras_oversample_dataframe(pd.concat([X, y] ,axis=1))

In [20]:
def eval_for_thresholds(y, predictions):
    return {thr: eval_for_threshhold(y, predictions, thr) for thr in np.arange(0.2, 0.9, 0.1)}


def eval_for_threshhold(ground_truth, predictions, thr):
    class_predicitons = (predictions[:,1] >= thr).astype(bool)
    return  {
        'precision': precision_score(ground_truth, class_predicitons),
        'recall': recall_score(ground_truth, class_predicitons),
        'F1': f1_score(ground_truth, class_predicitons),
        'balanced accuracy': balanced_accuracy_score(ground_truth, class_predicitons)
    } 


def random_forest_train_eval(train_X, train_y, testX, test_y, autoencoder = None):
    rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42, n_jobs=-1)
    if autoencoder is not None:
        if type(train_X) is not np.ndarray:
            train_X = autoencoder(torch.tensor(train_X.to_numpy()).cuda()).cpu().detach().numpy()
        else:
            train_X = autoencoder(torch.tensor(train_X).cuda()).cpu().detach().numpy()
    rf.fit(train_X, train_y)
    predictions = rf.predict_proba(test_X)
    return eval_for_thresholds(test_y.values, predictions)


def linear_regression_train_eval(train_X, train_y, testX, test_y, autoencoder = None):
    lr = LogisticRegression(random_state=42, C=.005, solver='lbfgs', multi_class='multinomial',
                            max_iter=685, n_jobs=-1)
    if autoencoder is not None:
        if type(train_X) is not np.ndarray:
            train_X = autoencoder(torch.tensor(train_X.to_numpy()).cuda()).cpu().detach().numpy()
        else:
            train_X = autoencoder(torch.tensor(train_X).cuda()).cpu().detach().numpy()
    lr.fit(train_X, train_y)
    predictions = lr.predict_proba(test_X)
    return eval_for_thresholds(test_y.values, predictions)

In [21]:
for key in train_datasets.keys():
    print("Using oversampling algorithm:", key)
    curr_X, curr_y = train_datasets[key]
    
    lr_results = linear_regression_train_eval(curr_X, curr_y, test_X, test_y)
    print("Logistic regression(No DAE):")
    pp.pprint(lr_results)
    
    lr_dae_results = linear_regression_train_eval(curr_X, curr_y, test_X, test_y, autoenc_models[key])
    print("Logistic regression (With DAE):")
    pp.pprint(lr_dae_results)
    
    rf_results = random_forest_train_eval(curr_X, curr_y, test_X, test_y)
    print("Random forest (No DAE):")
    pp.pprint(rf_results)
    
    rf_dae_results = random_forest_train_eval(curr_X, curr_y, test_X, test_y, autoenc_models[key])
    print("Random forest(DAE):")
    pp.pprint(rf_dae_results)
    
    print("==========================")

Using oversampling algorithm: SMOTE
Logistic regression(No DAE):
{   0.2: {   'F1': 0.06761177753544166,
             'balanced accuracy': 0.9519799478598417,
             'precision': 0.03505465510742556,
             'recall': 0.9489795918367347},
    0.30000000000000004: {   'F1': 0.11526282457251426,
                             'balanced accuracy': 0.9520635702226867,
                             'precision': 0.06144496961512492,
                             'recall': 0.9285714285714286},
    0.4000000000000001: {   'F1': 0.17164898746383797,
                            'balanced accuracy': 0.9466076596648789,
                            'precision': 0.09478168264110756,
                            'recall': 0.9081632653061225},
    0.5000000000000001: {   'F1': 0.22361809045226128,
                            'balanced accuracy': 0.9487267508297635,
                            'precision': 0.12750716332378223,
                            'recall': 0.9081632653061225},
    0.60000

Logistic regression (With DAE):
{   0.2: {   'F1': 0.04613890237979602,
             'balanced accuracy': 0.9501817081644137,
             'precision': 0.0236318407960199,
             'recall': 0.9693877551020408},
    0.30000000000000004: {   'F1': 0.07416563658838073,
                             'balanced accuracy': 0.9394963493275756,
                             'precision': 0.038643194504079006,
                             'recall': 0.9183673469387755},
    0.4000000000000001: {   'F1': 0.11335012594458438,
                            'balanced accuracy': 0.9468736003123817,
                            'precision': 0.06040268456375839,
                            'recall': 0.9183673469387755},
    0.5000000000000001: {   'F1': 0.15808170515097691,
                            'balanced accuracy': 0.9458250907284692,
                            'precision': 0.08657587548638132,
                            'recall': 0.9081632653061225},
    0.6000000000000001: {   'F1': 0.21090909

Random forest (No DAE):
{   0.2: {   'F1': 0.1754756871035941,
             'balanced accuracy': 0.9167428120657379,
             'precision': 0.09787735849056604,
             'recall': 0.8469387755102041},
    0.30000000000000004: {   'F1': 0.38388625592417064,
                             'balanced accuracy': 0.9111286291387686,
                             'precision': 0.25,
                             'recall': 0.826530612244898},
    0.4000000000000001: {   'F1': 0.6584362139917695,
                            'balanced accuracy': 0.9075917261952615,
                            'precision': 0.5517241379310345,
                            'recall': 0.8163265306122449},
    0.5000000000000001: {   'F1': 0.7669902912621359,
                            'balanced accuracy': 0.9028062301172579,
                            'precision': 0.7314814814814815,
                            'recall': 0.8061224489795918},
    0.6000000000000001: {   'F1': 0.8,
                            'balan

Random forest(DAE):
{   0.2: {   'F1': 0.012010509195546103,
             'balanced accuracy': 0.8509382931563171,
             'precision': 0.006042296072507553,
             'recall': 0.9795918367346939},
    0.30000000000000004: {   'F1': 0.06376811594202898,
                             'balanced accuracy': 0.926346643046639,
                             'precision': 0.03305785123966942,
                             'recall': 0.8979591836734694},
    0.4000000000000001: {   'F1': 0.18640350877192982,
                            'balanced accuracy': 0.927263438436714,
                            'precision': 0.10442260442260443,
                            'recall': 0.8673469387755102},
    0.5000000000000001: {   'F1': 0.314176245210728,
                            'balanced accuracy': 0.9153601719247069,
                            'precision': 0.19339622641509435,
                            'recall': 0.8367346938775511},
    0.6000000000000001: {   'F1': 0.5190311418685121,
    