In [1]:
import time
import pprint

import torch
import pandas as pd
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from tqdm.notebook import tqdm

from utils import *
from models.CreditcardDataset import CreditcardDataset
from models.OversamplingDataset import OversamplingDataset
from models.LorasDataset import LorasDataset
from models.Autoencoder import Autoencoder
from models.Classifier import Classifier
from models.HiddenReprClassifier import HiddenReprClassifier

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE


# Directory prefix that is string-concatenated with the parquet file names when
# the train/valid/test splits are loaded (hence the trailing slash).
DATA_PATH = "./data/"
# Shared pretty-printer used further down to print the nested metric dictionaries.
pp = pprint.PrettyPrinter(indent=4)

In [2]:
# --- Reproducibility ---------------------------------------------------------
# The oversamplers in this notebook are pinned to random_state=42; seed torch
# and numpy with the same value so that weight initialisation and DataLoader
# shuffling are repeatable after "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Data loading ------------------------------------------------------------
# Mini-batch size used by the training DataLoaders.
BATCH_SIZE = 64

# --- Optimiser settings ------------------------------------------------------
LEARNING_RATE_AUTOENC = 8e-3
LEARNING_RATE_CLASSIFIER = 3e-4
LEARNING_RATE_SMALL_CLASS = 1e-4
# Second argument handed to get_noised_data() while training the autoencoder.
NOISE_STR = 0

# --- Training length ---------------------------------------------------------
AUTOENC_EPOCHS = 100
CLASSIFIER_EPOCHS = 25

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cpu


In [3]:
# Load the three pre-split parquet files (train / valid / test) in one pass.
train_df, valid_df, test_df = (
    pd.read_parquet(DATA_PATH + file_name)
    for file_name in ("train.parquet", "valid.parquet", "test.parquet")
)

In [4]:
# Keyword arguments shared by every sampler: fixed seed, use all CPU cores.
# NOTE(review): recent imbalanced-learn releases are believed to have dropped
# the `n_jobs` argument on these samplers -- confirm against the pinned version.
_sampler_common = dict(random_state=42, n_jobs=-1)

# Oversampling strategies compared in this notebook, keyed by display name.
# All three use a neighbourhood size of 29.
oversamplers = {
    "SMOTE": SMOTE(k_neighbors=29, **_sampler_common),
    "ADASYN": ADASYN(n_neighbors=29, **_sampler_common),
    "SVMSMOTE": SVMSMOTE(k_neighbors=29, **_sampler_common),
}

In [5]:
# One training dataset per imblearn sampler, in the samplers' declaration order.
train_datasets = {
    name: OversamplingDataset(train_df, sampler)
    for name, sampler in oversamplers.items()
}
valid_dataset = CreditcardDataset(valid_df)
test_dataset = CreditcardDataset(test_df)

# Two more training variants: the untouched data as a baseline (no oversampling)
# and the LoRAS-based dataset.
train_datasets.update({
    'None': CreditcardDataset(train_df),
    'LORAS': LorasDataset(train_df),
})

In [6]:
# One freshly initialised autoencoder per training-set variant ...
autoenc_models = {name: Autoencoder().to(DEVICE) for name in train_datasets}
# ... and a dedicated Adam optimiser over each autoencoder's parameters.
autoenc_optims = {
    name: torch.optim.Adam(dae.parameters(), lr=LEARNING_RATE_AUTOENC)
    for name, dae in autoenc_models.items()
}

In [7]:
def calculate_valid_loss_autoenc(model, valid_dataset, loss_fn):
    """Return the mean reconstruction loss of ``model`` over ``valid_dataset``.

    Every sample is fed through ``model`` and compared against itself with
    ``loss_fn``; the labels yielded by the dataset are ignored.

    The model is switched to evaluation mode and autograd is disabled for the
    duration of the pass, so no computation graph is built and the validation
    number is not computed in training mode. The mode the model was in on
    entry is restored before returning, which keeps a surrounding training
    loop unaffected.

    Args:
        model: autoencoder module; called as ``model(x)``.
        valid_dataset: dataset yielding ``(x, label)`` pairs.
        loss_fn: callable invoked as ``loss_fn(reconstruction, x)``.

    Returns:
        ``np.mean`` of the per-sample loss values.
    """
    # batch_size=1 is kept on purpose: the result stays the plain mean of
    # per-sample losses whatever reduction ``loss_fn`` applies.
    dataloader = torch.utils.data.DataLoader(valid_dataset, 1, shuffle=False)
    loss_list = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, _ in dataloader:
                x = x.to(DEVICE)
                loss_list.append(loss_fn(model(x), x).item())
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return np.mean(loss_list)


def train_autoenc_model(model, optimizer, train_dataset, valid_dataset):
    """Train ``model`` as a denoising autoencoder for ``AUTOENC_EPOCHS`` epochs.

    For every mini-batch the input is passed through ``get_noised_data`` (with
    strength ``NOISE_STR``), the model reconstructs it, and the MSE against the
    original input is minimised with ``optimizer``. After each epoch the mean
    training loss and the validation loss are printed.

    Args:
        model: autoencoder module to optimise (updated in place).
        optimizer: optimiser stepping the parameters to be trained.
        train_dataset: dataset yielding ``(x, label)`` pairs; labels are ignored.
        valid_dataset: dataset handed to ``calculate_valid_loss_autoenc``.

    Returns:
        None. Progress is reported on stdout.
    """
    loss_fn = torch.nn.MSELoss()
    # Built once: a shuffling DataLoader draws a new permutation every time it
    # is iterated, so there is no need to recreate it per epoch.
    dataloader = torch.utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

    for epoch in range(AUTOENC_EPOCHS):
        print("\n Start of epoch {}/{}...".format(epoch + 1, AUTOENC_EPOCHS))
        epoch_start_time = time.time()

        # Re-assert training mode every epoch so the loop stays correct even if
        # the validation pass leaves the model in evaluation mode.
        model.train()
        loss_list = []

        for x_orig, _ in dataloader:
            # The batch comes off the DataLoader and only needs one transfer to
            # the target device. The target is moved before the noise is
            # generated, matching the original statement order.
            x_target = x_orig.to(DEVICE)
            x_noisy = get_noised_data(x_orig, NOISE_STR).to(DEVICE)

            reconstruction = model(x_noisy)
            loss = loss_fn(reconstruction, x_target)
            loss_list.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss = np.mean(loss_list)
        valid_loss = calculate_valid_loss_autoenc(model, valid_dataset, loss_fn)

        epoch_time = time.time() - epoch_start_time
        print(
            "\n Epoch {} complete in {:.2f},train loss = {}, valid_loss = {}".format(epoch + 1, epoch_time, train_loss, valid_loss))

In [8]:
# Train one denoising autoencoder per training-set variant, each with its own
# optimiser and its own training data; all share the same validation set.
for key, dae in autoenc_models.items():
    print("Training DAE with oversamlping method:", key)
    train_autoenc_model(dae, autoenc_optims[key], train_datasets[key], valid_dataset)
    print("==========================")

Training DAE with oversamlping method: SMOTE

 Start of epoch 1/100...

 Epoch 1 complete in 16.43,train loss = 0.4832826071038507, valid_loss = 0.47591918513441994

 Start of epoch 2/100...

 Epoch 2 complete in 15.35,train loss = 0.41513265639160224, valid_loss = 0.47378907341384885

 Start of epoch 3/100...

 Epoch 3 complete in 14.89,train loss = 0.41232843147980053, valid_loss = 0.4751977593257083

 Start of epoch 4/100...

 Epoch 4 complete in 16.68,train loss = 0.40764708298099767, valid_loss = 0.48235902383163326

 Start of epoch 5/100...

 Epoch 5 complete in 17.05,train loss = 0.41394639936578237, valid_loss = 0.47936839404198767

 Start of epoch 6/100...

 Epoch 6 complete in 16.72,train loss = 0.4027502883956363, valid_loss = 0.4865581867182101

 Start of epoch 7/100...

 Epoch 7 complete in 17.05,train loss = 0.40493346490564835, valid_loss = 0.4777521627011194

 Start of epoch 8/100...

 Epoch 8 complete in 16.84,train loss = 0.4052172507503792, valid_loss = 0.47404407061


 Epoch 68 complete in 14.90,train loss = 0.4074758564735915, valid_loss = 0.4749772398777107

 Start of epoch 69/100...

 Epoch 69 complete in 15.75,train loss = 0.4165946960353391, valid_loss = 0.4776311983181048

 Start of epoch 70/100...

 Epoch 70 complete in 16.89,train loss = 0.4149154374765238, valid_loss = 0.477453417033046

 Start of epoch 71/100...

 Epoch 71 complete in 16.83,train loss = 0.40795825873636354, valid_loss = 0.4769804421121022

 Start of epoch 72/100...

 Epoch 72 complete in 15.72,train loss = 0.405055199702453, valid_loss = 0.47544716879511467

 Start of epoch 73/100...

 Epoch 73 complete in 15.87,train loss = 0.4040875570135868, valid_loss = 0.47756244966373834

 Start of epoch 74/100...

 Epoch 74 complete in 15.80,train loss = 0.4068843572373559, valid_loss = 0.48104974649390586

 Start of epoch 75/100...

 Epoch 75 complete in 16.97,train loss = 0.4125361914515879, valid_loss = 0.4764673741347506

 Start of epoch 76/100...

 Epoch 76 complete in 17.21,t


 Epoch 35 complete in 16.06,train loss = 0.40091938316295, valid_loss = 0.4591655082633638

 Start of epoch 36/100...

 Epoch 36 complete in 16.84,train loss = 0.3882328489825273, valid_loss = 0.4631176741197242

 Start of epoch 37/100...

 Epoch 37 complete in 15.65,train loss = 0.3895199132789753, valid_loss = 0.45963458353548586

 Start of epoch 38/100...

 Epoch 38 complete in 15.40,train loss = 0.4128658711838377, valid_loss = 0.4759394155082703

 Start of epoch 39/100...

 Epoch 39 complete in 17.25,train loss = 0.40319019941032125, valid_loss = 0.46012789747296107

 Start of epoch 40/100...

 Epoch 40 complete in 16.88,train loss = 0.39282174983879375, valid_loss = 0.4587612131054028

 Start of epoch 41/100...

 Epoch 41 complete in 15.91,train loss = 0.3988798173126492, valid_loss = 0.4587781241580926

 Start of epoch 42/100...

 Epoch 42 complete in 16.08,train loss = 0.4008713311752322, valid_loss = 0.46117998075971905

 Start of epoch 43/100...

 Epoch 43 complete in 15.61,


 Epoch 2 complete in 16.36,train loss = 0.5395634672936902, valid_loss = 0.4968668383757643

 Start of epoch 3/100...

 Epoch 3 complete in 16.58,train loss = 0.5264913189353667, valid_loss = 0.49092319142063584

 Start of epoch 4/100...

 Epoch 4 complete in 15.86,train loss = 0.5179731653194719, valid_loss = 0.4994909171632265

 Start of epoch 5/100...

 Epoch 5 complete in 16.61,train loss = 0.5324180098448152, valid_loss = 0.48993982194426905

 Start of epoch 6/100...

 Epoch 6 complete in 14.97,train loss = 0.5157547670497389, valid_loss = 0.49336451014848426

 Start of epoch 7/100...

 Epoch 7 complete in 15.10,train loss = 0.534021519138882, valid_loss = 0.499401868932324

 Start of epoch 8/100...

 Epoch 8 complete in 15.66,train loss = 0.5193833015787257, valid_loss = 0.49527328765464296

 Start of epoch 9/100...

 Epoch 9 complete in 15.90,train loss = 0.515102963595145, valid_loss = 0.5021331495988683

 Start of epoch 10/100...

 Epoch 10 complete in 15.54,train loss = 0.51


 Epoch 70 complete in 14.65,train loss = 0.5170997071256591, valid_loss = 0.5236857028826583

 Start of epoch 71/100...

 Epoch 71 complete in 15.93,train loss = 0.5184984945743031, valid_loss = 0.49687615821916215

 Start of epoch 72/100...

 Epoch 72 complete in 15.54,train loss = 0.5123229282461945, valid_loss = 0.49355617093913723

 Start of epoch 73/100...

 Epoch 73 complete in 14.79,train loss = 0.5156408599622764, valid_loss = 0.49167210886171914

 Start of epoch 74/100...

 Epoch 74 complete in 16.54,train loss = 0.5172848498945835, valid_loss = 0.5158376123590448

 Start of epoch 75/100...

 Epoch 75 complete in 16.99,train loss = 0.5167005654364537, valid_loss = 0.49282954635302556

 Start of epoch 76/100...

 Epoch 76 complete in 14.34,train loss = 0.5185490826462242, valid_loss = 0.49052196611456017

 Start of epoch 77/100...

 Epoch 77 complete in 15.11,train loss = 0.5210476066330239, valid_loss = 0.4936622915616372

 Start of epoch 78/100...

 Epoch 78 complete in 16.6


 Epoch 38 complete in 24.10,train loss = 0.4267884340036818, valid_loss = 0.3905076611276833

 Start of epoch 39/100...

 Epoch 39 complete in 24.66,train loss = 0.42657515328659634, valid_loss = 0.42788471590214444

 Start of epoch 40/100...

 Epoch 40 complete in 23.65,train loss = 0.4178510653775601, valid_loss = 0.46364965032405375

 Start of epoch 41/100...

 Epoch 41 complete in 24.53,train loss = 0.46607254869250797, valid_loss = 0.4003367051096343

 Start of epoch 42/100...

 Epoch 42 complete in 23.33,train loss = 0.4246810155709074, valid_loss = 0.44051865602556917

 Start of epoch 43/100...

 Epoch 43 complete in 24.00,train loss = 0.44400643768202663, valid_loss = 0.4657621846477512

 Start of epoch 44/100...

 Epoch 44 complete in 22.71,train loss = 0.4227604405124778, valid_loss = 0.44235671687533623

 Start of epoch 45/100...

 Epoch 45 complete in 24.11,train loss = 0.4581540595448369, valid_loss = 0.42883416088631177

 Start of epoch 46/100...

 Epoch 46 complete in 2

KeyboardInterrupt: 

In [None]:
# One classifier per trained autoencoder (same keys as `autoenc_models`).
classifier_models = {name: Classifier().to(DEVICE) for name in autoenc_models}
# The classifiers get their own learning rate: the original cell reused
# LEARNING_RATE_AUTOENC here, which left LEARNING_RATE_CLASSIFIER unused.
classifier_optims = {
    name: torch.optim.Adam(clf.parameters(), lr=LEARNING_RATE_CLASSIFIER)
    for name, clf in classifier_models.items()
}

In [None]:
def pass_through_autoenc_classifier(x, y, autoenc, classifier, loss_fn, passthrough_fnc):
    """Compute the classification loss for one batch.

    ``x`` is first transformed by ``passthrough_fnc``; the result is fed to
    ``classifier`` and the logits are scored against ``y`` with ``loss_fn``.

    Args:
        x: input batch.
        y: target labels for the batch.
        autoenc: accepted for signature compatibility; not used in the body
            (the transformation comes from ``passthrough_fnc``).
        classifier: callable mapping transformed inputs to logits.
        loss_fn: callable invoked as ``loss_fn(logits, y)``.
        passthrough_fnc: callable applied to ``x`` before classification.

    Returns:
        Tuple ``(loss, loss_val)``: the loss tensor and its Python scalar
        value obtained via ``.item()``.
    """
    features = passthrough_fnc(x)
    loss = loss_fn(classifier(features), y)
    return loss, loss.item()


def calculate_valid_loss_classifier(autoenc, model, valid_dataset, loss_fn, passthrough_fnc):
    """Return the mean classification loss over ``valid_dataset``.

    Each sample is scored with ``pass_through_autoenc_classifier``. The pass
    runs with autograd disabled and with ``autoenc`` and ``model`` in
    evaluation mode; both are returned to the mode they were in on entry, so a
    surrounding training loop is not disturbed.

    Args:
        autoenc: autoencoder forwarded to ``pass_through_autoenc_classifier``.
        model: classifier being validated.
        valid_dataset: dataset yielding ``(x, y)`` pairs.
        loss_fn: callable invoked as ``loss_fn(logits, y)``.
        passthrough_fnc: callable applied to ``x`` before classification.

    Returns:
        ``np.mean`` of the per-sample loss values.
    """
    # batch_size=1 is kept so the result is the plain mean of per-sample losses.
    dataloader = torch.utils.data.DataLoader(valid_dataset, 1, shuffle=False)
    loss_list = []

    # Remember the current mode of every torch module involved, then switch to
    # evaluation mode for the duration of the pass.
    modules = [m for m in (autoenc, model) if isinstance(m, torch.nn.Module)]
    previous_modes = [m.training for m in modules]
    for module in modules:
        module.eval()

    try:
        with torch.no_grad():
            for x, y in dataloader:
                x = x.to(DEVICE)
                # The loss is applied to integer class targets.
                y = y.to(device=DEVICE, dtype=torch.long)

                _, loss_val = pass_through_autoenc_classifier(x, y, autoenc, model, loss_fn, passthrough_fnc)
                loss_list.append(loss_val)
    finally:
        for module, mode in zip(modules, previous_modes):
            module.train(mode)

    return np.mean(loss_list)


def train_classifier_model(autoenc_model, model, optimizer, train_dataset, valid_dataset, passthrough_fnc,
                           compute_valid_loss=False):
    """Train ``model`` on features produced by ``passthrough_fnc``.

    Runs ``CLASSIFIER_EPOCHS`` epochs of cross-entropy training. Each batch is
    scored through ``pass_through_autoenc_classifier`` and ``optimizer`` is
    stepped on the resulting loss, so only the parameters registered with
    ``optimizer`` are updated.

    Args:
        autoenc_model: autoencoder forwarded to the loss helpers.
        model: classifier to optimise (updated in place).
        optimizer: optimiser stepping the classifier parameters.
        train_dataset: dataset yielding ``(x, y)`` pairs.
        valid_dataset: dataset used for the optional validation pass.
        passthrough_fnc: callable applied to ``x`` before classification.
        compute_valid_loss: when True, run ``calculate_valid_loss_classifier``
            after every epoch and report its value. Defaults to False, in
            which case ``'?'`` is printed in place of the validation loss,
            exactly as before. The pass iterates the validation set one
            sample at a time.

    Returns:
        None. Progress is reported on stdout.

    NOTE(review): ``autoenc_model`` is not put into evaluation mode or frozen
    here; confirm whether the features are meant to come from it in training
    mode.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    # Built once: a shuffling DataLoader reshuffles on every iteration.
    dataloader = torch.utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

    for epoch in range(CLASSIFIER_EPOCHS):
        print("\n Start of epoch {}/{}".format(epoch + 1, CLASSIFIER_EPOCHS))
        epoch_start_time = time.time()

        # Re-assert training mode each epoch in case validation changed it.
        model.train()
        loss_list = []

        for x, y in dataloader:
            x = x.to(DEVICE)
            # CrossEntropyLoss is applied to integer class targets.
            y = y.to(device=DEVICE, dtype=torch.long)

            loss, loss_val = pass_through_autoenc_classifier(x, y, autoenc_model, model, loss_fn, passthrough_fnc)
            loss_list.append(loss_val)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss = np.mean(loss_list)
        if compute_valid_loss:
            valid_loss = calculate_valid_loss_classifier(autoenc_model, model, valid_dataset, loss_fn,
                                                         passthrough_fnc)
        else:
            valid_loss = '?'

        epoch_time = time.time() - epoch_start_time
        print(
            "\n Epoch {} complete in {:.2f}, train loss = {}, valid_loss = {}".format(epoch + 1, epoch_time, train_loss,
                                                                                      valid_loss))

In [None]:
# Train each classifier on the full output of its matching autoencoder
# (the autoencoder's `forward` is used as the feature transformation).
for key, clf in classifier_models.items():
    print("Training classifier with oversamlping method:", key)
    dae = autoenc_models[key]
    train_classifier_model(dae, clf, classifier_optims[key],
                           train_datasets[key], valid_dataset, dae.forward)

In [None]:
def evaluate_predictions(auto_enc, model, dataset, passthrough_fnc):
    """Score ``model`` on ``dataset`` and return classification metrics.

    Inputs are transformed with ``passthrough_fnc``, classified by ``model``
    and the arg-max class per sample is compared with the ground truth.

    Inference runs with autograd disabled and with ``auto_enc`` and ``model``
    in evaluation mode; both are returned to the mode they were in on entry.
    Samples are processed in mini-batches of ``BATCH_SIZE`` instead of one at
    a time; the predicted class is still taken per sample.

    Args:
        auto_enc: autoencoder belonging to ``passthrough_fnc``; only used to
            switch it to evaluation mode.
        model: classifier producing one row of logits per sample.
        dataset: dataset yielding ``(x, y)`` pairs.
        passthrough_fnc: callable applied to ``x`` before classification.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'``, ``'F1'`` and
        ``'balanced accuracy'``.
    """
    dataloader = torch.utils.data.DataLoader(dataset, BATCH_SIZE, shuffle=False)

    predictions = []
    ground_truth = []

    modules = [m for m in (auto_enc, model) if isinstance(m, torch.nn.Module)]
    previous_modes = [m.training for m in modules]
    for module in modules:
        module.eval()

    try:
        with torch.no_grad():
            for x, y in tqdm(dataloader):
                x = x.to(DEVICE)
                ground_truth.extend(y.reshape(-1).tolist())

                logits = model(passthrough_fnc(x))
                # One predicted class index per row of logits.
                predictions.extend(logits.argmax(dim=1).cpu().tolist())
    finally:
        for module, mode in zip(modules, previous_modes):
            module.train(mode)

    return {
        'precision': precision_score(ground_truth, predictions),
        'recall': recall_score(ground_truth, predictions),
        'F1': f1_score(ground_truth, predictions),
        'balanced accuracy': balanced_accuracy_score(ground_truth, predictions)
    }

In [None]:
# Test-set metrics for every classifier trained on full autoencoder output.
for key, clf in classifier_models.items():
    print("Results for model with oversampling algorthm:", key)
    dae = autoenc_models[key]
    metrics = evaluate_predictions(dae, clf, test_dataset, dae.forward)
    pp.pprint(metrics)

In [None]:
# One classifier per autoencoder, operating on the encoder's hidden representation.
hidden_classifier_models = {name: HiddenReprClassifier().to(DEVICE) for name in autoenc_models}
# The original cell reused LEARNING_RATE_AUTOENC for these classifiers.
# NOTE(review): LEARNING_RATE_SMALL_CLASS is the only learning-rate constant
# that was otherwise unused and is presumed to be the one meant for this
# classifier -- confirm with the author.
hidden_classifier_optims = {
    name: torch.optim.Adam(clf.parameters(), lr=LEARNING_RATE_SMALL_CLASS)
    for name, clf in hidden_classifier_models.items()
}

In [None]:
# Train each hidden-representation classifier on the encoder output
# (`get_enc`) of its matching autoencoder.
for key, hidden_clf in hidden_classifier_models.items():
    print("Training hidden classifiers with oversamlping method:", key)
    dae = autoenc_models[key]
    train_classifier_model(dae, hidden_clf, hidden_classifier_optims[key],
                           train_datasets[key], valid_dataset, dae.get_enc)

In [None]:
# Test-set metrics for every classifier trained on the encoder output.
for key, hidden_clf in hidden_classifier_models.items():
    print("Results for hidden_repr model with oversampling algorthm:", key)
    dae = autoenc_models[key]
    metrics = evaluate_predictions(dae, hidden_clf, test_dataset, dae.get_enc)
    pp.pprint(metrics)

In [None]:
# Split every frame into features and labels for the scikit-learn models.
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = (
    get_x_y(frame) for frame in (train_df, valid_df, test_df)
)

# The classical models are fitted on train + validation data combined.
X = pd.concat([train_X, valid_X])
y = pd.concat([train_y, valid_y])

# NOTE(review): this rebinds `train_datasets`, which held torch datasets in the
# cells above, to a dict of resampled (X, y) pairs. Re-running the earlier
# training cells after this one will therefore not work; a distinct name would
# be safer but requires updating the loop that consumes it as well.
train_datasets = {
    name: sampler.fit_resample(X, y)
    for name, sampler in oversamplers.items()
}
train_datasets['LORAS'] = loras_oversample_dataframe(pd.concat([X, y], axis=1))

In [None]:
def eval_for_thresholds(y, predictions):
    """Evaluate the predictions at decision thresholds 0.2, 0.3, ..., 0.8.

    Args:
        y: ground-truth labels.
        predictions: array of class probabilities, forwarded unchanged to
            ``eval_for_threshhold``.

    Returns:
        Dict mapping each threshold (a plain Python float rounded to one
        decimal) to the metric dict returned by ``eval_for_threshhold``.
    """
    # A float-step np.arange accumulates rounding error, which produced keys
    # such as 0.30000000000000004 and makes the number of steps depend on
    # floating-point rounding. linspace with an explicit count plus rounding
    # gives the same seven thresholds with clean keys.
    thresholds = [round(float(thr), 1) for thr in np.linspace(0.2, 0.8, 7)]
    return {thr: eval_for_threshhold(y, predictions, thr) for thr in thresholds}


def eval_for_threshhold(ground_truth, predictions, thr):
    """Compute classification metrics at a single decision threshold.

    A sample is predicted positive when the value in column 1 of
    ``predictions`` is greater than or equal to ``thr``.

    Args:
        ground_truth: true labels.
        predictions: 2-D array indexed as ``predictions[:, 1]``.
        thr: decision threshold applied to column 1.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'``, ``'F1'`` and
        ``'balanced accuracy'``.
    """
    positive_mask = (predictions[:, 1] >= thr).astype(bool)
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('F1', f1_score),
        ('balanced accuracy', balanced_accuracy_score),
    )
    return {label: scorer(ground_truth, positive_mask) for label, scorer in scorers}


def _apply_autoencoder(features, autoencoder):
    """Run ``features`` through ``autoencoder`` and return a NumPy array.

    ``features`` may be a NumPy array or any object exposing ``to_numpy()``.
    The input tensor is created with the dtype and on the device of the
    autoencoder's parameters, so the forward pass also works when the model
    lives on the GPU. The pass runs without autograd and in evaluation mode;
    the autoencoder's previous mode is restored afterwards.
    """
    if not isinstance(features, np.ndarray):
        features = features.to_numpy()

    reference = next(autoencoder.parameters())
    inputs = torch.as_tensor(features, dtype=reference.dtype, device=reference.device)

    was_training = autoencoder.training
    autoencoder.eval()
    try:
        with torch.no_grad():
            return autoencoder(inputs).cpu().numpy()
    finally:
        autoencoder.train(was_training)


def random_forest_train_eval(train_X, train_y, testX, test_y, autoencoder = None):
    """Fit a random forest and evaluate it over a range of thresholds.

    Args:
        train_X: training features (NumPy array or object with ``to_numpy()``).
        train_y: training labels.
        testX: test features. The original body ignored this parameter and
            read the notebook-global ``test_X`` instead; it is now used.
        test_y: test labels.
        autoencoder: optional model; when given, the training features are
            replaced by the autoencoder's output before fitting.

    Returns:
        The dict produced by ``eval_for_thresholds`` for the test predictions.

    NOTE(review): only the training features are passed through the
    autoencoder; the test features are used as-is. Confirm this is intended.
    """
    rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42, n_jobs=-1)
    if autoencoder is not None:
        train_X = _apply_autoencoder(train_X, autoencoder)
    rf.fit(train_X, train_y)
    predictions = rf.predict_proba(testX)
    return eval_for_thresholds(np.asarray(test_y), predictions)


def linear_regression_train_eval(train_X, train_y, testX, test_y, autoencoder = None):
    """Fit a logistic regression and evaluate it over a range of thresholds.

    Despite the function name, the model fitted here is
    ``LogisticRegression``.

    Args:
        train_X: training features (NumPy array or object with ``to_numpy()``).
        train_y: training labels.
        testX: test features. The original body ignored this parameter and
            read the notebook-global ``test_X`` instead; it is now used.
        test_y: test labels.
        autoencoder: optional model; when given, the training features are
            replaced by the autoencoder's output before fitting.

    Returns:
        The dict produced by ``eval_for_thresholds`` for the test predictions.

    NOTE(review): only the training features are passed through the
    autoencoder; the test features are used as-is. Confirm this is intended.
    """
    lr = LogisticRegression(random_state=42, C=.005, solver='lbfgs', multi_class='multinomial',
                            max_iter=685, n_jobs=-1)
    if autoencoder is not None:
        train_X = _apply_autoencoder(train_X, autoencoder)
    lr.fit(train_X, train_y)
    predictions = lr.predict_proba(testX)
    return eval_for_thresholds(np.asarray(test_y), predictions)

In [None]:
# For every resampled training set, fit logistic regression and a random
# forest twice each: once on the resampled features directly and once on the
# features passed through the autoencoder trained for the same sampler.
for key, (curr_X, curr_y) in train_datasets.items():
    print("Using oversampling algorithm:", key)

    # --- Logistic regression ---
    lr_results = linear_regression_train_eval(curr_X, curr_y, test_X, test_y)
    print("Logistic regression(No DAE):")
    pp.pprint(lr_results)

    lr_dae_results = linear_regression_train_eval(
        curr_X, curr_y, test_X, test_y, autoenc_models[key]
    )
    print("Logistic regression (With DAE):")
    pp.pprint(lr_dae_results)

    # --- Random forest ---
    rf_results = random_forest_train_eval(curr_X, curr_y, test_X, test_y)
    print("Random forest (No DAE):")
    pp.pprint(rf_results)

    rf_dae_results = random_forest_train_eval(
        curr_X, curr_y, test_X, test_y, autoenc_models[key]
    )
    print("Random forest(DAE):")
    pp.pprint(rf_dae_results)

    print("==========================")