In [1]:
import time

import torch
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm

from utils import *
from models.CreditcardDataset import CreditcardDataset
from models.OversamplingDataset import OversamplingDataset
from models.Autoencoder import Autoencoder
from models.Classifier import Classifier
from models.HiddenReprClassifier import HiddenReprClassifier

DATA_PATH = "./data/"

In [2]:
BATCH_SIZE = 32

LEARNING_RATE_AUTOENC = 1e-3
LEARNING_RATE_CLASSIFIER = 1e-4

AUTOENC_EPOCHS = 20
CLASSIFIER_EPOCHS = 25

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda


In [3]:
from imblearn.over_sampling import SMOTE
from pyloras import LORAS

oversampler = LORAS(random_state = 42, n_affine=30, n_neighbors=30)
#oversampler = SMOTE(random_state=42)

train_dataset = OversamplingDataset(DATA_PATH + 'train.parquet', oversampler)
valid_dataset = CreditcardDataset(DATA_PATH + 'valid.parquet')
test_dataset = CreditcardDataset(DATA_PATH + 'test.parquet')

(186904, 29)
(372934, 29)


In [4]:
autoenc_model = Autoencoder()
autoenc_model = autoenc_model.cuda()
autoenc_optim = torch.optim.Adam(autoenc_model.parameters(), lr=LEARNING_RATE_AUTOENC)

In [5]:
def calculate_valid_loss_autoenc(model, valid_dataset, loss_fn):
    dataloader = torch.utils.data.DataLoader(valid_dataset, 1, shuffle=False)
    loss_list = []

    for x, _ in tqdm(dataloader):
        x = x.to(DEVICE)
        logits = model(x)
        loss = loss_fn(logits, x)
        loss_val = loss.item()
        loss_list.append(loss_val)

    return np.mean(loss_list)


def train_autoenc_model(model, optimizer, train_dataset, valid_dataset):
    model.train()
    loss_fn = torch.nn.MSELoss()

    for epoch in range(AUTOENC_EPOCHS):
        print("\n Start of epoch {}/{}".format(epoch + 1, AUTOENC_EPOCHS))
        epoch_start_time = time.time()

        loss_list = []
        dataloader = torch.utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

        for x_orig, _ in tqdm(dataloader):
            x_orig_gpu = x_orig.cpu().to(DEVICE)
            x_noisy = get_noised_data(x_orig)
            x_noisy = x_noisy.to(DEVICE)

            logits = model(x_noisy)
            loss = loss_fn(logits, x_orig_gpu)
            loss_val = loss.item()
            loss_list.append(loss_val)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss = np.mean(loss_list)
        valid_loss = calculate_valid_loss_autoenc(model, valid_dataset, loss_fn)

        epoch_time = time.time() - epoch_start_time
        print(
            "\n Epoch {} complete in {:.2f}, train loss = {}, valid_loss = {}".format(epoch + 1, epoch_time, train_loss,
                                                                                      valid_loss))


train_autoenc_model(autoenc_model, autoenc_optim, train_dataset, valid_dataset)


 Start of epoch 1/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 1 complete in 59.65, train loss = 0.460721644626212, valid_loss = 0.36446052093224435

 Start of epoch 2/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 2 complete in 59.45, train loss = 0.24908675704581296, valid_loss = 0.2870736166289034

 Start of epoch 3/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 3 complete in 59.37, train loss = 0.22083869733072975, valid_loss = 0.25512237402253907

 Start of epoch 4/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 4 complete in 61.16, train loss = 0.20519953538155872, valid_loss = 0.2373710259047229

 Start of epoch 5/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 5 complete in 60.82, train loss = 0.19668938767137778, valid_loss = 0.23013897279143436

 Start of epoch 6/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 6 complete in 60.43, train loss = 0.19061977436285785, valid_loss = 0.21850059578734943

 Start of epoch 7/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 7 complete in 60.54, train loss = 0.18434952444999017, valid_loss = 0.2130836047758695

 Start of epoch 8/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 8 complete in 60.52, train loss = 0.18130614525550223, valid_loss = 0.20646083963115697

 Start of epoch 9/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 9 complete in 61.52, train loss = 0.17945010621613672, valid_loss = 0.20456266163414333

 Start of epoch 10/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 10 complete in 63.27, train loss = 0.17814190989461695, valid_loss = 0.20291010587433675

 Start of epoch 11/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 11 complete in 61.86, train loss = 0.17710939210047286, valid_loss = 0.20226577985001426

 Start of epoch 12/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 12 complete in 62.46, train loss = 0.17595124143622864, valid_loss = 0.19783900957024408

 Start of epoch 13/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 13 complete in 60.66, train loss = 0.17504331461545072, valid_loss = 0.194942542114147

 Start of epoch 14/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 14 complete in 60.27, train loss = 0.1735656495739441, valid_loss = 0.19600888854208345

 Start of epoch 15/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 15 complete in 59.68, train loss = 0.1727530584608675, valid_loss = 0.19236819068287533

 Start of epoch 16/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 16 complete in 60.58, train loss = 0.17191014246897654, valid_loss = 0.19162269384478806

 Start of epoch 17/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 17 complete in 60.27, train loss = 0.1709666102261617, valid_loss = 0.1904129502312404

 Start of epoch 18/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 18 complete in 60.50, train loss = 0.16952868899348875, valid_loss = 0.1930600793367479

 Start of epoch 19/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 19 complete in 60.21, train loss = 0.16872324020642163, valid_loss = 0.18733094913358245

 Start of epoch 20/20


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 20 complete in 60.57, train loss = 0.16792287958992852, valid_loss = 0.18585041650319062


In [6]:
classifier_model = Classifier()
classifier_model = classifier_model.cuda()
classifier_optim = torch.optim.Adam(classifier_model.parameters(), lr=LEARNING_RATE_CLASSIFIER)

In [7]:
def pass_through_autoenc_classifier(x, y, autoenc, classifier, loss_fn, passthrough_fnc):
    autoenc_out = passthrough_fnc(x)
    logits = classifier(autoenc_out)
    loss = loss_fn(logits, y)
    loss_val = loss.item()
    
    return loss, loss_val


def calculate_valid_loss_classifier(autoenc, model, valid_dataset, loss_fn, passthrough_fnc):

    dataloader = torch.utils.data.DataLoader(valid_dataset, 1, shuffle=False)
    loss_list = []

    for x, y in tqdm(dataloader):
        x = x.to(DEVICE)
        y = y.type(torch.LongTensor).to(DEVICE)

        _, loss_val = pass_through_autoenc_classifier(x ,y, autoenc, model, loss_fn, passthrough_fnc)
        loss_list.append(loss_val)

    return np.mean(loss_list)


def train_classifier_model(autoenc_model, model, optimizer, train_dataset, valid_dataset, passthrough_fnc):
    model.train()

    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(CLASSIFIER_EPOCHS):
        print("\n Start of epoch {}/{}".format(epoch + 1, CLASSIFIER_EPOCHS))
        epoch_start_time = time.time()

        loss_list = []
        dataloader = torch.utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

        for x, y in tqdm(dataloader):
            x = x.to(DEVICE)
            y = y.type(torch.LongTensor).to(DEVICE)

            loss, loss_val = pass_through_autoenc_classifier(x, y, autoenc_model, model, loss_fn, passthrough_fnc)
            loss_list.append(loss_val)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss = np.mean(loss_list)
        valid_loss = calculate_valid_loss_classifier(autoenc_model, model, valid_dataset, loss_fn, passthrough_fnc)

        epoch_time = time.time() - epoch_start_time
        print(
            "\n Epoch {} complete in {:.2f}, train loss = {}, valid_loss = {}".format(epoch + 1, epoch_time, train_loss,
                                                                                      valid_loss))


train_classifier_model(autoenc_model, classifier_model, classifier_optim, train_dataset, valid_dataset, autoenc_model.forward)


 Start of epoch 1/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 1 complete in 63.14, train loss = 0.20845383417607632, valid_loss = 0.12625187396227966

 Start of epoch 2/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 2 complete in 62.79, train loss = 0.14405850605576426, valid_loss = 0.10728320379600989

 Start of epoch 3/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 3 complete in 63.18, train loss = 0.12969357884077076, valid_loss = 0.11257626201582739

 Start of epoch 4/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 4 complete in 63.29, train loss = 0.12091646724688597, valid_loss = 0.10945523221715188

 Start of epoch 5/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 5 complete in 63.26, train loss = 0.11294259583199551, valid_loss = 0.09936697888937486

 Start of epoch 6/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 6 complete in 62.70, train loss = 0.10441024219137802, valid_loss = 0.0956648403904479

 Start of epoch 7/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 7 complete in 63.14, train loss = 0.09633204626871959, valid_loss = 0.09858152457775643

 Start of epoch 8/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 8 complete in 63.29, train loss = 0.08856765683181024, valid_loss = 0.09238899106834482

 Start of epoch 9/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 9 complete in 63.04, train loss = 0.08179002478238313, valid_loss = 0.08778953249969189

 Start of epoch 10/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 10 complete in 62.86, train loss = 0.0764746375701377, valid_loss = 0.07892589919597054

 Start of epoch 11/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 11 complete in 62.64, train loss = 0.07244459808957318, valid_loss = 0.08230784407999828

 Start of epoch 12/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 12 complete in 62.82, train loss = 0.06912426506182624, valid_loss = 0.08256283040877258

 Start of epoch 13/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 13 complete in 63.18, train loss = 0.06655585283193984, valid_loss = 0.08583933449955115

 Start of epoch 14/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 14 complete in 62.84, train loss = 0.06437218811357039, valid_loss = 0.07498902253685555

 Start of epoch 15/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 15 complete in 63.43, train loss = 0.06241176080171481, valid_loss = 0.0762250896445968

 Start of epoch 16/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 16 complete in 63.19, train loss = 0.06058851009622987, valid_loss = 0.07662778569109033

 Start of epoch 17/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 17 complete in 63.20, train loss = 0.05895999054806673, valid_loss = 0.07651016210708339

 Start of epoch 18/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 18 complete in 63.16, train loss = 0.05765349567625696, valid_loss = 0.07091241220307537

 Start of epoch 19/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 19 complete in 63.13, train loss = 0.05644481265640241, valid_loss = 0.07365672396576116

 Start of epoch 20/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 20 complete in 62.89, train loss = 0.05533316114440874, valid_loss = 0.0685530654776854

 Start of epoch 21/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 21 complete in 62.98, train loss = 0.05449193940264804, valid_loss = 0.0690571961630937

 Start of epoch 22/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 22 complete in 62.98, train loss = 0.053678594842116775, valid_loss = 0.06694165089609248

 Start of epoch 23/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 23 complete in 62.96, train loss = 0.0529403340655564, valid_loss = 0.06368456986683639

 Start of epoch 24/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 24 complete in 63.35, train loss = 0.052339152005616306, valid_loss = 0.06307050945060894

 Start of epoch 25/25


  0%|          | 0/11655 [00:00<?, ?it/s]

  0%|          | 0/26701 [00:00<?, ?it/s]


 Epoch 25 complete in 63.04, train loss = 0.05173344231122039, valid_loss = 0.07230952305022821


In [15]:
def evaluate_predictions(auto_enc, model, dataset, passthrough_fnc):
    dataloader = torch.utils.data.DataLoader(dataset, 1, shuffle=False)

    predictions = []
    ground_truth = []
    for x, y in tqdm(dataloader):
        x = x.to(DEVICE)
        ground_truth.append(y.item())

        autoenc_out = passthrough_fnc(x)
        logits = model(autoenc_out)
        predictions.append(np.argmax(logits.detach().cpu().numpy()))
        
    return classification_report(ground_truth, predictions, target_names=['non-fraud', 'fraud'], digits=3,
                                 output_dict=True)

evaluate_predictions(autoenc_model, classifier_model, test_dataset, autoenc_model.forward)

  0%|          | 0/71202 [00:00<?, ?it/s]

{'non-fraud': {'precision': 0.9997688462538646,
  'recall': 0.9736749539206169,
  'f1-score': 0.986549386632072,
  'support': 71073},
 'fraud': {'precision': 0.05695564516129032,
  'recall': 0.875968992248062,
  'f1-score': 0.10695693327023188,
  'support': 129},
 'accuracy': 0.9734979354512514,
 'macro avg': {'precision': 0.5283622457075775,
  'recall': 0.9248219730843394,
  'f1-score': 0.5467531599511519,
  'support': 71202},
 'weighted avg': {'precision': 0.9980607073962351,
  'recall': 0.9734979354512514,
  'f1-score': 0.9849557877656965,
  'support': 71202}}

In [None]:
hidden_classifier = HiddenReprClassifier()
hiddenClassifier_optim = torch.optim.Adam(hidden_classifier.parameters(), lr=LEARNING_RATE_CLASSIFIER)

In [None]:
train_classifier_model(autoenc_model, hidden_classifier, hiddenClassifier_optim, train_dataset, valid_dataset, autoenc_model.get_enc)

In [None]:
evaluate_predictions(autoenc_model, hidden_classifier, test_dataset, autoenc_model.get_enc)

In [50]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

train_df = pd.read_parquet(DATA_PATH + 'train.parquet')
valid_df = pd.read_parquet(DATA_PATH + 'valid.parquet')
test_df = pd.read_parquet(DATA_PATH + 'test.parquet')


def get_x_y(df):
    y = df['Class'].astype(np.float32)
    X = df.drop('Class', axis=1).astype(np.float32)

    return X,y

train_X, train_y = get_x_y(train_df)

valid_X, valid_y = get_x_y(valid_df)
test_X, test_y = get_x_y(test_df)

X = pd.concat([train_X, valid_X])
y = pd.concat([train_y, valid_y])

X = autoenc_model(torch.tensor(X.to_numpy()).cuda()).cpu().detach().numpy()


rf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=42)
rf.fit(X,y)
predictions = rf.predict(test_X)
classification_report(test_y, predictions, target_names=['non-fraud', 'fraud'], digits=3,
                                 output_dict=True)

{'non-fraud': {'precision': 0.9994373646158607,
  'recall': 0.9997608025777039,
  'f1-score': 0.9995990574332643,
  'support': 71071},
 'fraud': {'precision': 0.8425925925925926,
  'recall': 0.6946564885496184,
  'f1-score': 0.7615062761506277,
  'support': 131},
 'accuracy': 0.9991994606893064,
 'macro avg': {'precision': 0.9210149786042267,
  'recall': 0.8472086455636612,
  'f1-score': 0.880552666791946,
  'support': 71202},
 'weighted avg': {'precision': 0.9991487959642071,
  'recall': 0.9991994606893064,
  'f1-score': 0.999161005772524,
  'support': 71202}}