<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Financial-Transactions" data-toc-modified-id="Financial-Transactions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Financial Transactions</a></span><ul class="toc-item"><li><span><a href="#The-Leaderboard-Predict-function" data-toc-modified-id="The-Leaderboard-Predict-function-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>The Leaderboard Predict function</a></span></li><li><span><a href="#Testing-your-Implementation" data-toc-modified-id="Testing-your-Implementation-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Testing your Implementation</a></span></li></ul></li></ul></div>

# Financial Transactions

The ability to identify fraudulent transactions is of great interest to the payments industry. In this notebook, you will make use of the binary classifier you trained on the transcations dataset to detect fraud.

In [43]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score
import pathlib
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
from torchvision import datasets, transforms


In [45]:
path = "/data/mlproject22" if os.path.exists("/data/mlproject22") else "."
train_data = pd.read_csv(os.path.join(path, "transactions.csv.zip"))
X_train = train_data.drop(columns = "Class")
y_train = train_data["Class"]

## The Leaderboard Predict function
Replace the comment and `NotImplementedError` in the `leader_board_predict_fn` with code that loads your model parameters and returns the likelyhood of fraud for each transaction (i.e. row) in the values dataframe. Note that the returned array should contain a single decision function value for each transaction, indicating whether the transaction is fraudulent (i.e. it belongs to target class $1$). The higher the decision function value, the more likely that the transaction is fraud.
You can import the packages you require.

In [None]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.view(-1, self.input_size)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x  

In [61]:
# Define hyperparameters
LEARNING_RATE = 0.001
MOMENTUM = 0.9
NUM_EPOCHS = 1
HIDDEN_SIZE = 100
TRAIN_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64
INPUT_SIZE = 784  
OUTPUT_SIZE = 10  

In [70]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [62]:
from torch.utils.data import TensorDataset

# Convert your DataFrame to tensors
train_data = TensorDataset(torch.tensor(X_train.values), torch.tensor(y_train.values))
test_data = TensorDataset(torch.tensor(X_test.values), torch.tensor(y_test.values))

# Create your DataLoaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, drop_last=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=TEST_BATCH_SIZE, shuffle=False, drop_last=False)

In [63]:
num_train_batches = len(train_loader)
num_test_batches = len(test_loader)

print(f"num_train_batches: {num_train_batches}, num_test_batches: {num_test_batches}")

num_train_batches: 3560, num_test_batches: 3561


In [64]:
net_mnist = Net(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)

criterion = nn.CrossEntropyLoss()  
optimizer = optim.SGD(net_mnist.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

In [65]:
summary(net_mnist, input_size=(INPUT_SIZE,), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 100]          78,500
            Linear-2                  [-1, 100]          10,100
            Linear-3                   [-1, 10]           1,010
Total params: 89,610
Trainable params: 89,610
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.34
Estimated Total Size (MB): 0.35
----------------------------------------------------------------


In [66]:
def train_neural_network_pytorch_minibatch(net, train_loader, optimizer, criterion, num_epochs):

    net.train() 

    for epoch in range(num_epochs):
        for batch_idx, (data, target) in enumerate(tqdm(train_loader)):
            
            optimizer.zero_grad()
            outputs = net(data)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
        

In [67]:
def calc_accuracy_minibatch(net, data_loader):
    """
    Calculates the overall accuracy by using minibatches
    """
    net.eval()
    correct = 0
    with torch.no_grad():
        for data, target in data_loader:
            output = net(data)
            pred = output.argmax(dim=1, keepdim=True)  
            correct += pred.eq(target.view_as(pred)).sum().item()

    accuracy = correct/len(data_loader.dataset)
    return accuracy

In [68]:
train_neural_network_pytorch_minibatch(net_mnist, train_loader, optimizer, criterion, NUM_EPOCHS)

  0%|          | 0/3560 [00:00<?, ?it/s]

KeyError: 117323

In [7]:
def leader_board_predict_fn(values):
    
    decision_function_values = np.zeros(values.shape[0])

     
    model = joblib.load("./logistic_regression_model.pkl")
    return model.predict(values)
    
    return decision_function_values

## Testing your Implementation
Your model should return the probability or decision function value that indicates the likelyhood of fraud for each input transaction. To verify that this is the case, we run your model on a subset of the transactions dataset it was trained on. There is a hidden cell that performs the actual test on the unseen test set and computes your score for the leaderboard using the [ROC AUC](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html) score.

In [4]:
def get_score():
    """
    Function to compute scores for train and test datasets.
    """
    import pandas as pd
    import numpy as np
    import os
    from sklearn.metrics import roc_auc_score
    import pathlib

    try:
        path = "/data/mlproject22" if os.path.exists("/data/mlproject22") else "."
        test_data = pd.read_csv(os.path.join(path, "transactions.csv.zip"))
        X_test = test_data.drop(columns = "Class")
        y_test = test_data["Class"]
        decision_function_values = leader_board_predict_fn(X_test)
        assert decision_function_values.shape == (X_test.shape[0],)
        dataset_score = roc_auc_score(y_test, decision_function_values)
        assert dataset_score >= 0.0 and dataset_score <= 1.0
    except Exception:
        dataset_score = float("nan")
    print(f"Train Dataset Score: {dataset_score}")

    import os
    import pwd
    import time
    import datetime
    import pandas as pd
    user_id = pwd.getpwuid( os.getuid() ).pw_name
    curtime = time.time()
    dt_now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

    try:
        HIDDEN_DATASET_PATH = os.path.expanduser("/data/mlproject22-test-data")
        test_data = pd.read_csv(os.path.join(HIDDEN_DATASET_PATH,"transactions_scoreboard.csv.zip"))
        X_test = test_data.drop(columns=["Class"])
        y_test = test_data["Class"]
        decision_function_values = leader_board_predict_fn(X_test)
        hiddendataset_score = roc_auc_score(y_test, decision_function_values)
        print(f"Test Dataset Score: {hiddendataset_score}")
        score_dict = dict(
            score_hidden=hiddendataset_score,
            score_train=dataset_score,
            unixtime=curtime,
            user=user_id,
            dt=dt_now,
            comment="",
        )
    except Exception as e:
        err = str(e)
        score_dict = dict(
            score_hidden=float("nan"),
            score_train=dataset_score,
            unixtime=curtime,
            user=user_id,
            dt=dt_now,
            comment=err
        )

    if list(pathlib.Path(os.getcwd()).parents)[0].name == 'source':
        print("we are in the source directory... replacing values.")
        print(pd.DataFrame([score_dict]))
        score_dict["score_hidden"] = -1
        score_dict["score_train"] = -1
        print("new values:")
        print(pd.DataFrame([score_dict]))

    pd.DataFrame([score_dict]).to_csv("transactions.csv", index=False)
    
get_score()

Train Dataset Score: 0.5
