# Fraud Detection model with Differential Privacy using Opacus Framework

In [1]:
import pandas as pd

url ="fraud_transactions.csv"

df_actual = pd.read_csv(url, sep=",")
df_actual.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,TX_FRAUD.1
0,0,0,2023-02-01 00:43:37,901,8047,82,1,1
1,1,1,2023-02-01 01:20:13,2611,7777,15,0,0
2,2,2,2023-02-01 01:22:52,4212,3336,53,0,0
3,3,3,2023-02-01 01:26:40,1293,7432,59,0,0
4,4,4,2023-02-01 01:52:23,2499,1024,25,0,0


In [2]:
df_transactions = df_actual[['CUSTOMER_ID','TERMINAL_ID','TX_AMOUNT','TX_FRAUD']]
#df_transactions=df_transactions.head(10)
df_transactions=df_transactions.head(50000)
df_transactions

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,901,8047,82,1
1,2611,7777,15,0
2,4212,3336,53,0
3,1293,7432,59,0
4,2499,1024,25,0
...,...,...,...,...
49995,1541,3469,66,0
49996,489,3854,89,1
49997,4125,7519,12,0
49998,3360,5215,86,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

print("No of Fraud Transactions:", df_transactions['TX_FRAUD'].value_counts()[0])
print("No of Non Fraud Transactions:", df_transactions['TX_FRAUD'].value_counts()[1])

print('No Frauds', round(df_transactions['TX_FRAUD'].value_counts()[0]/len(df_transactions) * 100,2), '% of the dataset')
print('Frauds', round(df_transactions['TX_FRAUD'].value_counts()[1]/len(df_transactions) * 100,2), '% of the dataset')



X = df_transactions.drop('TX_FRAUD', axis=1)
y = df_transactions['TX_FRAUD']



No of Fraud Transactions: 37870
No of Non Fraud Transactions: 12130
No Frauds 75.74 % of the dataset
Frauds 24.26 % of the dataset


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [5]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.4648, 0.7065, 0.4202],
        [0.9944, 0.6119, 0.7367],
        [0.8882, 0.4868, 0.5254],
        [0.2498, 0.8801, 0.7946],
        [0.0160, 0.7540, 0.7839]])


In [6]:
x_train = torch.FloatTensor(X_train.values)
x_test = torch.FloatTensor(X_test.values)
y_train = torch.FloatTensor(y_train.values)
y_test = torch.FloatTensor(y_test.values)

In [7]:
if torch.cuda.is_available():
    DEVICE = "cuda" 
else:
    DEVICE = "cpu"
print("Selected device is",DEVICE)

Selected device is cpu


In [8]:
class FraudDataset(torch.utils.data.Dataset):
    
    def __init__(self, x, y):
        'Initialization'
        self.x = x
        self.y = y

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        if self.y is not None:
            return self.x[index].to(DEVICE), self.y[index].to(DEVICE)
        else:
            return self.x[index].to(DEVICE)

In [9]:
train_loader_params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
test_loader_params = {'batch_size': 64,
          'num_workers': 0}

# Generators

training_set = FraudDataset(x_train, y_train)

testing_set = FraudDataset(x_test, y_test)


train_loader = torch.utils.data.DataLoader(training_set, **train_loader_params)
test_loader = torch.utils.data.DataLoader(testing_set, **test_loader_params)

In [10]:
class SimpleFraudMLP(torch.nn.Module):
    
    def __init__(self):
        
        super().__init__()
        self.first_sec = torch.nn.Sequential(
                           torch.nn.Linear(3, 450),
                           torch.nn.ReLU(),
                         )
        self.second_sec = torch.nn.Sequential(
                           torch.nn.Linear(450, 450),
                           torch.nn.ReLU(),
                           torch.nn.Linear(450, 1),
                           torch.nn.Sigmoid(),
                         )
        
    def forward(self, x):
        return self.second_sec(self.first_sec(x))


In [11]:
fraud_nn_model = SimpleFraudMLP().to(DEVICE)

In [12]:
import warnings
warnings.simplefilter("ignore")

MAX_GRAD_NORM = 1.2
EPSILON = 90.0
DELTA = 1e-5
EPOCHS = 20

LR = 1e-3

from opacus.validators import ModuleValidator
errors = ModuleValidator.validate(fraud_nn_model, strict=False)
errors[-5:]


[]

In [13]:
from torch import nn, optim
#loss_func = nn.CrossEntropyLoss()   
loss_func = torch.nn.BCELoss().to(DEVICE)
loss_func


BCELoss()

In [14]:
#optimizer = optim.Adam(fraud_nn_model.parameters(), lr = 0.01)  
optimizer = torch.optim.SGD(fraud_nn_model.parameters(), lr = 0.07)
optimizer


SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.07
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [15]:
fraud_nn_model.eval()

SimpleFraudMLP(
  (first_sec): Sequential(
    (0): Linear(in_features=3, out_features=450, bias=True)
    (1): ReLU()
  )
  (second_sec): Sequential(
    (0): Linear(in_features=450, out_features=450, bias=True)
    (1): ReLU()
    (2): Linear(in_features=450, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [16]:
from opacus import PrivacyEngine

fraud_nn_model.train()

privacy_engine = PrivacyEngine()


model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    module=fraud_nn_model,
    optimizer=optimizer,
    data_loader=train_loader,
    epochs=EPOCHS,
    target_epsilon=EPSILON,
    target_delta=DELTA,
    max_grad_norm=MAX_GRAD_NORM,
)
print(f"Using sigma={optimizer.noise_multiplier} and C={MAX_GRAD_NORM}")


Using sigma=0.2883148193359375 and C=1.2


In [17]:
import numpy as np
import time

n_epochs = 10
#Setting the model in training mode
fraud_nn_model.train()

#Training loop
start_time=time.time()
epochs_train_losses = []
epochs_test_losses = []

for epoch in range(n_epochs):
    train_loss=[]
    train_loss1=0
    for x_batch, y_batch in train_loader:
        
        fraud_nn_model.train()
        
        y_pred = fraud_nn_model(x_batch)
       
        loss = loss_func(y_pred.squeeze(), y_batch)
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
       
        train_loss.append(loss.item())
        
        train_loss1 += loss.item()*x_batch.size(0)
    
    
    epsilon = privacy_engine.get_epsilon(DELTA)
    print('ε epsilon{}     : delta:{}'.format(epsilon, DELTA))
    
   
    epochs_train_losses.append(np.mean(train_loss))
    print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
    
    

ε epsilon27.552710036487476     : delta:1e-05
Epoch 0: train loss: 23.35688462161495
ε epsilon33.98911164791893     : delta:1e-05
Epoch 1: train loss: 22.910238819540744
ε epsilon38.786904746786384     : delta:1e-05
Epoch 2: train loss: 22.516301270594763
ε epsilon42.819749628256126     : delta:1e-05
Epoch 3: train loss: 23.105902637816435
ε epsilon46.852594509725854     : delta:1e-05
Epoch 4: train loss: 23.107947564866034
ε epsilon50.8854393911956     : delta:1e-05
Epoch 5: train loss: 22.70367295040291
ε epsilon54.91828427266533     : delta:1e-05
Epoch 6: train loss: 23.265120337606565
ε epsilon57.62100410282992     : delta:1e-05
Epoch 7: train loss: 22.029040902362663
ε epsilon60.31918791574003     : delta:1e-05
Epoch 8: train loss: 22.633963522135232
ε epsilon63.01737172865014     : delta:1e-05
Epoch 9: train loss: 22.68892878108748
