**Phase 1: Data Preprocessing**

In [None]:
# Dependencies for this cell: Colab's Drive helper and pandas.
from google.colab import drive
import pandas as pd

# Make Google Drive visible under /content/drive in the Colab runtime.
drive.mount('/content/drive')

# Read the transactions CSV from Drive into `df` and display its first rows.
file_path = "/content/drive/My Drive/Fraud_Detection/creditcard.csv"
df = pd.read_csv(file_path)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,fraud
0,54063,-3.727627,3.80775,0.042464,-0.29561,-0.566886,1.122018,-2.786033,-10.34373,0.357416,...,9.689007,-3.963428,1.541998,0.178339,0.141429,0.139569,0.581307,0.191864,2.69,0
1,135019,-1.825818,-0.74136,0.246431,-3.477713,-0.527078,0.101212,0.490368,-0.333437,-1.258915,...,-0.405719,-0.059432,-0.067102,0.022992,0.265918,-0.409281,-0.572788,-0.315271,147.86,0
2,95682,1.689511,-0.25751,1.336604,4.303331,-0.756858,1.946686,-1.527325,0.548151,1.979609,...,-0.013362,0.608452,0.094575,-0.522768,-0.297246,0.050278,0.054399,-0.032195,40.2,0
3,38023,-0.393894,0.220968,1.221791,-0.487204,0.128807,0.046671,0.621301,0.098965,0.598315,...,-0.087401,-0.00952,-0.15437,-0.362853,-0.009611,0.362064,0.061287,-0.136786,49.9,0
4,82371,1.018111,-0.442078,1.672841,1.834509,-1.3383,0.443802,-0.878038,0.283587,1.636777,...,-0.109401,0.088993,-0.105478,0.413688,0.530205,-0.282166,0.095091,0.044359,49.5,0


In [None]:
# Show the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,fraud
302457,75718,-1.438013,1.298178,0.909734,-0.495798,0.25564,-0.766527,0.449792,0.361424,-0.492433,...,-0.141175,-0.479615,0.105791,0.038759,-0.299486,-0.087374,-0.033205,0.108152,4.99,0
302458,159214,0.256621,0.223809,0.975735,-0.220988,-0.270464,0.56808,-0.394989,-0.637367,0.416232,...,1.080174,0.763228,-0.220929,0.82497,0.789579,-0.064883,0.223809,0.257101,39.95,0
302459,79893,-0.138855,-0.976921,0.883736,-3.393556,1.142488,3.914621,-1.11922,0.86957,-1.539373,...,-0.228985,-0.147826,-0.093426,0.977747,-0.26992,-0.413884,0.018587,-0.078387,20.0,0
302460,88181,-0.422831,0.239428,1.512375,-1.075881,-0.809157,-0.089498,0.838381,-0.4631,-0.698558,...,-0.568205,-0.860065,-0.002557,-0.079295,-0.594139,-0.64411,-0.140223,-0.106076,123.75,0
302461,76475,-1.004046,1.05327,1.093808,-0.591014,0.516087,-0.41598,0.476134,0.305124,-0.844857,...,-0.198036,-0.798085,-0.105299,-0.624258,-0.033219,-0.054535,-0.06688,0.004584,14.95,0


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate features & target variable.
X = df.drop(columns=["fraud"])  # Features
y = df["fraud"]  # Target variable

# Split BEFORE scaling (80% train, 20% test). Stratifying on the label keeps the
# fraud ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise 'Time' and 'Amount'. The scaler is fitted on the training rows only
# and then applied to the other frames, so no test-set statistics leak into the
# features the models are trained on.
scale_cols = ["Time", "Amount"]
scaler = StandardScaler()

# Work on copies so the column assignments never write into a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# Keep the full feature frame scaled as well (with the train-fitted statistics),
# in case later cells read X directly.
X[scale_cols] = scaler.transform(X[scale_cols])

# Check dataset sizes and class distribution
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")
print(y_train.value_counts(normalize=True) * 100)
print(y_test.value_counts(normalize=True) * 100)

Training set: (241969, 30), Testing set: (60493, 30)
fraud
0    94.000058
1     5.999942
Name: proportion, dtype: float64
fraud
0    94.000959
1     5.999041
Name: proportion, dtype: float64


In [None]:
# No resampling happens in this cell: the "balanced" names are plain aliases of
# the training split produced above.
X_train_balanced = X_train
y_train_balanced = y_train

**Phase 2: Implement Selection Distribution Generator (SDG)**

In [None]:
!pip install torch torchvision torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np



In [None]:
class SDG(nn.Module):
    """Selection Distribution Generator: a small MLP that scores each input row.

    Architecture: ``input_dim -> 64 -> 32 -> 1`` with ReLU after the two hidden
    layers and a sigmoid on the output, so every row is mapped to one value in
    the open interval (0, 1).
    """

    def __init__(self, input_dim):
        """Build the three linear layers; ``input_dim`` is the feature count per row."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # one score per row
        self.sigmoid = nn.Sigmoid()  # squashes the score into (0, 1)

    def forward(self, x):
        """Return the selection probability for each row, shape ``(..., 1)``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.sigmoid(self.fc3(hidden))

In [None]:
class SDGTrainer:
    """Owns an ``SDG`` model, its Adam optimizer and a BCE loss, and runs training steps.

    Args:
        input_dim: number of features per transaction (passed to ``SDG``).
        lr: learning rate for Adam.
    """

    def __init__(self, input_dim, lr=0.001):
        self.model = SDG(input_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.BCELoss()  # Binary Cross-Entropy Loss for selection probability

    def train_step(self, X, y):
        """Run one optimisation step on the batch ``(X, y)``.

        Args:
            X: float tensor of features, one row per transaction.
            y: float tensor of 0/1 labels, one entry per row of ``X``.

        Returns:
            ``(loss, selection_probs)``: the scalar loss as a Python float and the
            per-row selection probabilities as a 1-D NumPy array.
        """
        self.optimizer.zero_grad()
        # Drop only the trailing unit dimension. A bare squeeze() would also
        # collapse a one-row batch to 0-dim, which BCELoss rejects because the
        # input and target shapes no longer match.
        selection_probs = self.model(X).squeeze(-1)

        # The reward is a training signal, not part of the differentiable graph.
        with torch.no_grad():
            selected = (selection_probs > 0.5).float()  # Thresholding for selection
            correct = selected == y
            # +1 for a correctly selected fraud, 0 for a correctly rejected legit
            # transaction, -1 for any incorrect selection.
            reward = torch.where(correct, y, -torch.ones_like(y))
            # Turn the mean reward (in [-1, 1]) into a non-negative loss scale in
            # [0, 2]: low reward -> stronger update. Multiplying the BCE by the raw
            # reward made the loss negative whenever the mean reward was negative,
            # which turned the step into gradient ascent on the BCE.
            penalty = 1.0 - reward.mean()

        loss = self.loss_fn(selection_probs, y) * penalty
        loss.backward()
        self.optimizer.step()

        return loss.item(), selection_probs.detach().cpu().numpy()

In [None]:
# Materialise the training split as float32 tensors (features and 0/1 labels).
X_train_tensor = torch.tensor(X_train_balanced.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_balanced.to_numpy(), dtype=torch.float32)

# One SDG trainer sized to the number of feature columns.
input_dim = X_train_balanced.shape[1]
sdg_trainer = SDGTrainer(input_dim)

# Full-batch training: every epoch is a single step over the whole training split.
num_epochs = 30
for epoch in range(num_epochs):
    loss, probs = sdg_trainer.train_step(X_train_tensor, y_train_tensor)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss:.4f}")

Epoch 1/30 - Loss: -0.2066
Epoch 2/30 - Loss: -0.1209
Epoch 3/30 - Loss: -0.0627
Epoch 4/30 - Loss: -0.0300
Epoch 5/30 - Loss: -0.0135
Epoch 6/30 - Loss: -0.0048
Epoch 7/30 - Loss: 0.0003
Epoch 8/30 - Loss: 0.0031
Epoch 9/30 - Loss: 0.0049
Epoch 10/30 - Loss: 0.0059
Epoch 11/30 - Loss: 0.0065
Epoch 12/30 - Loss: 0.0070
Epoch 13/30 - Loss: 0.0074
Epoch 14/30 - Loss: 0.0075
Epoch 15/30 - Loss: 0.0078
Epoch 16/30 - Loss: 0.0080
Epoch 17/30 - Loss: 0.0081
Epoch 18/30 - Loss: 0.0082
Epoch 19/30 - Loss: 0.0084
Epoch 20/30 - Loss: 0.0084
Epoch 21/30 - Loss: 0.0084
Epoch 22/30 - Loss: 0.0085
Epoch 23/30 - Loss: 0.0084
Epoch 24/30 - Loss: 0.0083
Epoch 25/30 - Loss: 0.0082
Epoch 26/30 - Loss: 0.0082
Epoch 27/30 - Loss: 0.0081
Epoch 28/30 - Loss: 0.0079
Epoch 29/30 - Loss: 0.0079
Epoch 30/30 - Loss: 0.0079


In [None]:
# Score every training row with the trained SDG (one probability per row).
selected_probs = sdg_trainer.model(X_train_tensor).detach().cpu().numpy()
# Positions of the rows whose score is above 0.5.
selected_indices = np.flatnonzero(selected_probs.reshape(-1) > 0.5)

# Keep only the selected rows, for both features and labels.
X_selected = X_train_balanced.iloc[selected_indices]
y_selected = y_train_balanced.iloc[selected_indices]

print(f"Selected Transactions: {len(X_selected)} out of {len(X_train_balanced)}")

Selected Transactions: 229834 out of 241969


Phase 3: Implement Transaction Fraud Detector (TFD) Using CNN + Attention Mechanism

In [None]:
!pip install torch torchvision torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader, TensorDataset



In [None]:
class AttentionLayer(nn.Module):
    """Scale the input by softmax attention weights computed along dimension 1.

    A single linear layer maps the last dimension (``input_dim``) to one score;
    the scores are normalised with a softmax over ``dim=1`` and multiplied back
    onto the input. For a 2-D input of shape ``(batch, input_dim)`` the score
    tensor is ``(batch, 1)``, so the softmax is taken over a single element and
    every weight equals 1.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.attention = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return ``x`` multiplied element-wise (with broadcasting) by its attention weights."""
        scores = self.attention(x)
        weights = scores.softmax(dim=1)
        return weights * x

class FraudDetectorCNN(nn.Module):
    """1-D CNN with attention pooling that outputs two class logits per transaction.

    The feature vector is treated as a one-channel sequence, passed through two
    convolutions (1 -> 16 -> 32 channels, kernel 3, padding 1), pooled over the
    sequence positions with learned attention weights, and classified by a
    32 -> 16 -> 2 MLP.

    Args:
        input_dim: number of features per transaction. Kept for interface
            compatibility; the layers do not depend on it.
    """

    def __init__(self, input_dim):
        super(FraudDetectorCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.attention = AttentionLayer(32)  # Attention Mechanism
        self.fc1 = nn.Linear(32, 16)
        self.fc2 = nn.Linear(16, 2)  # Two output classes: fraud (1) or legit (0)

    def forward(self, x):
        """Map a ``(batch, features)`` tensor to ``(batch, 2)`` raw logits."""
        x = x.unsqueeze(1)  # (batch, 1, features): add channel dimension for Conv1d
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))  # (batch, 32, features)

        # Attention pooling over the sequence positions. The attention layer
        # normalises its scores with a softmax over dim=1, so the positions must
        # sit on that axis. Applying it after mean pooling (to a (batch, 32)
        # tensor) made the softmax run over a single element: every weight was
        # 1.0, the layer was an identity and its parameters never trained.
        x = x.transpose(1, 2)  # (batch, features, 32)
        x = self.attention(x).sum(dim=1)  # weighted sum over positions -> (batch, 32)

        x = F.relu(self.fc1(x))
        return self.fc2(x)  # Raw logits; CrossEntropyLoss applies log-softmax itself

In [None]:
# Tensors for the SDG-selected rows: float32 features, int64 class labels.
X_selected_tensor = torch.tensor(X_selected.to_numpy(), dtype=torch.float32)
y_selected_tensor = torch.tensor(y_selected.to_numpy(), dtype=torch.long)

# Wrap them in a dataset and iterate in shuffled mini-batches of 32.
dataset = TensorDataset(X_selected_tensor, y_selected_tensor)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [None]:
# Model, optimizer and loss for the fraud detector.
input_dim = X_selected.shape[1]
model = FraudDetectorCNN(input_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()  # two-class cross-entropy on raw logits (fraud vs legit)

# Mini-batch training; the value printed per epoch is the SUM of the batch losses.
num_epochs = 30
for epoch in range(num_epochs):
    total_loss = 0
    for batch_X, batch_y in dataloader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

Epoch 1/30 - Loss: 437.3392
Epoch 2/30 - Loss: 359.2359
Epoch 3/30 - Loss: 321.2014
Epoch 4/30 - Loss: 294.5253
Epoch 5/30 - Loss: 274.8667
Epoch 6/30 - Loss: 254.2633
Epoch 7/30 - Loss: 237.3890
Epoch 8/30 - Loss: 222.5349
Epoch 9/30 - Loss: 205.1639
Epoch 10/30 - Loss: 194.7106
Epoch 11/30 - Loss: 175.8184
Epoch 12/30 - Loss: 158.4937
Epoch 13/30 - Loss: 143.0074
Epoch 14/30 - Loss: 129.6932
Epoch 15/30 - Loss: 117.0060
Epoch 16/30 - Loss: 108.6152
Epoch 17/30 - Loss: 94.7832
Epoch 18/30 - Loss: 92.8693
Epoch 19/30 - Loss: 82.8323
Epoch 20/30 - Loss: 77.3802
Epoch 21/30 - Loss: 73.5021
Epoch 22/30 - Loss: 65.2049
Epoch 23/30 - Loss: 60.4256
Epoch 24/30 - Loss: 56.5638
Epoch 25/30 - Loss: 54.5179
Epoch 26/30 - Loss: 48.8959
Epoch 27/30 - Loss: 46.8980
Epoch 28/30 - Loss: 46.6295
Epoch 29/30 - Loss: 45.8851
Epoch 30/30 - Loss: 37.7560


In [None]:
# Held-out split as tensors: float32 features, int64 labels.
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Forward pass without gradient tracking, then take the higher-scoring class.
with torch.no_grad():
    test_outputs = model(X_test_tensor)
predictions = test_outputs.argmax(dim=1)

# Fraction of test rows whose predicted class equals the label.
accuracy = predictions.eq(y_test_tensor).float().mean().item()
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 96.69%


Phase 4: Joint Training of SDG & TFD Using Reinforcement Learning

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from torch.distributions import Categorical

In [None]:
def compute_reward(predictions, labels):
    """Return the mean per-sample reward for a batch of predictions.

    Per-sample reward:
        +1  the prediction is correct and the sample is fraud (label 1),
         0  the prediction is correct and the sample is legit (label 0),
        -1  the prediction is incorrect.

    Args:
        predictions: tensor of predicted class ids (0 or 1).
        labels: tensor of true class ids (0 or 1), same shape; any numeric dtype.

    Returns:
        0-dim float tensor in [-1, 1] (NaN for an empty batch).
    """
    labels = labels.float()
    correct = predictions == labels
    # Correct predictions earn their label value (1 for fraud, 0 for legit);
    # every mistake costs -1. The earlier formula, (2 * labels - 1) * correct,
    # gave mistakes 0 and penalised correct legit predictions with -1.
    reward = torch.where(correct, labels, -torch.ones_like(labels))
    return reward.mean()

class SDGTrainerRL:
    """SDG trainer whose update strength is driven by an externally supplied reward.

    Args:
        input_dim: number of features per transaction (passed to ``SDG``).
        lr: learning rate for Adam.
    """

    def __init__(self, input_dim, lr=0.001):
        self.model = SDG(input_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.BCELoss()  # Binary Cross-Entropy Loss for selection probability

    def train_step(self, X, y, reward):
        """Run one optimisation step on ``(X, y)`` scaled by ``reward``.

        Args:
            X: float tensor of features, one row per transaction.
            y: float tensor of 0/1 labels, one entry per row of ``X``.
            reward: scalar (Python float or 0-dim tensor), expected in [-1, 1].
                It is treated as a constant; no gradient flows through it.

        Returns:
            The scalar loss as a Python float (always >= 0).
        """
        self.optimizer.zero_grad()
        # Drop only the trailing unit dimension so a one-row batch stays 1-D and
        # matches the shape of y (BCELoss rejects mismatched shapes).
        selection_probs = self.model(X).squeeze(-1)

        if torch.is_tensor(reward):
            reward = reward.detach()
        # Map the reward to a non-negative loss scale: low reward -> stronger
        # update, high reward -> weaker update. Multiplying the BCE by the raw
        # reward flipped the sign of the loss whenever the reward was negative,
        # so the optimizer maximised the BCE instead of minimising it.
        penalty = torch.as_tensor(1.0 - reward).clamp(min=0.0)

        loss = self.loss_fn(selection_probs, y) * penalty
        loss.backward()
        self.optimizer.step()

        return loss.item()

In [None]:
# Initialize SDG and TFD
input_dim = X_train.shape[1]
sdg_trainer = SDGTrainerRL(input_dim)
fraud_detector = FraudDetectorCNN(input_dim)
optimizer_tfd = optim.Adam(fraud_detector.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Joint training: for each mini-batch the SDG picks rows, the TFD trains on the
# picked rows, and the TFD's result on them is fed back to the SDG as a reward.
# NOTE(review): `dataloader` is the Phase 3 loader built from the rows already
# selected by the Phase 2 SDG, not the full training split - confirm that this
# is the intended input for joint training.
num_epochs = 30
for epoch in range(num_epochs):
    total_loss_sdg = 0
    total_loss_tfd = 0

    for batch_X, batch_y in dataloader:
        # The SDG's BCELoss needs float targets; the TFD's CrossEntropyLoss
        # needs integer class ids, so the labels are cast back for it below.
        batch_y = batch_y.float()

        # SDG selects transactions. Selection is a hard threshold, so no
        # gradient is needed here; squeeze(-1) keeps a one-row batch 1-D so the
        # boolean mask still indexes rows.
        with torch.no_grad():
            selection_probs = sdg_trainer.model(batch_X).squeeze(-1)
        selected = (selection_probs > 0.5).float()
        selected_mask = selected.bool()

        # Filter selected transactions
        batch_X_selected = batch_X[selected_mask]
        batch_y_selected = batch_y[selected_mask]

        if len(batch_X_selected) == 0:
            continue  # Skip if no transactions selected

        # Train TFD on selected transactions
        optimizer_tfd.zero_grad()
        outputs = fraud_detector(batch_X_selected)
        loss_tfd = criterion(outputs, batch_y_selected.long())
        loss_tfd.backward()
        optimizer_tfd.step()

        # Compute reward from the TFD's predictions on the selected rows
        # (taken from the logits produced before the optimizer step) and
        # train the SDG on the whole batch.
        predictions = torch.argmax(outputs.detach(), dim=1)
        reward = compute_reward(predictions, batch_y_selected)
        loss_sdg = sdg_trainer.train_step(batch_X, batch_y, reward)

        total_loss_sdg += loss_sdg
        total_loss_tfd += loss_tfd.item()

    print(f"Epoch {epoch+1}/{num_epochs} - SDG Loss: {total_loss_sdg:.4f} | TFD Loss: {total_loss_tfd:.4f}")

Epoch 1/30 - SDG Loss: -684640.0018 | TFD Loss: 450.0733
Epoch 2/30 - SDG Loss: -698287.6953 | TFD Loss: 362.8299
Epoch 3/30 - SDG Loss: -696473.2422 | TFD Loss: 322.0054
Epoch 4/30 - SDG Loss: -695799.3164 | TFD Loss: 296.2408
Epoch 5/30 - SDG Loss: -695241.9922 | TFD Loss: 272.9539
Epoch 6/30 - SDG Loss: -694693.6953 | TFD Loss: 248.6884
Epoch 7/30 - SDG Loss: -694173.6328 | TFD Loss: 224.5929
Epoch 8/30 - SDG Loss: -693675.0000 | TFD Loss: 203.3104
Epoch 9/30 - SDG Loss: -693278.0273 | TFD Loss: 186.3622
Epoch 10/30 - SDG Loss: -693088.6172 | TFD Loss: 168.2457
Epoch 11/30 - SDG Loss: -692542.4805 | TFD Loss: 151.6713
Epoch 12/30 - SDG Loss: -692144.1406 | TFD Loss: 134.7756
Epoch 13/30 - SDG Loss: -691849.8047 | TFD Loss: 121.9140
Epoch 14/30 - SDG Loss: -691417.8711 | TFD Loss: 109.6871
Epoch 15/30 - SDG Loss: -691168.3867 | TFD Loss: 99.5224
Epoch 16/30 - SDG Loss: -690832.4219 | TFD Loss: 88.1853
Epoch 17/30 - SDG Loss: -690673.6602 | TFD Loss: 82.2151
Epoch 18/30 - SDG Loss: -6

In [None]:
# Held-out split as tensors: float32 features, int64 labels.
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Score the test rows with the jointly trained TFD; no gradients are needed.
with torch.no_grad():
    test_outputs = fraud_detector(X_test_tensor)
predictions = test_outputs.argmax(dim=1)

# Fraction of test rows whose predicted class equals the label.
accuracy = predictions.eq(y_test_tensor).float().mean().item()
print(f"Final Test Accuracy: {accuracy * 100:.2f}%")

Final Test Accuracy: 98.06%


Phase 5: Model Evaluation


In [None]:
!pip install scikit-learn  # Install scikit-learn if not already installed
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score



In [None]:
# Predict the class of every test row with the TFD and hand the result to NumPy.
with torch.no_grad():
    test_outputs = fraud_detector(X_test_tensor)
y_pred = test_outputs.argmax(dim=1).numpy()

# Count how many rows were assigned to each class.
unique, counts = np.unique(y_pred, return_counts=True)
print("Prediction Distribution:", {label: count for label, count in zip(unique, counts)})


Prediction Distribution: {np.int64(0): np.int64(57767), np.int64(1): np.int64(2726)}


In [None]:
import numpy as np

# Same per-class count as the previous cell, recomputed from the existing `y_pred`.
unique, counts = np.unique(y_pred, return_counts=True)
print("Prediction Distribution:", {label: count for label, count in zip(unique, counts)})


Prediction Distribution: {np.int64(0): np.int64(57767), np.int64(1): np.int64(2726)}


In [None]:
# Get predictions for the test set
with torch.no_grad():
    test_outputs = fraud_detector(X_test_tensor)  # raw logits, shape (n_rows, 2)
    predictions = torch.argmax(test_outputs, dim=1)
    # Probability of the fraud class. The raw class-1 logit on its own is not a
    # valid ranking score: P(fraud) depends on the difference between the two
    # logits, so the softmax probability is what AUC must be computed from.
    fraud_probs = torch.softmax(test_outputs, dim=1)[:, 1]

# Convert predictions and ground truth to numpy arrays
y_pred = predictions.numpy()
y_true = y_test_tensor.numpy()

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, fraud_probs.numpy())  # Use the probability for the positive class

# Print evaluation results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"AUC: {auc:.4f}")

Accuracy: 98.06%
Precision: 95.05%
Recall: 71.40%
F1-Score: 81.54%
AUC: 0.8559


In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# --- Random Forest baseline: fit on the training split, score the test split ---
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_fraud_proba = rf_model.predict_proba(X_test)[:, 1]  # P(fraud) per test row

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)
rf_auc = roc_auc_score(y_test, rf_fraud_proba)

# --- XGBoost baseline: same protocol ---
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_fraud_proba = xgb_model.predict_proba(X_test)[:, 1]  # P(fraud) per test row

xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_precision = precision_score(y_test, xgb_predictions)
xgb_recall = recall_score(y_test, xgb_predictions)
xgb_f1 = f1_score(y_test, xgb_predictions)
xgb_auc = roc_auc_score(y_test, xgb_fraud_proba)

# --- Report both baselines ---
print("\nRandom Forest Performance:")
print(f"Accuracy: {rf_accuracy * 100:.2f}%")
print(f"Precision: {rf_precision * 100:.2f}%")
print(f"Recall: {rf_recall * 100:.2f}%")
print(f"F1-Score: {rf_f1 * 100:.2f}%")
print(f"AUC: {rf_auc:.4f}")

print("\nXGBoost Performance:")
print(f"Accuracy: {xgb_accuracy * 100:.2f}%")
print(f"Precision: {xgb_precision * 100:.2f}%")
print(f"Recall: {xgb_recall * 100:.2f}%")
print(f"F1-Score: {xgb_f1 * 100:.2f}%")
print(f"AUC: {xgb_auc:.4f}")


Random Forest Performance:
Accuracy: 99.99%
Precision: 99.86%
Recall: 100.00%
F1-Score: 99.93%
AUC: 1.0000

XGBoost Performance:
Accuracy: 99.99%
Precision: 99.89%
Recall: 100.00%
F1-Score: 99.94%
AUC: 1.0000
