### Import

In [None]:
import torch
from Dataset.bank_dataset import BankTxnDataset, pad_collate_fn
from Models.transformer import TransformerClassifier
from Config.config import load_config
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import tqdm

### Loading Config

In [2]:
# Read the project configuration, then pick the compute device: CUDA when
# PyTorch reports it as available, CPU otherwise.
cfg = load_config()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# The ANSI escape codes colour the device name green in the cell output.
print(f"Current device: \033[92m{device}\033[0m")

Current device: [92mcuda[0m


### Loading Dataset

In [3]:
# Build the train/validation splits using the split ratio from the config.
# The test split is constructed later, in the Testing section.
split_kwargs = dict(val_ratio=cfg.dataset['validateSplit'])
train_ds = BankTxnDataset(cfg, split="train", **split_kwargs)
val_ds = BankTxnDataset(cfg, split="val", **split_kwargs)
print(f"Total number of training data: \033[92m{len(train_ds.data)}\033[0m")

# Settings shared by both loaders; only the shuffle flag differs.
loader_kwargs = dict(
    batch_size=cfg.parameter['batchSize'],
    num_workers=4,
    pin_memory=True,                # speeds host→GPU copies
    collate_fn=pad_collate_fn,
)

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# test_loader = DataLoader(
#     test_ds,
#     batch_size=cfg.parameter['batchSize'],
#     shuffle=False,
#     num_workers=4,
#     pin_memory=True,
#     collate_fn=pad_collate_fn
# )


Total number of training data: [92m22472[0m


In [None]:
# Pull one collated batch to discover the per-step feature width from the data
# itself instead of hard-coding it; the model cell below consumes this value.
sample_batch = next(iter(train_loader))
x_sample = sample_batch[0]          # first element of the collated batch
print(f"Input tensor shape: {x_sample.shape}")
actual_feat_dim = x_sample.size(2)  # size of the third axis of the input tensor
print(f"Feature dimension from data: {actual_feat_dim}")

### Loading Model & Optimizer

In [None]:
# New cell: Custom F1 Loss and Combined Loss Function
import torch.nn.functional as F

def f1_loss(logits, labels, epsilon=1e-7):
    """Differentiable ("soft") F1 loss for binary targets.

    Sigmoid probabilities stand in for hard predictions, so the confusion
    counts are sums of probabilities and the result stays differentiable.
    The counts are summed over every element of the inputs.

    Args:
        logits: Raw (pre-sigmoid) scores.
        labels: Targets with the same shape as ``logits``; cast to float.
        epsilon: Small constant added to each denominator to avoid division
            by zero.

    Returns:
        Scalar tensor equal to ``1 - soft_F1``.
    """
    targets = labels.float()
    scores = torch.sigmoid(logits)

    # Soft confusion-matrix entries.
    soft_tp = torch.sum(scores * targets)
    soft_fp = torch.sum(scores * (1 - targets))
    soft_fn = torch.sum((1 - scores) * targets)

    precision = soft_tp / (soft_tp + soft_fp + epsilon)
    recall = soft_tp / (soft_tp + soft_fn + epsilon)

    # Harmonic mean of precision and recall, turned into a loss.
    soft_f1 = 2 * precision * recall / (precision + recall + epsilon)
    return 1 - soft_f1

# Optional: Combine BCE with F1 loss
def combined_loss(logits, labels, alpha=0.5):
    """Weighted blend of binary cross-entropy and the soft F1 loss.

    Args:
        logits: Raw (pre-sigmoid) scores.
        labels: Targets with the same shape as ``logits``.
        alpha: Weight of the BCE term; the F1 term receives ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * BCE + (1 - alpha) * f1_loss``.
    """
    soft_f1_term = f1_loss(logits, labels)
    bce_term = F.binary_cross_entropy_with_logits(logits, labels.float())
    return alpha * bce_term + (1 - alpha) * soft_f1_term

In [None]:
import torch.nn as nn

# Single-logit classifier sized from the config and the feature width that was
# measured on a real batch.
model = TransformerClassifier(
    feat_dim=actual_feat_dim,
    d_model=cfg.parameter['d_model'],
    nhead=cfg.parameter['attention_head'],
    num_layers=cfg.parameter['num_layers'],
    num_classes=1,
)
model = model.to(device)

# Class-imbalance ratio (negatives per positive) over the training split.
# It is only consumed by the commented-out BCEWithLogitsLoss alternative below;
# combined_loss takes no pos_weight argument.
labels = [target.item() for _features, target in train_ds]
num_pos = sum(labels)
num_neg = len(labels) - num_pos
pos_weight = torch.tensor([num_neg / max(num_pos, 1)], device=device)

# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
criterion = combined_loss
scaler    = torch.amp.GradScaler()  # optional mixed‑precision
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.parameter['learningRate'])

### Training

In [6]:
# Add imports for metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define evaluation function
def evaluate(model, data_loader, device, loss_fn=None):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Classifier called as ``model(x, src_key_padding_mask=mask)``.
        data_loader: Iterable yielding ``(x, lengths, y)`` batches.
        device: ``torch.device`` the batches are moved to.
        loss_fn: Callable ``loss_fn(logits, y)``. When omitted, the
            notebook-level ``criterion`` is used.

    Returns:
        Tuple ``(mean_batch_loss, accuracy, precision, recall, f1)``; the
        metrics use a 0.5 threshold on the sigmoid probabilities.

    Raises:
        ValueError: If ``data_loader`` yields no batches.

    Note:
        The model is left in eval mode; callers that keep training must call
        ``model.train()`` again.
    """
    if loss_fn is None:
        loss_fn = criterion  # resolved at call time from the notebook namespace

    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x, lengths, y in data_loader:
            x, lengths, y = x.to(device), lengths.to(device), y.to(device)

            # True marks padded positions (index >= that sequence's length).
            padding_mask = (torch.arange(x.size(1), device=device)
                            .unsqueeze(0)
                            .ge(lengths.unsqueeze(1)))

            with torch.amp.autocast(device_type=device.type):
                logits = model(x, src_key_padding_mask=padding_mask)
                # Only squeeze the last dimension, preserving batch dimension
                loss = loss_fn(logits.squeeze(-1), y)

            total_loss += loss.item()
            num_batches += 1

            # Hard predictions at the 0.5 probability threshold.
            preds = torch.sigmoid(logits.squeeze(-1)) >= 0.5

            # reshape(-1) flattens 0-dim and 1-dim tensors alike, so no
            # special-casing of single-element batches is needed.
            all_preds.extend(preds.reshape(-1).cpu().tolist())
            all_labels.extend(y.reshape(-1).cpu().tolist())

    if num_batches == 0:
        raise ValueError("evaluate() received an empty data_loader; there is nothing to score.")

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    # Loss is averaged over batches, not over individual samples.
    return total_loss / num_batches, accuracy, precision, recall, f1

In [None]:
import copy
import random
from torch.utils.data import Subset

# Train an ensemble of independently initialised classifiers, each on its own
# random subset of the training data. For every member the weights of its best
# validation epoch are kept.
epochs = cfg.parameter['epochs']
print(f"Starting training for {epochs} epochs on {device}")

# Per-epoch history read by the plotting cell. Each list ends up with one entry
# per epoch, averaged over the ensemble members.
train_losses = []
val_losses = []
val_metrics = []
# Best single member across the whole ensemble (lowest validation loss).
best_val_loss = float('inf')
best_model_state = None

ensemble_size = 5
ensemble_models = []
subset_size = int(len(train_ds)*2/5)
num_epochs = epochs
member_histories = []  # one list of per-epoch records per ensemble member

for b in range(ensemble_size):
    print(f"Training model {b+1}/{ensemble_size}")
    # Select a random subset (without replacement) of size 2/5 of the training data
    indices = random.sample(range(len(train_ds)), subset_size)
    subset_ds = Subset(train_ds, indices)
    subset_loader = DataLoader(
        subset_ds,
        batch_size=cfg.parameter['batchSize'],
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        collate_fn=pad_collate_fn
    )

    # Instantiate a new model for this ensemble member:
    model_b = TransformerClassifier(
        feat_dim=actual_feat_dim,
        d_model=cfg.parameter['d_model'],
        nhead=cfg.parameter['attention_head'],
        num_layers=cfg.parameter['num_layers'],
        num_classes=1,
    ).to(device)

    optimizer_b = torch.optim.Adam(model_b.parameters(), lr=cfg.parameter['learningRate'])
    scaler_b    = torch.amp.GradScaler()

    best_val_loss_b = float('inf')
    best_model_state_b = None
    history_b = []

    # Training loop for this model
    for epoch in range(1, num_epochs+1):
        model_b.train()  # evaluate() leaves the model in eval mode
        epoch_loss = 0.0
        pbar = tqdm.tqdm(subset_loader, desc=f"Model {b+1} Epoch {epoch}/{num_epochs}", ncols=80)
        for x, lengths, y in pbar:
            x, lengths, y = x.to(device), lengths.to(device), y.to(device)
            optimizer_b.zero_grad()
            # True marks padded positions (index >= that sequence's length).
            padding_mask = (torch.arange(x.size(1), device=device)
                            .unsqueeze(0)
                            .ge(lengths.unsqueeze(1)))
            with torch.amp.autocast(device_type=device.type):
                logits = model_b(x, src_key_padding_mask=padding_mask)
                # squeeze(-1), not squeeze(): a bare squeeze() would also drop
                # the batch dimension when a batch holds a single sample and
                # the loss would then see mismatched shapes.
                loss = criterion(logits.squeeze(-1), y)
            scaler_b.scale(loss).backward()
            scaler_b.step(optimizer_b)
            scaler_b.update()
            batch_loss = loss.item()
            epoch_loss += batch_loss
            pbar.set_postfix(loss=f"{batch_loss:.4f}")

        mean_train_loss = epoch_loss / max(len(subset_loader), 1)

        # Validate on the full validation loader
        val_loss, accuracy, precision, recall, f1 = evaluate(model_b, val_loader, device)
        print(f"Model {b+1} Epoch {epoch} - Val loss: {val_loss:.4f}, Acc: {accuracy:.4f}, P: {precision:.4f}, R: {recall:.4f}, F1: {f1:.4f}")
        history_b.append({
            'train_loss': mean_train_loss,
            'val_loss': val_loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        })

        if val_loss < best_val_loss_b:
            best_val_loss_b = val_loss
            # Deep copy: state_dict() hands back references to the live
            # parameter tensors, so a shallow dict copy would keep changing as
            # training continues and the snapshot would be lost.
            best_model_state_b = copy.deepcopy(model_b.state_dict())

    member_histories.append(history_b)

    # A member is only kept if at least one epoch produced a comparable
    # (non-NaN) validation loss.
    if best_model_state_b is not None:
        model_b.load_state_dict(best_model_state_b)
        ensemble_models.append(model_b)
        print(f"Model {b+1} saved with best validation loss: {best_val_loss_b:.4f}")
        if best_val_loss_b < best_val_loss:
            best_val_loss = best_val_loss_b
            best_model_state = best_model_state_b

# Collapse the per-member histories into one ensemble-mean curve per quantity.
for epoch_records in zip(*member_histories):
    n_members = len(epoch_records)
    train_losses.append(sum(r['train_loss'] for r in epoch_records) / n_members)
    val_losses.append(sum(r['val_loss'] for r in epoch_records) / n_members)
    val_metrics.append({
        key: sum(r[key] for r in epoch_records) / n_members
        for key in ('accuracy', 'precision', 'recall', 'f1')
    })

In [None]:
# New cell: Evaluate ensemble on the validation set and calculate aggregated F1 score
import numpy as np
from sklearn.metrics import f1_score

# Score the ensemble on the validation set: average the members' sigmoid
# probabilities, threshold at 0.5, and report the resulting F1.
if not ensemble_models:
    # Without this guard the mean over an empty list is NaN and the failure
    # only surfaces later as an unrelated-looking sklearn error.
    raise RuntimeError("ensemble_models is empty; train at least one ensemble member before evaluating.")

for model_b in ensemble_models:
    model_b.eval()

per_model_probs = [[] for _ in ensemble_models]  # one running list per member
all_labels = []

# A single pass over the loader serves every member and collects the ground
# truth as well; val_loader is built with shuffle=False, so the sample order is
# the same for all of them.
with torch.no_grad():
    for x, lengths, y in val_loader:
        x, lengths = x.to(device), lengths.to(device)
        # True marks padded positions (index >= that sequence's length).
        mask = (torch.arange(x.size(1), device=device)
                .unsqueeze(0)
                .ge(lengths.unsqueeze(1)))
        for probs_model, model_b in zip(per_model_probs, ensemble_models):
            logits = model_b(x, src_key_padding_mask=mask).squeeze(1)
            probs_model.extend(torch.sigmoid(logits).cpu().numpy())
        # reshape(-1) also covers a 0-dim label tensor.
        all_labels.extend(y.reshape(-1).cpu().numpy())

ensemble_probs = [np.array(member_probs) for member_probs in per_model_probs]

# Average the probabilities over the ensemble and apply threshold of 0.5
avg_probs = np.mean(ensemble_probs, axis=0)
bin_preds = (avg_probs >= 0.5).astype(int)

# Compute and print the F1 score for the ensemble on the validation set
ensemble_f1 = f1_score(all_labels, bin_preds, zero_division=0)
print("Validation Ensemble F1 Score:", ensemble_f1)

In [None]:
# Load best model for final evaluation
# Restore the best recorded weights into `model`, if any were recorded.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print("Loaded best model based on validation performance")

# Plot only as many epochs as were actually logged. Plotting against
# range(1, epochs+1) raises a shape-mismatch error whenever the history lists
# are empty or shorter than `epochs`.
n_logged = min(len(train_losses), len(val_losses), len(val_metrics))

if n_logged == 0:
    print("No per-epoch history was recorded; skipping the loss and metric plots.")
else:
    epoch_axis = range(1, n_logged + 1)
    fig, (ax_loss, ax_metric) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: training and validation loss
    ax_loss.plot(epoch_axis, train_losses[:n_logged], marker='o', linestyle='-', color='b', label='Training Loss')
    ax_loss.plot(epoch_axis, val_losses[:n_logged], marker='o', linestyle='-', color='r', label='Validation Loss')
    ax_loss.set_title('Loss Over Epochs')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()
    ax_loss.grid(True, linestyle='--', alpha=0.7)

    # Right panel: validation metrics
    metric_series = [
        ('accuracy', 'o', 'Accuracy'),
        ('precision', 's', 'Precision'),
        ('recall', '^', 'Recall'),
        ('f1', 'd', 'F1'),
    ]
    for key, marker, label in metric_series:
        ax_metric.plot(epoch_axis, [m[key] for m in val_metrics[:n_logged]], marker=marker, label=label)
    ax_metric.set_title('Validation Metrics')
    ax_metric.set_xlabel('Epoch')
    ax_metric.set_ylabel('Score')
    ax_metric.legend()
    ax_metric.grid(True, linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()

### Testing

In [None]:
# — inference on test split —
import pandas as pd
from torch.utils.data import DataLoader

# 1) build test dataset & loader
test_ds     = BankTxnDataset(cfg, split="test")
test_loader = DataLoader(
    test_ds,
    batch_size=cfg.parameter['batchSize'],
    shuffle=False,  # keep dataset order so predictions line up with the label frame
    num_workers=4,
    pin_memory=True,
    collate_fn=pad_collate_fn
)

# 2) run every ensemble member in eval mode and collect probabilities
import numpy as np

if not ensemble_models:
    # With no members the averaged prediction collapses to a scalar 0, which
    # pandas would broadcast into an all-zero predictions.csv without any
    # error. Fail loudly instead.
    raise RuntimeError("ensemble_models is empty; train the ensemble before running inference.")

for model_b in ensemble_models:
    model_b.eval()

per_model_probs = [[] for _ in ensemble_models]  # one running list per member

# A single pass over the test loader serves every member.
with torch.no_grad():
    for x, lengths, _ in test_loader:
        x, lengths = x.to(device), lengths.to(device)
        # True marks padded positions (index >= that sequence's length).
        mask = (torch.arange(x.size(1), device=device)
                .unsqueeze(0)
                .ge(lengths.unsqueeze(1)))
        for probs_b, model_b in zip(per_model_probs, ensemble_models):
            logits = model_b(x, src_key_padding_mask=mask).squeeze(1)
            probs_b.extend(torch.sigmoid(logits).cpu().numpy())

all_probs = [np.array(member_probs) for member_probs in per_model_probs]

# Average probabilities from all models and threshold for final prediction.
avg_probs = np.mean(all_probs, axis=0)
bin_preds = (avg_probs >= 0.5).astype(int)

# Map predictions back to account numbers and write to CSV.
# NOTE(review): assumes get_label() returns a DataFrame whose row order matches
# the order in which test_ds yields sequences — confirm against BankTxnDataset.
# The copy keeps the added column out of the dataset's own frame.
df_out = test_ds.get_label().copy()
if len(df_out) != len(bin_preds):
    raise ValueError(
        f"Prediction count ({len(bin_preds)}) does not match label rows ({len(df_out)})."
    )
df_out['prediction'] = bin_preds
df_out.to_csv('predictions.csv', index=False)
print("Wrote", len(df_out), "rows to predictions.csv")