### Import

In [2]:
import torch
from Dataset.bank_dataset import BankTxnDataset, pad_collate_fn
from Models.transformer import TransformerClassifier
from Config.config import load_config
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Loading Config

In [3]:
cfg = load_config()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Current device: \033[92m{device}\033[0m")

Current device: [92mcuda[0m


### Loading Dataset

In [4]:
train_ds = BankTxnDataset(cfg, split="train")
print(f"Total number of training data: \033[92m{len(train_ds.data)}\033[0m")

train_loader = DataLoader(
	train_ds,
	batch_size=cfg.parameter['batchSize'],
	shuffle=True,
	num_workers=4,
	pin_memory=True,                # speeds host→GPU copies
	collate_fn=pad_collate_fn
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Total number of training data: [92m140[0m




In [5]:
sample_batch = next(iter(train_loader))
x_sample = sample_batch[0]
print(f"Input tensor shape: {x_sample.shape}")
actual_feat_dim = x_sample.shape[2]
print(f"Feature dimension from data: {actual_feat_dim}")

Input tensor shape: torch.Size([32, 12, 1839])
Feature dimension from data: 1839


### Loading Model & Optimizer

In [6]:
model = TransformerClassifier(
	feat_dim=actual_feat_dim,
	d_model=cfg.parameter['d_model'],
	nhead=cfg.parameter['attention_head'],
	num_layers=cfg.parameter['num_layers'],
	num_classes=1
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=cfg.parameter['learningRate'])
scaler    = torch.cuda.amp.GradScaler()  # optional mixed‑precision

  scaler    = torch.cuda.amp.GradScaler()  # optional mixed‑precision


### Training

In [7]:
epochs = cfg.parameter['epochs']
print(f"Starting training for {epochs} epochs on {device}")
model.train()
# Create list to store average loss per epoch
avg_losses = []

for epoch in range(1, epochs+1):
        epoch_loss = 0.0
        pbar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", ncols=80)
        for x, lengths, y in pbar:
            x, lengths, y = x.to(device), lengths.to(device), y.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                logits = model(
                    x,
                    src_key_padding_mask=(torch.arange(x.size(1), device=device)
                                          .unsqueeze(0)
                                          .ge(lengths.unsqueeze(1)))
                )
                loss = torch.nn.functional.binary_cross_entropy_with_logits(
                    logits.squeeze(), y
                )
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            # update tqdm bar
            pbar.set_postfix(loss=f"{batch_loss:.4f}")

        avg_loss = epoch_loss / len(train_loader)
        avg_losses.append(avg_loss)  # Store the average loss
        print(f"Epoch {epoch} completed — avg loss: {avg_loss:.4f}")

# After training, plot the loss curve
plt.figure(figsize=(10, 5))
plt.plot(range(1, epochs+1), avg_losses, marker='o', linestyle='-', color='b')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(range(1, epochs+1))

# Add loss values as text annotations on the plot
for i, loss in enumerate(avg_losses):
    plt.text(i+1, loss, f'{loss:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

Starting training for 256 epochs on cuda


  with torch.cuda.amp.autocast():
Epoch 1/256: 100%|███████████████████| 5/5 [00:29<00:00,  5.86s/it, loss=0.0069]


Epoch 1 completed — avg loss: 0.2721


Epoch 2/256: 100%|███████████████████| 5/5 [00:28<00:00,  5.68s/it, loss=0.0025]


Epoch 2 completed — avg loss: 0.0035


Epoch 3/256: 100%|███████████████████| 5/5 [00:28<00:00,  5.63s/it, loss=0.0020]


Epoch 3 completed — avg loss: 0.0021


Epoch 4/256: 100%|███████████████████| 5/5 [00:28<00:00,  5.68s/it, loss=0.0016]


Epoch 4 completed — avg loss: 0.0017


Epoch 5/256: 100%|███████████████████| 5/5 [00:27<00:00,  5.60s/it, loss=0.0014]


Epoch 5 completed — avg loss: 0.0015


Epoch 6/256: 100%|███████████████████| 5/5 [00:28<00:00,  5.76s/it, loss=0.0013]


Epoch 6 completed — avg loss: 0.0014


Epoch 7/256:   0%|                                        | 0/5 [00:06<?, ?it/s]


KeyboardInterrupt: 