# Notebook 3: Train Transformer

## Goals
1. Load `training_data.jsonl`
2. Prepare Label Mappings (Unique Output -> ID)
3. Split Data by **Company** (Train/Val) — *currently disabled: this notebook trains on all data and evaluates on the same rows*
4. Tokenize Inputs using the `FinBERT` tokenizer
5. Fine-tune Model (Sequence Classification head)
6. Evaluate & Save Best Model

In [12]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [13]:
# Load Data
# The file is JSON Lines: one JSON object per line. Each object becomes one row of `df`.
data_path = Path('../data/training_data.jsonl')

with open(data_path, 'r', encoding='utf-8') as f:
    # Skip blank lines (for example an empty line at the end of the file);
    # json.loads would raise JSONDecodeError on them.
    raw_data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(raw_data)
print(f"Loaded {len(df)} examples.")
df.head()

Loaded 2009 examples.


Unnamed: 0,input,output,metadata
0,[<START>] [<START>] [current_assets] [Cash and...,cash_and_equivalents,"{'source': 'AAPL.csv', 'is_calculated': False}"
1,[<START>] [Cash and cash equivalents] [current...,short_term_investments,"{'source': 'AAPL.csv', 'is_calculated': False}"
2,[Cash and cash equivalents] [Marketable securi...,accounts_receivable,"{'source': 'AAPL.csv', 'is_calculated': False}"
3,[Marketable securities (current)] [Accounts re...,other_current_assets,"{'source': 'AAPL.csv', 'is_calculated': False}"
4,"[Accounts receivable, net] [Vendor non-trade r...",inventory,"{'source': 'AAPL.csv', 'is_calculated': False}"


In [14]:
# Create Label Mappings
# Our target is the 'output' column. IDs are assigned in order of first appearance
# in `df`, so the mapping only stays the same while the row order of the input
# file is unchanged.
labels = df['output'].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

num_labels = len(labels)
print(f"Total unique labels: {num_labels}")

# Save mappings for inference later.
# Create the target directory first: on a fresh checkout `../models` may not
# exist yet, and open(..., 'w') does not create missing parent directories.
label_map_path = Path('../models/label_map.json')
label_map_path.parent.mkdir(parents=True, exist_ok=True)
with open(label_map_path, 'w', encoding='utf-8') as f:
    json.dump(label2id, f, indent=2)

print("Sample labels:", labels[:5])

Total unique labels: 148
Sample labels: ['cash_and_equivalents', 'short_term_investments', 'accounts_receivable', 'other_current_assets', 'inventory']


In [15]:
# Train/validation split.
#
# VAL_COMPANY_FRACTION = 0.0 (default) trains on ALL rows and reuses the same rows
# as the "validation" set. In that mode the validation loss/accuracy only measure
# how well the model fits the training data; they say nothing about generalisation.
#
# Set VAL_COMPANY_FRACTION to a value in (0, 1) to hold out whole companies for
# validation instead, so no company contributes rows to both sets.
# Assumes every row's `metadata` is a dict with a 'source' key identifying the
# company file (as in the preview of `df`) — verify against the data.
VAL_COMPANY_FRACTION = 0.0
SPLIT_SEED = 42  # seed for the company shuffle so the split is reproducible

if VAL_COMPANY_FRACTION > 0:
    company = df['metadata'].map(lambda meta: meta['source'])
    companies = sorted(company.unique())  # sorted so the seeded draw is order-independent
    n_val = max(1, round(len(companies) * VAL_COMPANY_FRACTION))
    if n_val >= len(companies):
        raise ValueError(
            f"VAL_COMPANY_FRACTION={VAL_COMPANY_FRACTION} would leave no companies for training "
            f"({len(companies)} companies available)."
        )
    rng = np.random.default_rng(SPLIT_SEED)
    val_companies = set(rng.choice(companies, size=n_val, replace=False).tolist())
    in_val = company.isin(val_companies)
    train_df = df[~in_val].copy()
    val_df = df[in_val].copy()
    print(f"Holding out {n_val} of {len(companies)} companies for validation.")
    print(f"Train size: {len(train_df)} rows")
    print(f"Val size: {len(val_df)} rows")
else:
    print("Using ALL data for training.")
    train_df = df.copy()
    val_df = df.copy()  # same rows as train: evaluation is on seen data only
    print(f"Train size: {len(train_df)} rows")
    print(f"Val size: {len(val_df)} rows (Same as train)")

Using ALL data for training.
Train size: 2009 rows
Val size: 2009 rows (Same as train)


In [16]:
# Custom Dataset
class FinancialDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class-label ids.

    Args:
        encodings: mapping from feature name to a per-example sequence
            (indexed by example position).
        labels: per-example sequence of label ids, aligned with `encodings`.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset has one example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [17]:
# CHANGED: Use FinBERT pre-trained model
model_name = 'yiyanghkust/finbert-pretrain'
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Both splits are tokenized with the same settings: truncate to at most 128
# tokens and pad within each call.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_df['input'].tolist(), **tokenize_kwargs)
val_encodings = tokenizer(val_df['input'].tolist(), **tokenize_kwargs)

# Convert label strings to integer IDs (KeyError on a label missing from label2id)
train_labels = list(map(label2id.__getitem__, train_df['output']))
val_labels = list(map(label2id.__getitem__, val_df['output']))

# Wrap encodings and labels so they can be indexed example by example
train_dataset = FinancialDataset(train_encodings, train_labels)
val_dataset = FinancialDataset(val_encodings, val_labels)

In [18]:
# CHANGED: Use BertForSequenceClassification
# Pass the label mappings along with num_labels so they are stored in the model
# config. The saved model then carries the real label names instead of the
# generic LABEL_0, LABEL_1, ... placeholders.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model.to(device)

# Metrics function
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: object exposing `predictions` (the argmax is taken over its last
            axis) and `label_ids` (the reference label ids).

    Returns:
        dict with a single 'accuracy' entry.
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_ids)}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# CHANGED for FinBERT
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best eval accuracy when training finishes.
training_args = TrainingArguments(
    output_dir='../models/results',
    num_train_epochs=15,
    per_device_train_batch_size=8,    # Lower batch size to fit in 8GB VRAM
    per_device_eval_batch_size=16,    # Lower eval batch size just in case
    gradient_accumulation_steps=2,    # Simulate batch size of 16 (8 * 2)
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # TrainingArguments fail on the CPU fallback chosen in the device check.
    fp16=torch.cuda.is_available(),
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='../models/logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # One checkpoint per epoch would otherwise leave 15 full checkpoints on disk.
    # With load_best_model_at_end the best checkpoint is always retained.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [20]:
# Train!
# Runs the fine-tuning loop configured on `trainer`. As the last expression in the
# cell, the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.0527,2.727574,0.50224
2,1.3211,1.020435,0.84221
3,0.7544,0.498801,0.919861
4,0.3553,0.299192,0.953211
5,0.2907,0.214033,0.967646
6,0.1764,0.139974,0.986063
7,0.127,0.100744,0.99104
8,0.1109,0.075339,0.99552
9,0.0727,0.055033,0.997013
10,0.0639,0.042098,0.999502


TrainOutput(global_step=1890, training_loss=0.576724591835466, metrics={'train_runtime': 294.4753, 'train_samples_per_second': 102.335, 'train_steps_per_second': 6.418, 'total_flos': 899367641539920.0, 'train_loss': 0.576724591835466, 'epoch': 15.0})

In [21]:
# Save Final Model
# The model and the tokenizer are written into the same directory so that a
# single path is enough to reload both.
save_path = Path('../models/financial_transformer')
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model saved to {save_path}")

Model saved to ..\models\financial_transformer


In [22]:
# ==========================================
# OPTIONAL: POST-TRAINING QUANTIZATION
# ==========================================

import torch.quantization
print("Quantizing model...")

# Step 1: reload the fine-tuned model from the directory it was saved to and
# move it to the CPU before quantizing.
# CHANGED: Use BertForSequenceClassification instead of DistilBert
model_to_quantize = BertForSequenceClassification.from_pretrained('../models/financial_transformer')
model_to_quantize.to('cpu')

# Step 2: dynamic quantization, restricted to torch.nn.Linear modules, with
# int8 (qint8) as the target dtype.
quantized_model = torch.quantization.quantize_dynamic(
    model_to_quantize,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# Step 3: persist the quantized model's state_dict.
# NOTE(review): only the state_dict is saved, so reloading presumably requires
# rebuilding the model and re-applying quantize_dynamic before load_state_dict
# — confirm against the inference code.
quant_save_path = Path('../models/financial_transformer_quantized.pt')
torch.save(quantized_model.state_dict(), quant_save_path)

# Step 4: compare on-disk sizes. Prefer 'model.safetensors' and fall back to
# 'pytorch_model.bin' when that file is absent.
weight_file_candidates = (
    Path('../models/financial_transformer/model.safetensors'),
    Path('../models/financial_transformer/pytorch_model.bin'),
)
if weight_file_candidates[0].exists():
    model_file = weight_file_candidates[0]
else:
    model_file = weight_file_candidates[1]

BYTES_PER_MB = 1024 * 1024
original_size = model_file.stat().st_size / BYTES_PER_MB
quantized_size = quant_save_path.stat().st_size / BYTES_PER_MB

print(f"✓ Quantized model saved to: {quant_save_path}")
print(f"Original Size:  {original_size:.2f} MB")
print(f"Quantized Size: {quantized_size:.2f} MB")
print(f"Compression:    {original_size / quantized_size:.1f}x smaller")

Quantizing model...
✓ Quantized model saved to: ..\models\financial_transformer_quantized.pt
Original Size:  419.13 MB
Quantized Size: 174.23 MB
Compression:    2.4x smaller
