In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report Python/PyTorch versions, and seed the RNGs

import torch
import sys

# Select the compute device once, then report what was found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type != 'cuda':
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Interpreter and framework versions, one per line.
print(
    f"\n📦 Python {sys.version.split()[0]}",
    f"🔥 PyTorch {torch.__version__}",
    sep="\n",
)

# Reproducibility: seed Python, NumPy and PyTorch with one shared value.
import random
import numpy as np

SEED = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# CUDA generators are seeded separately, and only when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Automated Financial Report Reasoning with GRPO -- Implementation Notebook

## Setup and Data

This notebook implements the core components of the QuantaLedger Analytics GRPO training pipeline for financial reasoning.

In [None]:
import copy
import re
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# Later cells read this module-level `device` when moving models and tensors.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

## Section 3.1: Data Pipeline

In [None]:
class FinancialReasoningDataset:
    """
    Synthetic dataset of financial reasoning questions.
    Each question requires 1-3 steps of numerical computation.

    Every sample is a dict with the keys ``context``, ``question``,
    ``answer``, ``answer_type`` ('percentage' or 'dollar'), ``steps`` and
    ``qtype`` ('growth', 'margin', 'comparison' or 'projection').

    Ground-truth answers are computed from the *rounded* figures that appear
    in the context text, so the label is always derivable from what the
    reader (or model) is actually shown.

    Args:
        n_samples: Number of samples to generate.
        seed: Seed for the dataset's private random generator.
    """

    def __init__(self, n_samples=1000, seed=42):
        # A private RandomState produces the same stream as seeding the global
        # NumPy RNG with `seed`, but does not reset global random state as a
        # side effect of building a dataset.
        self._rng = np.random.RandomState(seed)
        self.samples = self._generate_samples(n_samples)
        print(f"Generated {len(self.samples)} financial reasoning samples")

    def _generate_samples(self, n):
        """Build ``n`` sample dicts, drawing every random value from ``self._rng``."""
        rng = self._rng
        samples = []
        for _ in range(n):
            qtype = rng.choice(['growth', 'margin', 'comparison', 'projection'])

            if qtype == 'growth':
                rev_old = round(rng.uniform(0.5, 10.0), 1)
                growth = round(rng.uniform(-0.2, 0.5), 2)
                rev_new = round(rev_old * (1 + growth), 1)

                context = (f"In Q3 2024, the company reported revenue of ${rev_new}B, "
                          f"compared to ${rev_old}B in Q3 2023.")
                question = "What was the year-over-year revenue growth rate?"
                # Grade against the growth implied by the two displayed
                # revenues. Rounding rev_new to one decimal can move the
                # implied rate far from the hidden `growth` parameter
                # (e.g. $0.5B -> $0.6B is 20%, whatever `growth` was).
                answer = round((rev_new / rev_old - 1) * 100, 1)
                answer_type = 'percentage'
                steps = 2

            elif qtype == 'margin':
                revenue = round(rng.uniform(1.0, 15.0), 1)
                margin_pct = round(rng.uniform(0.05, 0.40), 2)
                profit = round(revenue * margin_pct, 2)

                context = (f"The company generated revenue of ${revenue}B and "
                          f"operating profit of ${profit}B in the fiscal year.")
                question = "What was the operating margin?"
                # Same principle: the margin implied by the displayed profit
                # and revenue, not the unrounded generating parameter.
                answer = round(profit / revenue * 100, 1)
                answer_type = 'percentage'
                steps = 1

            elif qtype == 'comparison':
                q1 = round(rng.uniform(1.0, 5.0), 1)
                q2 = round(rng.uniform(1.0, 5.0), 1)
                q3 = round(rng.uniform(1.0, 5.0), 1)

                context = (f"Quarterly revenues: Q1=${q1}B, Q2=${q2}B, Q3=${q3}B.")
                question = "What is the total revenue for the first three quarters?"
                answer = round(q1 + q2 + q3, 1)
                answer_type = 'dollar'
                steps = 1

            else:  # projection
                current = round(rng.uniform(2.0, 10.0), 1)
                growth = round(rng.uniform(0.05, 0.25), 2)

                # `growth` has two decimals, so the whole-percent figure shown
                # here is the same rate used to compute the answer.
                context = (f"Current annual revenue is ${current}B with "
                          f"a {round(growth*100,0):.0f}% annual growth rate.")
                question = "What is the projected revenue next year?"
                answer = round(current * (1 + growth), 1)
                answer_type = 'dollar'
                steps = 2

            samples.append({
                'context': context,
                'question': question,
                'answer': answer,
                'answer_type': answer_type,
                'steps': steps,
                'qtype': qtype,
            })

        return samples

    def __len__(self):
        """Number of generated samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the sample dict at position ``idx``."""
        return self.samples[idx]

    def get_random(self):
        """Return one sample chosen uniformly at random from ``self.samples``.

        Draws from the dataset's private generator, so the sequence of picks
        is determined by the constructor's ``seed``.
        """
        return self.samples[self._rng.randint(len(self.samples))]

# Build the dataset used by the rest of the notebook
dataset = FinancialReasoningDataset(n_samples=500)

# Preview the first three samples, one field group per line
for idx in range(3):
    example = dataset[idx]
    print(f"Context: {example['context']}")
    print(f"Question: {example['question']}")
    print(f"Answer: {example['answer']} ({example['answer_type']})")
    print(f"Steps: {example['steps']}, Type: {example['qtype']}")
    print()

## Section 3.2: Exploratory Data Analysis

In [None]:
# Dataset distribution: sample counts by question type, step count and answer type
qtypes = [sample['qtype'] for sample in dataset.samples]
steps = [sample['steps'] for sample in dataset.samples]
atypes = [sample['answer_type'] for sample in dataset.samples]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One entry per panel: (values, title, bar colours, stringify the x labels?)
panels = [
    (qtypes, "Question Types", ['#4a90d9', '#50c878', '#ff6b6b', '#ffd700'], False),
    (steps, "Reasoning Steps Required", '#4a90d9', True),
    (atypes, "Answer Types", ['#50c878', '#ff6b6b'], False),
]

for ax, (values, title, colors, stringify) in zip(axes, panels):
    labels, counts = np.unique(values, return_counts=True)
    if stringify:
        # Step counts are integers; plot them as categorical labels.
        labels = [str(s) for s in labels]
    ax.bar(labels, counts, color=colors)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel("Count")

plt.tight_layout()
plt.savefig("dataset_analysis.png", dpi=150, bbox_inches='tight')
plt.show()

## Section 3.3: Model and Tokenizer

In [None]:
class CharTokenizer:
    """Character-level tokenizer for financial expressions.

    Id 0 is reserved for padding. Characters outside the vocabulary are also
    encoded as 0, so decode() drops them.
    """

    def __init__(self):
        # Base alphabet. Its order fixes ids 1..N, so it must not be reordered.
        chars = list("0123456789.+-*/%=$B QqWwEeRrTtYyUuIiOoPpAaSsDdFfGgHhJjKkLlZzXxCcVvNnMm,:<>()\"'")
        # Lowercase 'b' and '?' are absent from the base alphabet, yet every
        # question in the prompts ends with '?'. Append them at the end so
        # all existing ids keep their values.
        chars += [c for c in "b?" if c not in chars]
        self.char_to_id = {c: i+1 for i, c in enumerate(chars)}  # 0 = padding
        self.id_to_char = {i+1: c for i, c in enumerate(chars)}
        self.vocab_size = len(chars) + 1  # +1 for padding
        self.pad_id = 0

    def encode(self, text, max_len=None):
        """Map ``text`` to a list of token ids, one per character.

        Args:
            text: String to encode.
            max_len: If not None, keep at most this many leading ids.

        Returns:
            List of int ids; unknown characters map to ``pad_id``.
        """
        ids = [self.char_to_id.get(c, self.pad_id) for c in text]
        # Compare against None so that max_len=0 truncates to an empty list
        # instead of being treated as "no limit".
        if max_len is not None:
            ids = ids[:max_len]
        return ids

    def decode(self, ids):
        """Map ids back to text, skipping padding (ids <= 0).

        Positive ids that are not in the vocabulary are rendered as '?'.
        """
        return ''.join(self.id_to_char.get(i, '?') for i in ids if i > 0)

# Shared tokenizer instance; later cells read `tokenizer.vocab_size` to size the model.
tokenizer = CharTokenizer()
print(f"Vocab size: {tokenizer.vocab_size}")

# Round-trip smoke test: every character of this string is in the vocabulary,
# so decoding the encoded ids should print the original text unchanged.
test = "Revenue: $2.3B"
encoded = tokenizer.encode(test)
decoded = tokenizer.decode(encoded)
print(f"Encode '{test}': {encoded[:10]}...")  # only the first 10 ids are shown
print(f"Decode back: '{decoded}'")

In [None]:
class FinancialTransformer(nn.Module):
    """Small causal transformer language model for financial reasoning.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        d_model: Embedding / hidden width.
        n_heads: Attention heads per layer.
        n_layers: Number of stacked encoder layers.
        max_len: Longest sequence the learned positional embedding supports.
    """

    def __init__(self, vocab_size, d_model=128, n_heads=4, n_layers=3, max_len=200):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_len, d_model)
        self.max_len = max_len

        layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads,
            dim_feedforward=256, dropout=0.1, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits of shape (B, T, vocab_size) for token ids ``x`` of shape (B, T).

        A causal mask is applied, so position t only attends to positions <= t.

        Raises:
            ValueError: If T exceeds ``max_len`` (no positional embedding
                exists for the extra positions).
        """
        _, T = x.shape
        if T > self.max_len:
            # Fail early with a clear message; otherwise the sum of token and
            # positional embeddings fails on mismatched sequence lengths.
            raise ValueError(
                f"Sequence length {T} exceeds the model's max_len of {self.max_len}"
            )
        pos = torch.arange(T, device=x.device).unsqueeze(0)
        mask = nn.Transformer.generate_square_subsequent_mask(T).to(x.device)
        h = self.embedding(x) + self.pos_embedding(pos)
        h = self.transformer(h, mask=mask, is_causal=True)
        return self.head(h)

    def generate(self, prompt_ids, max_new_tokens=30, temperature=1.0):
        """Autoregressively extend ``prompt_ids`` (shape (B, T)) one token at a time.

        Args:
            prompt_ids: Long tensor of prompt token ids; it is not modified.
            max_new_tokens: Upper bound on the number of tokens appended.
            temperature: Softmax temperature for sampling. ``0`` selects the
                highest-scoring token at each step (greedy decoding) instead
                of dividing by zero.

        Returns:
            Tensor holding the prompt followed by the generated tokens.
            Generation stops early once the sequence reaches ``max_len``.

        This method does not change train/eval mode or disable gradients;
        callers are responsible for both.
        """
        generated = prompt_ids.clone()
        for _ in range(max_new_tokens):
            if generated.shape[1] >= self.max_len:
                break
            next_logits = self.forward(generated)[:, -1, :]
            if temperature == 0:
                next_token = next_logits.argmax(dim=-1, keepdim=True)
            else:
                probs = F.softmax(next_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=1)
        return generated

# Create the trainable policy and a frozen reference copy of it
torch.manual_seed(42)
model = FinancialTransformer(tokenizer.vocab_size, d_model=128, n_heads=4, n_layers=3).to(device)
ref_model = FinancialTransformer(tokenizer.vocab_size, d_model=128, n_heads=4, n_layers=3).to(device)
ref_model.load_state_dict(model.state_dict())
ref_model.requires_grad_(False)
# The reference is only ever used for inference. Modules start in training
# mode, so without eval() its dropout layers would add noise to the reference
# log-probabilities used in the KL penalty.
ref_model.eval()

n_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {n_params:,}")

## Section 3.4: Reward Function

In [None]:
def extract_answer(text):
    """Extract the numerical answer from model output.

    Looks first for an explicit ``Answer: <number>`` tag (an optional ``$``
    before the number is allowed). If there is none, falls back to the last
    number that appears anywhere in the text.

    Numbers may carry a leading sign, so negative growth rates are parsed
    correctly. In the fallback, a ``-``/``+`` is only treated as a sign when
    it does not directly follow a word character, ``.`` or ``)``, so
    ``"5-3"`` yields ``3.0`` rather than ``-3.0``. A trailing sentence period
    (``"21.1."``) is not part of the number.

    Args:
        text: Generated text to search.

    Returns:
        The parsed value as a float, or None when the text has no number.
    """
    # Digits with an optional fractional part, or a bare fraction like ".5".
    # Everything this pattern matches is accepted by float().
    number = r'(?:\d+(?:\.\d+)?|\.\d+)'

    tagged = re.search(r'Answer:\s*\$?\s*([-+]?' + number + r')', text)
    if tagged:
        return float(tagged.group(1))

    # Fallback: the last number anywhere in the text.
    candidates = re.findall(r'(?:(?<![\w.)])[-+])?' + number, text)
    if candidates:
        return float(candidates[-1])
    return None

def compute_reward(generated_text, ground_truth, answer_type, format_weight=0.1):
    """
    Compute verifiable reward for financial reasoning.

    The reward is the sum of two parts:
      * correctness: 1.0 when the number returned by extract_answer() is
        within the tolerance for ``answer_type`` of ``ground_truth``
        (strictly less than 0.5 for 'percentage', 0.1 for 'dollar',
        0.01 for anything else); 0.0 otherwise or when no number is found.
      * format: ``format_weight`` when the text contains '<think>',
        '</think>' and 'Answer:'; 0.0 otherwise.
    """
    # Per-type absolute tolerance; unknown types get the strictest one.
    tolerance = {'percentage': 0.5, 'dollar': 0.1}.get(answer_type, 0.01)

    predicted = extract_answer(generated_text)
    is_correct = predicted is not None and abs(predicted - ground_truth) < tolerance
    correct = 1.0 if is_correct else 0.0

    # All three markers must be present; their order is not checked.
    required_markers = ('<think>', '</think>', 'Answer:')
    well_formed = all(marker in generated_text for marker in required_markers)
    format_score = 1.0 if well_formed else 0.0

    return correct + format_weight * format_score

# Smoke test of the reward on three hand-written completions (truth = 21.1%)
# Correct answer with <think> tags and 'Answer:' -> 1.0 + 0.1 format bonus = 1.1
print(compute_reward("<think>Growth = 21.1%</think> Answer: 21.1", 21.1, 'percentage'))
# Off by 0.9 (outside the 0.5 tolerance) and no <think> tags -> 0.0
print(compute_reward("Answer: 22.0", 21.1, 'percentage'))
# Correct number found by the fallback, but no format bonus -> 1.0
print(compute_reward("I think 21.1", 21.1, 'percentage'))

## Section 3.5: GRPO Training

In [None]:
def compute_grpo_advantages(rewards):
    """Standardise a group of rewards into group-relative advantages.

    Args:
        rewards: 1-D tensor holding the rewards of one prompt's completions.

    Returns:
        ``(rewards - mean) / std`` with the same shape as ``rewards``. An
        all-zero tensor is returned when the group carries no learning
        signal: fewer than two rewards, or (near-)identical rewards.
    """
    # The sample standard deviation of fewer than two values is NaN. NaN
    # fails the `< 1e-8` check below and would spread into the loss, so
    # handle tiny groups up front.
    if rewards.numel() < 2:
        return torch.zeros_like(rewards)

    std_r = rewards.std()
    if std_r < 1e-8:
        return torch.zeros_like(rewards)
    return (rewards - rewards.mean()) / std_r

def grpo_step(model, ref_model, tokenizer, dataset, optimizer, G=8, epsilon=0.2, beta=0.04):
    """One GRPO training step on a single randomly drawn prompt.

    Samples ``G`` completions from ``model``, scores each with
    compute_reward(), turns the rewards into group-relative advantages and
    takes one optimizer step on the clipped surrogate objective with a KL
    penalty toward ``ref_model``.

    Args:
        model: Policy being trained; must provide ``generate``.
        ref_model: Frozen reference policy for the KL penalty.
        tokenizer: Provides ``encode`` / ``decode``.
        dataset: Provides ``get_random()`` returning a sample dict.
        optimizer: Optimizer over ``model``'s parameters.
        G: Number of completions sampled for the prompt.
        epsilon: Clipping range for the probability ratio.
        beta: Weight of the KL penalty.

    Returns:
        ``(loss, mean_reward)`` as Python floats. ``loss`` is 0.0 and no
        update is made when all advantages are zero or there are no
        generated tokens to train on.
    """
    model.train()
    # Reference log-probs must be deterministic: keep its dropout disabled
    # regardless of the mode the caller left it in.
    ref_model.eval()
    # Place tensors where the policy lives instead of relying on a global.
    run_device = next(model.parameters()).device

    sample = dataset.get_random()
    prompt_text = f"{sample['context']} Question: {sample['question']} "
    prompt_ids = torch.tensor([tokenizer.encode(prompt_text, max_len=150)]).to(run_device)
    prompt_len = prompt_ids.shape[1]

    # Generate G completions
    # NOTE(review): sampling happens in train mode, so dropout is active
    # while generating — confirm this is intended.
    completions = []
    with torch.no_grad():
        for _ in range(G):
            comp = model.generate(prompt_ids, max_new_tokens=30, temperature=1.0)
            completions.append(comp)

    # Compute rewards on the generated part only (prompt stripped)
    rewards = []
    for comp in completions:
        text = tokenizer.decode(comp[0, prompt_len:].tolist())
        r = compute_reward(text, sample['answer'], sample['answer_type'])
        rewards.append(r)
    rewards = torch.tensor(rewards, dtype=torch.float32).to(run_device)

    advantages = compute_grpo_advantages(rewards)

    # No learning signal when every completion scored the same
    if advantages.abs().sum() < 1e-8:
        return 0.0, rewards.mean().item()

    # Pad completions to a common length and build the loss mask
    max_len = max(c.shape[1] for c in completions)
    padded = torch.zeros(G, max_len, dtype=torch.long).to(run_device)
    mask = torch.zeros(G, max(1, max_len - 1)).to(run_device)

    for i, comp in enumerate(completions):
        L = comp.shape[1]
        padded[i, :L] = comp[0]
        if max_len > 1:
            # After the one-token shift below, position j scores token j+1.
            # Generated tokens occupy indices prompt_len..L-1, so their
            # log-probs sit at positions prompt_len-1..L-2. Starting the mask
            # at prompt_len would leave the first generated token out.
            mask[i, max(prompt_len - 1, 0):max(L - 1, 0)] = 1.0

    if mask.sum() < 1:
        return 0.0, rewards.mean().item()

    # Log-probability of each actual next token under the current policy
    logits = model(padded[:, :-1])
    log_probs = F.log_softmax(logits, dim=-1)
    targets = padded[:, 1:]
    token_log_probs = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # One update per sampled group: the "old" policy is the current one,
        # so the ratio below equals 1 in value and only carries gradient.
        old_token_log_probs = token_log_probs.detach()
        ref_logits = ref_model(padded[:, :-1])
        ref_log_probs = F.log_softmax(ref_logits, dim=-1)
        ref_token_log_probs = ref_log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)

    # GRPO loss: clipped surrogate objective
    ratio = torch.exp(token_log_probs - old_token_log_probs)
    adv = advantages.unsqueeze(1)
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1-epsilon, 1+epsilon) * adv
    policy_obj = torch.min(surr1, surr2)

    # KL penalty r - log(r) - 1 with r = p_ref / p_policy. Working from the
    # log-ratio avoids a log(exp(.)) round trip.
    log_kl_ratio = ref_token_log_probs - token_log_probs
    kl = torch.exp(log_kl_ratio) - log_kl_ratio - 1.0

    # Average over each response's generated tokens, then over the group
    per_token = policy_obj - beta * kl
    token_mask = mask[:, :per_token.shape[1]]
    per_response = (per_token * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)
    loss = -per_response.mean()

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    return loss.item(), rewards.mean().item()

print("GRPO training step ready!")

## Section 3.6: Training Loop

In [None]:
# Full GRPO training loop.
# Train on the first N_TRAIN samples only. The evaluation cell reads dataset
# indices 400 and above, which stay unseen only if training never draws them.
N_TRAIN = 400
train_dataset = copy.copy(dataset)  # shallow copy: shares the sample dicts
train_dataset.samples = dataset.samples[:N_TRAIN]

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

n_steps = 300
log_every = 30
losses = []
rewards_history = []

print("Training with GRPO...")
start = time.time()

for step in range(n_steps):
    loss, mean_reward = grpo_step(model, ref_model, tokenizer, train_dataset, optimizer, G=8)
    losses.append(loss)
    rewards_history.append(mean_reward)

    # Report running averages over the most recent `log_every` steps
    if (step + 1) % log_every == 0:
        avg_loss = np.mean(losses[-log_every:])
        avg_reward = np.mean(rewards_history[-log_every:])
        print(f"Step {step+1:4d} | Loss: {avg_loss:.4f} | Reward: {avg_reward:.3f} | Time: {time.time()-start:.0f}s")

print("Training complete!")

## Section 3.7: Results and Error Analysis

In [None]:
# Smoothed training curves: loss on the left, mean reward on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

window = 20
if len(losses) > window:
    # Moving-average kernel shared by both curves
    kernel = np.ones(window) / window
    curves = [
        (losses, "Training Loss", 'red'),
        (rewards_history, "Mean Reward", 'blue'),
    ]
    for ax, (series, title, color) in zip(axes, curves):
        ax.plot(np.convolve(series, kernel, mode='valid'), linewidth=2, color=color)
        ax.set_title(title, fontweight='bold')
        ax.set_xlabel("Step")
        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("financial_grpo_training.png", dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Evaluate on 20 samples from the tail of the dataset
print("Evaluation on 20 test samples:")
print("=" * 60)

# grpo_step() leaves the model in train mode; switch to eval so dropout does
# not perturb the predictions being scored.
model.eval()

correct = 0
total = 20
for i in range(total):
    # Indices 400-419 are the evaluation slice. They are only truly unseen
    # if training sampled exclusively from indices below 400.
    sample = dataset[i + 400]
    prompt = f"{sample['context']} Question: {sample['question']} "
    prompt_ids = torch.tensor([tokenizer.encode(prompt, max_len=150)]).to(device)

    with torch.no_grad():
        output = model.generate(prompt_ids, max_new_tokens=30, temperature=0.1)

    # Score only the generated continuation, not the prompt
    generated = tokenizer.decode(output[0, prompt_ids.shape[1]:].tolist())
    r = compute_reward(generated, sample['answer'], sample['answer_type'])
    # A reward of at least 1.0 means the numeric answer was judged correct
    is_correct = r >= 1.0
    correct += int(is_correct)

    status = "OK" if is_correct else "MISS"
    print(f"  [{status}] Q: {sample['question'][:40]}... A: {sample['answer']} | Gen: {generated[:30]}")

print(f"\nAccuracy: {correct}/{total} = {correct/total:.0%}")

## Section 3.8-3.9: Deployment and Ethics

In [None]:
# Deployment readiness checklist and ethics notes, printed as plain text
checklist_lines = [
    "[ ] Model quantized to INT8",
    "[ ] Latency < 2 seconds on A100",
    "[ ] Accuracy > 75% on test set",
    "[ ] KL divergence < 5.0",
    "[ ] Error analysis documented",
    "[ ] Monitoring dashboards configured",
    "[ ] A/B test plan approved",
    "[ ] Ethics review completed",
]
ethics_lines = [
    "  1. Hallucination: model may generate plausible but wrong numbers",
    "  2. Overconfidence: no uncertainty quantification",
    "  3. Regulatory: not a substitute for human financial analysis",
    "  4. Bias: performance may vary across industries",
]

print("=" * 50)
print("Deployment Readiness Checklist")
print("=" * 50)
print()
for line in checklist_lines:
    print(line)
print()
print("Ethics Considerations:")
for line in ethics_lines:
    print(line)