# Envis Insight Engine - Training Notebook

This notebook trains the Envis Insight Engine model on the labelled dataset.

**Requirements:**
- Google Colab Pro (for GPU access)
- A100 or V100 GPU runtime
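- `financial_data_8k.csv` uploaded (it must contain the `text`, `distress`, `framing`, and `tension` columns that the cells below read)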

**Expected Training Time:** ~6-8 hours on A100

## 1. Setup Environment

In [None]:
# Check GPU: print the NVIDIA driver/GPU status table so a GPU runtime can be
# confirmed before the long training run is started.
!nvidia-smi

In [None]:
# Install dependencies
# NOTE(review): torch-geometric and tensorboard are installed here but are not
# imported in any of the visible cells — confirm they are still required.
!pip install torch transformers torch-geometric pandas scikit-learn tensorboard tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, mean_absolute_error
from tqdm import tqdm
import json
from datetime import datetime
import os

# Report the runtime: library version, CUDA visibility and, if present, the
# name of GPU 0.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

## 2. Upload and Load Data

In [None]:
# Upload your CSV file
# files.upload() opens Colab's interactive file picker. Its return value is
# kept in `uploaded` but is not read again in the visible cells; the next cell
# reads the file from the working directory by name.
from google.colab import files
uploaded = files.upload()  # Select financial_data_8k.csv

In [None]:
# Read the uploaded CSV and summarise it: size, schema and label balance.
df = pd.read_csv('financial_data_8k.csv')

record_count = len(df)
column_names = list(df.columns)
print(f"Total records: {record_count:,}")
print(f"Columns: {column_names}")

# Value counts of the two categorical target columns.
for heading, column in (("Distress", "distress"), ("Framing", "framing")):
    print(f"\n{heading} distribution:")
    print(df[column].value_counts())

In [None]:
# Encode framing labels as integer class indices (position in FRAMING_CLASSES)
FRAMING_CLASSES = ['supportive', 'direct', 'celebratory', 'gentle', 'urgent']
framing_to_idx = {f: i for i, f in enumerate(FRAMING_CLASSES)}
df['framing_idx'] = df['framing'].map(framing_to_idx)

# Series.map() yields NaN for any value that is not a key of the mapping.
# Fail fast here with a readable message instead of letting NaN indices reach
# the long-typed target tensors built later from this column.
unmapped = df['framing_idx'].isna()
if unmapped.any():
    unknown_labels = sorted(
        df.loc[unmapped, 'framing'].dropna().astype(str).unique()
    )
    raise ValueError(
        f"{int(unmapped.sum())} rows have a missing or unknown framing label "
        f"(unknown values: {unknown_labels}); expected one of {FRAMING_CLASSES}"
    )
# Every row is mapped at this point, so the column can be pinned to integers.
df['framing_idx'] = df['framing_idx'].astype('int64')

# Split data (80/10/10): hold out 20%, then halve the hold-out into val/test.
# Both splits are stratified on the distress label and use a fixed seed.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['distress'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['distress'])

print(f"Train: {len(train_df):,}")
print(f"Val: {len(val_df):,}")
print(f"Test: {len(test_df):,}")

## 3. Define Dataset and Model

In [None]:
# Load FinBERT tokenizer (the 'ProsusAI/finbert' checkpoint name is resolved by
# Hugging Face's AutoTokenizer)
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

class FinancialDataset(Dataset):
    """Map-style dataset that tokenizes one dataframe row per item.

    Args:
        dataframe: Table providing the ``text``, ``distress``, ``framing_idx``
            and ``tension`` columns. Its index is reset, so items are
            addressed by position 0..len-1.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` and
            expected to return a mapping with ``input_ids`` and
            ``attention_mask`` tensors that carry a leading batch dimension
            of size 1.
        max_length: Sequence length passed to the tokenizer for truncation
            and padding.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and the three targets for row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            removed), ``distress`` and ``tension`` as float tensors, and
            ``framing`` as a long tensor.
        """
        row = self.data.iloc[idx]

        # Tokenize text
        encoding = self.tokenizer(
            row['text'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the leading batch dimension. A bare
            # squeeze() would also collapse a sequence dimension of size 1
            # and produce 0-d tensors that cannot be batched.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'distress': torch.tensor(row['distress'], dtype=torch.float),
            'framing': torch.tensor(row['framing_idx'], dtype=torch.long),
            'tension': torch.tensor(row['tension'], dtype=torch.float),
        }

# Wrap each split in a FinancialDataset (default max_length)
train_dataset, val_dataset, test_dataset = (
    FinancialDataset(split_df, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

# Batch the datasets; only the training loader shuffles
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=BATCH_SIZE)
    for held_out in (val_dataset, test_dataset)
)

for split_name, loader in (("Train", train_loader), ("Val", val_loader)):
    print(f"{split_name} batches: {len(loader)}")

In [None]:
# Adapter module for parameter-efficient fine-tuning
class Adapter(nn.Module):
    """Residual bottleneck block: project down, apply GELU, project back up.

    The result is added to the input, so the output keeps the input's
    trailing dimension (``input_dim``).
    """

    def __init__(self, input_dim, bottleneck_dim=64):
        super().__init__()
        self.down = nn.Linear(in_features=input_dim, out_features=bottleneck_dim)
        self.up = nn.Linear(in_features=bottleneck_dim, out_features=input_dim)
        self.act = nn.GELU()

    def forward(self, x):
        """Return ``x`` plus the bottleneck transformation of ``x``."""
        compressed = self.down(x)
        activated = self.act(compressed)
        correction = self.up(activated)
        return x + correction


class EnvisInsightEngine(nn.Module):
    """
    Simplified Envis Insight Engine for proof-of-concept training.

    A frozen FinBERT encoder produces a [CLS] embedding. That embedding is
    passed through a stack of trainable residual adapters and then through
    three prediction heads (distress, framing, tension). Only the adapters,
    the heads and ``log_vars`` receive gradients.

    Args:
        num_framing_classes: Number of logits produced by the framing head.
        adapter_dim: Bottleneck width of every adapter.
    """

    # Width of the hidden layer inside every prediction head.
    _HEAD_HIDDEN_DIM = 256

    def __init__(self, num_framing_classes=5, adapter_dim=64):
        super().__init__()

        # Load FinBERT (frozen)
        self.bert = AutoModel.from_pretrained('ProsusAI/finbert')
        for param in self.bert.parameters():
            param.requires_grad = False

        # Read the sizes from the encoder's own config rather than
        # hard-coding them, so the adapters and heads always match the
        # encoder's output width (768 and 12 for FinBERT).
        hidden_size = self.bert.config.hidden_size
        num_adapters = self.bert.config.num_hidden_layers

        # One adapter per encoder layer *in count only*: the adapters are not
        # inserted into the encoder; forward() applies them one after another
        # to the final [CLS] embedding.
        self.adapters = nn.ModuleList([
            Adapter(hidden_size, adapter_dim) for _ in range(num_adapters)
        ])

        # Prediction heads. Distress and tension end in a Sigmoid and emit
        # probabilities; framing emits raw logits.
        self.distress_head = self._build_head(hidden_size, 1, squash=True)
        self.framing_head = self._build_head(hidden_size, num_framing_classes, squash=False)
        self.tension_head = self._build_head(hidden_size, 1, squash=True)

        # Uncertainty weights for multi-task learning (one per task, in the
        # order distress, framing, tension)
        self.log_vars = nn.Parameter(torch.zeros(3))

    @staticmethod
    def _build_head(in_features, out_features, squash):
        """Build a Linear-ReLU-Dropout-Linear head; ``squash`` appends a Sigmoid."""
        layers = [
            nn.Linear(in_features, EnvisInsightEngine._HEAD_HIDDEN_DIM),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(EnvisInsightEngine._HEAD_HIDDEN_DIM, out_features),
        ]
        if squash:
            layers.append(nn.Sigmoid())
        return nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return all task predictions.

        Returns:
            dict with ``distress`` and ``tension`` (sigmoid outputs with the
            trailing singleton dimension removed), ``framing`` (logits with
            ``num_framing_classes`` entries per example) and ``log_vars``
            (the learnable uncertainty parameter itself).
        """
        # NOTE(review): nn.Module.train() also puts the frozen encoder into
        # training mode, so its dropout layers are active during training.
        # Confirm this is intended; otherwise keep self.bert in eval mode.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Apply adapters to CLS token
        hidden = outputs.last_hidden_state[:, 0, :]  # CLS token
        for adapter in self.adapters:
            hidden = adapter(hidden)

        # Predictions
        distress = self.distress_head(hidden).squeeze(-1)
        framing = self.framing_head(hidden)
        tension = self.tension_head(hidden).squeeze(-1)

        return {
            'distress': distress,
            'framing': framing,
            'tension': tension,
            'log_vars': self.log_vars
        }


# Instantiate the model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = EnvisInsightEngine()
model = model.to(device)

# Parameter census: overall size versus the part that receives gradients
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
trainable_pct = 100 * trainable_params / total_params
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Trainable %: {trainable_pct:.2f}%")

## 4. Training Loop

In [None]:
def compute_loss(predictions, targets):
    """Combine the three task losses with learned uncertainty weights.

    Each task loss ``L_i`` contributes ``exp(-s_i) * L_i + s_i`` to the
    total, where ``s_i = predictions['log_vars'][i]`` and the task order is
    distress, framing, tension. Distress and tension use binary
    cross-entropy on probabilities; framing uses cross-entropy on logits.

    Returns:
        Tuple ``(total, per_task)``: the scalar loss tensor to backpropagate
        and a dict of the unweighted task losses as Python floats.
    """
    log_vars = predictions['log_vars']

    # Unweighted per-task losses, kept in the order matching log_vars.
    task_losses = {
        'distress': F.binary_cross_entropy(predictions['distress'], targets['distress']),
        'framing': F.cross_entropy(predictions['framing'], targets['framing']),
        'tension': F.binary_cross_entropy(predictions['tension'], targets['tension']),
    }

    # Accumulate weighted loss and regulariser task by task.
    total_loss = 0.0
    for position, task_loss in enumerate(task_losses.values()):
        log_var = log_vars[position]
        total_loss = total_loss + torch.exp(-log_var) * task_loss
        total_loss = total_loss + log_var

    per_task = {task: value.item() for task, value in task_losses.items()}
    return total_loss, per_task


def evaluate(model, dataloader, device):
    """Evaluate model on a dataset.

    Puts the model in eval mode, runs every batch of ``dataloader`` without
    gradient tracking and aggregates predictions for the metrics. The model
    is left in eval mode on return.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns the prediction dict consumed by ``compute_loss``.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask``, ``distress``, ``framing`` and ``tension``.
        device: Device the inputs and targets are moved to.

    Returns:
        dict of built-in floats: ``loss`` (mean of the per-batch losses),
        ``distress_auc``, ``framing_acc`` and ``tension_auc``.

    NOTE(review): ``roc_auc_score`` needs discrete class labels, so
    ``tension_auc`` assumes the tension column is binary — confirm.
    """
    model.eval()

    all_distress_pred, all_distress_true = [], []
    all_framing_pred, all_framing_true = [], []
    all_tension_pred, all_tension_true = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            predictions = model(input_ids, attention_mask)

            targets = {
                'distress': batch['distress'].to(device),
                'framing': batch['framing'].to(device),
                'tension': batch['tension'].to(device)
            }

            loss, _ = compute_loss(predictions, targets)
            total_loss += loss.item()

            # Predictions come back to the CPU; the true labels are taken
            # from the batch, which was never moved off the CPU.
            all_distress_pred.extend(predictions['distress'].cpu().numpy())
            all_distress_true.extend(batch['distress'].numpy())
            all_framing_pred.extend(predictions['framing'].argmax(dim=1).cpu().numpy())
            all_framing_true.extend(batch['framing'].numpy())
            all_tension_pred.extend(predictions['tension'].cpu().numpy())
            all_tension_true.extend(batch['tension'].numpy())

    # Cast every metric to a built-in float. Depending on the scikit-learn
    # version the scorers may return NumPy scalars; these dicts are pickled
    # into the checkpoint, and NumPy scalars are rejected by torch.load()
    # when it runs in weights-only mode.
    metrics = {
        'loss': float(total_loss / len(dataloader)),
        'distress_auc': float(roc_auc_score(all_distress_true, all_distress_pred)),
        'framing_acc': float(accuracy_score(all_framing_true, all_framing_pred)),
        'tension_auc': float(roc_auc_score(all_tension_true, all_tension_pred)),
    }

    return metrics

In [None]:
# Hyper-parameters for the run
EPOCHS = 20
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 500
PATIENCE = 3

# AdamW over the parameters that still require gradients (frozen ones are
# left out of the optimizer entirely)
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_parameters,
    weight_decay=WEIGHT_DECAY,
    lr=LEARNING_RATE,
)

# Cosine decay with linear warm-up, stepped once per optimizer step
total_steps = EPOCHS * len(train_loader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=WARMUP_STEPS,
)

# Training loop
# Runs up to EPOCHS epochs. After each epoch the model is validated, the
# metrics are appended to `training_log`, and the weights are checkpointed to
# 'best_model.pt' whenever the validation loss improves. Training stops early
# after PATIENCE consecutive epochs without improvement.
best_val_loss = float('inf')
patience_counter = 0
training_log = []

print("Starting training...")
print("=" * 60)

for epoch in range(EPOCHS):
    # evaluate() leaves the model in eval mode, so switch back every epoch.
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        targets = {
            'distress': batch['distress'].to(device),
            'framing': batch['framing'].to(device),
            'tension': batch['tension'].to(device)
        }

        optimizer.zero_grad()
        predictions = model(input_ids, attention_mask)
        loss, task_losses = compute_loss(predictions, targets)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The schedule is defined in optimizer steps, so advance it per batch.
        scheduler.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

    # Validation
    val_metrics = evaluate(model, val_loader, device)

    # Log
    log_entry = {
        'epoch': epoch + 1,
        'train_loss': epoch_loss / len(train_loader),
        'val_loss': val_metrics['loss'],
        'distress_auc': val_metrics['distress_auc'],
        'framing_acc': val_metrics['framing_acc'],
        'tension_auc': val_metrics['tension_auc'],
    }
    training_log.append(log_entry)

    print(f"\nEpoch {epoch+1}: "
          f"train_loss={log_entry['train_loss']:.4f}, "
          f"val_loss={val_metrics['loss']:.4f}, "
          f"distress_auc={val_metrics['distress_auc']:.4f}, "
          f"framing_acc={val_metrics['framing_acc']:.4f}")

    # Early stopping
    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        patience_counter = 0
        # Save best model ('epoch' is stored zero-based)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_metrics['loss'],
            'metrics': val_metrics,
        }, 'best_model.pt')
        # The check mark was previously stored as mis-decoded UTF-8 bytes.
        print("  ✓ New best model saved!")
    else:
        patience_counter += 1
        print(f"  No improvement. Patience: {patience_counter}/{PATIENCE}")

    if patience_counter >= PATIENCE:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        break

print("\nTraining complete!")

## 5. Evaluate on Test Set

In [None]:
# Load best model
# The checkpoint holds more than tensors (epoch number, metrics dict,
# optimizer state), so it is loaded with full unpickling; recent PyTorch
# releases default to weights-only loading, which rejects non-tensor objects
# such as NumPy scalars. This is safe only because the file was written by
# this notebook — never use weights_only=False on an untrusted checkpoint.
# map_location places the tensors on the device the model currently lives on.
checkpoint = torch.load('best_model.pt', map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Loaded best model from epoch {checkpoint['epoch']+1}")

# Evaluate on test set
test_metrics = evaluate(model, test_loader, device)

print("\n" + "=" * 60)
print("FINAL TEST SET RESULTS")
print("=" * 60)
print(f"Distress AUC:    {test_metrics['distress_auc']:.4f}")
print(f"Framing Acc:     {test_metrics['framing_acc']:.4f}")
print(f"Tension AUC:     {test_metrics['tension_auc']:.4f}")
print("=" * 60)

## 6. Save Final Model and Logs

In [None]:
# Persist the per-epoch log as CSV
log_df = pd.DataFrame(training_log)
log_df.to_csv('training_log.csv', index=False)

# Bundle the run summary (identifier, timing, best epoch, test metrics and
# the full log) and write it as indented JSON
run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results = dict(
    run_id=f'envis_poc_{run_stamp}',
    training_completed=datetime.now().isoformat(),
    epochs_trained=len(training_log),
    best_epoch=checkpoint['epoch'] + 1,  # checkpoint stores a zero-based epoch
    test_metrics=test_metrics,
    training_log=training_log,
)

with open('training_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Saved:")
for artifact in (
    "best_model.pt (model weights)",
    "training_log.csv",
    "training_results.json",
):
    print(f"  - {artifact}")

In [None]:
# Send the three run artefacts to the browser for download
from google.colab import files

for artifact_name in ('best_model.pt', 'training_log.csv', 'training_results.json'):
    files.download(artifact_name)