### Framing classifier using BERT - V2 - Longformer Generalist

Retain all features from V1 notebook. Drop the long-doc policy in favor of the longformer. Adding in title + text into input. Introduce a more streamlined means of storing results, including folders per run. 


In [2]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from dotenv import load_dotenv
import os
import transformers
load_dotenv()  # looks for .env in current directory or parent
print(torch.__version__)
print(torch.cuda.is_available())


2.6.0+cu124
True


### Sample the data, NO Topic Filter

In [3]:
# Connect to server 
import psycopg2
conn = psycopg2.connect(
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT")
)
cur = conn.cursor()

# key: set the seed
cur.execute("SELECT setseed(0.42)")

# Do our join in database - NOTE this is with a POLITICS FILTER
cur.execute(f"""
           SELECT a.text_generic_frame, a.gpt_topic, a.political_leaning, a.title,  
           b.maintext
           FROM mm_framing_full a
           JOIN newsarticles b ON a.url = b.url
           ORDER BY RANDOM()
            LIMIT 75000
            """)

result= cur.fetchall()

print(cur.description)

cur.close()
conn.close()

df = pd.DataFrame(result, columns=["text_generic_frame", "gpt_topic", "political_leaning", "title", "article_text"])

del result

df.head()

(Column(name='text_generic_frame', type_code=1009), Column(name='gpt_topic', type_code=25), Column(name='political_leaning', type_code=25), Column(name='title', type_code=25), Column(name='maintext', type_code=25))


Unnamed: 0,text_generic_frame,gpt_topic,political_leaning,title,article_text
0,"[Cultural identity, External regulation and re...",War & Conflict,left_lean,Suspects arrested after Moscow concert hall at...,State media reported Saturday that Russian aut...
1,"[Legality, constitutionality and jurisprudence...",Politics,right,Senate prepares for Mayorkas impeachment artic...,NEWYou can now listen to Fox News articles!\nT...
2,"[Economic, Political]",Politics,left_lean,Tim Scott's presidential campaign is burning t...,Few 2024 GOP presidential hopefuls are bringin...
3,"[Cultural identity, Fairness and equality, Ext...",Sports,left_lean,USA v Portugal: Women’s World Cup 2023 Group E...,Key events\nShow key events only\nPlease turn ...
4,"[Policy prescription and evaluation, Crime and...",Legal,right,Alex Murdaugh 'extremely angry' about jury tam...,NEWYou can now listen to Fox News articles!\nA...


### Initial Data Filtering

Not that in between these notebooks, I changed the gpt_topic and generic frame columns directly in database so I don't have to keep fixing them.

In [4]:
# Create word count column
df['num_words'] = df['article_text'].str.split().str.len()

print(f"Original rows: {len(df)}")

# Filter based on length
df_filtered = df[(df['num_words'] > 100)]
df = df_filtered.dropna()
df = df.reset_index(drop=True)

print(f"Filtered rows: {len(df_filtered)}")

del df_filtered

Original rows: 75000
Filtered rows: 56406


In [5]:
# Keep rows only where the list is NOT exactly ['Other']
df = df[df['text_generic_frame'].apply(lambda x: x != ['Other'])]
print(f"Rows remaining: {len(df)}")

Rows remaining: 55579


In [6]:
df.head()

Unnamed: 0,text_generic_frame,gpt_topic,political_leaning,title,article_text,num_words
0,"[Cultural identity, External regulation and re...",War & Conflict,left_lean,Suspects arrested after Moscow concert hall at...,State media reported Saturday that Russian aut...,887
1,"[Legality, constitutionality and jurisprudence...",Politics,right,Senate prepares for Mayorkas impeachment artic...,NEWYou can now listen to Fox News articles!\nT...,602
2,"[Cultural identity, Fairness and equality, Ext...",Sports,left_lean,USA v Portugal: Women’s World Cup 2023 Group E...,Key events\nShow key events only\nPlease turn ...,1527
3,"[Policy prescription and evaluation, Crime and...",Legal,right,Alex Murdaugh 'extremely angry' about jury tam...,NEWYou can now listen to Fox News articles!\nA...,936
4,"[Fairness and equality, Political, Public opin...",Politics,left_lean,Nikki Haley dropping out of GOP presidential race,Former United Nations Ambassador Nikki Haley s...,1003


In [7]:
# Engineer the text column

# Adding the title
df['article_text'] = df['title'] + "\n" + df['article_text']

# Adding the topic at the very start
df['article_text'] = "TOPIC:" + df['gpt_topic'] + "\n" + df['article_text']



### Tokenization + Adaptively setting max_length

In [8]:

from transformers import LongformerTokenizerFast
import numpy as np

# 1. Initialize Tokenizer
model_name = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

# 2. Measure Lengths
# We process in batches to keep it snappy
print("Measuring token lengths...")
token_lens = []
texts = df['article_text'].tolist()

# Tokenize just to count (no padding/truncation yet)
# Using the fast tokenizer's batch_encode_plus is usually efficient enough
encodings = tokenizer(texts, add_special_tokens=True, return_attention_mask=False)
token_lens = [len(x) for x in encodings['input_ids']]

# 3. Statistics
token_lens = np.array(token_lens)
p95 = np.percentile(token_lens, 95)
p99 = np.percentile(token_lens, 99)

print(f"Mean Length: {np.mean(token_lens):.1f}")
print(f"95th Percentile: {p95:.1f} tokens")
print(f"99th Percentile: {p99:.1f} tokens")
print(f"Max Length found: {np.max(token_lens)} tokens")

# based on these a max_length of 2048 is sufficient, and balances efficiency

Measuring token lengths...
Mean Length: 740.0
95th Percentile: 1449.0 tokens
99th Percentile: 1922.7 tokens
Max Length found: 20301 tokens


### Dataset Creation

In [9]:
# load the binarizer
import joblib
mlb = joblib.load('encoders/mlb_15_classes.pkl')

labels_matrix = mlb.transform(df['text_generic_frame'])

print(labels_matrix.shape)

(55579, 15)


In [10]:
# we can innovate in storing meta data from past approach by just storing self.df
# we also innovate by saving VRAM with the __getitem__ implementation
from torch.utils.data import Dataset, DataLoader

import torch
from torch.utils.data import Dataset, DataLoader

class NewsArticleDataset(Dataset):
    def __init__(self, df, tokenizer, labels_matrix, max_len=2048):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # We store the pre-computed matrix you made with MLB
        self.labels = labels_matrix 
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row['article_text'])
        
        # TOKENIZATION
        # Truncate, but DO NOT PAD here. 
        # The collator handles padding to save memory.
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding=False, 
            add_special_tokens=True 
        )
        
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        
        # CREATE GLOBAL ATTENTION MASK
        # 0 = Local Attention, 1 = Global Attention
        # We strictly set the [CLS] token (index 0) to Global Attention.
        global_attention_mask = [0] * len(input_ids)
        global_attention_mask[0] = 1 
        
        # GET LABELS
        # Directly access the row from your matrix
        labels_vec = self.labels[idx]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'global_attention_mask': global_attention_mask,
            'labels': torch.tensor(labels_vec, dtype=torch.float),
            
            # METADATA PASS-THROUGH
            'metadata': {
                'url': row.get('url', ''),
                'title': row.get('title', ''),
                'gpt_topic': row.get('gpt_topic', ''),
                'num_words': row.get('num_words', 0)
            }
        }


In [11]:
# Collate function, which is critical to running this much larger model

def longformer_collate_fn(batch):
    """
    Custom collator to handle dynamic padding and 512-window alignment.
    """
    # 1. Determine the maximum length in this specific batch
    max_len = max(len(item['input_ids']) for item in batch)
    
    # 2. Round up to nearest multiple of 512 (Longformer Window Size)
    # This aligns memory for the sliding window attention mechanism
    window_size = 512
    padded_len = ((max_len + window_size - 1) // window_size) * window_size
    
    # Prepare batch lists
    input_ids_batch = []
    attention_mask_batch = []
    global_attention_mask_batch = []
    labels_batch = []
    metadata_batch = []
    
    pad_token_id = tokenizer.pad_token_id
    
    for item in batch:
        # Calculate padding needed for this sequence
        curr_len = len(item['input_ids'])
        pad_len = padded_len - curr_len
        
        # Pad Input IDs
        ids = item['input_ids'] + [pad_token_id] * pad_len
        
        # Pad Attention Mask (0 for padded tokens)
        mask = item['attention_mask'] + [0] * pad_len
        
        # Pad Global Attention Mask (0 for padded tokens)
        global_mask = item['global_attention_mask'] + [0] * pad_len
        
        input_ids_batch.append(ids)
        attention_mask_batch.append(mask)
        global_attention_mask_batch.append(global_mask)
        labels_batch.append(item['labels'])
        metadata_batch.append(item['metadata'])

    return {
        'input_ids': torch.tensor(input_ids_batch, dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask_batch, dtype=torch.long),
        'global_attention_mask': torch.tensor(global_attention_mask_batch, dtype=torch.long),
        'labels': torch.stack(labels_batch),
        'metadata': metadata_batch # Returns a list of dicts
    }

In [12]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from torch.utils.data import Subset, DataLoader
import numpy as np

# 1. SETUP SPLITTERS -----------------------------------------------------------
# We need indices to split.
N = len(labels_matrix)
X_indices = np.zeros(N) # Dummy features just to satisfy the splitter API

# A. Split Train (80%) vs Temp (20%)
msss1 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, temp_idx = next(iter(msss1.split(X_indices, labels_matrix)))

# B. Split Temp into Val (10%) and Test (10%)
# We split the *temp* indices in half
temp_labels = labels_matrix[temp_idx]
temp_dummy_X = np.zeros(len(temp_idx))

msss2 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
relative_val_idx, relative_test_idx = next(iter(msss2.split(temp_dummy_X, temp_labels)))

# Map relative indices back to original dataframe indices
val_idx = temp_idx[relative_val_idx]
test_idx = temp_idx[relative_test_idx]

print(f"Splits created:")
print(f"Train: {len(train_idx)} | Val: {len(val_idx)} | Test: {len(test_idx)}")


# 2. INSTANTIATE DATASETS ------------------------------------------------------
# We create ONE full dataset, then subset it using the indices above.
full_dataset = NewsArticleDataset(
    df, 
    tokenizer, 
    labels_matrix, 
    max_len=2048
)

train_dataset = Subset(full_dataset, train_idx)
val_dataset   = Subset(full_dataset, val_idx)
test_dataset  = Subset(full_dataset, test_idx)


# 3. CREATE DATALOADERS --------------------------------------------------------
# Optimizations for RTX 4070 Ti Super
BATCH_SIZE = 4 

train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, # Shuffle ONLY training
    collate_fn=longformer_collate_fn,
    num_workers=0
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=longformer_collate_fn,
    num_workers=0
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=longformer_collate_fn,
    num_workers=0
)

print(f"Loaders ready. Train batches: {len(train_loader)}")

Splits created:
Train: 44463 | Val: 5558 | Test: 5558
Loaders ready. Train batches: 11116


In [13]:
# Let's test on a batch as a sanity check
# grab a batch using iterator next()
batch = next(iter(train_loader))

print(batch['metadata'][1])
# so now we have our useful metadata in our batches

{'url': '', 'title': 'Greek government under fire after video shows ‘pushback’ of asylum seekers', 'gpt_topic': 'Politics', 'num_words': np.int64(663)}


### Pre-training Loading

In [14]:
# for a clean state, hard reset the GPU state
import torch, gc
gc.collect()
torch.cuda.empty_cache() 

In [19]:
import os
import json
import torch
from datetime import datetime
# 1. THE DEEP IMPORT (Go straight to the source file)
try:
    from transformers.models.longformer.modeling_longformer import LongformerForSequenceClassification
except ImportError:
    # If this fails, we try the old directory structure (sometimes happens in older/conda envs)
    from transformers.models.longformer import LongformerForSequenceClassification

# 2. MODEL INITIALIZATION -----------------------------------------------------
print("Loading Longformer (this may take a moment)...")

model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096",
    num_labels=15, 
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True 
)

# ENABLE Gradient Checkpointing - which saves a huge amount of V-Ram
model.gradient_checkpointing_enable()

model.to('cuda')
print("Model loaded and moved to GPU.")

# 3. OPTIMIZER & SCHEDULER ----------------------------------------------------
# Hyperparameters
LR = 3e-5
EPOCHS = 4
# set the number of mini batches you run before updating gradients
ACCUMULATION_STEPS = 4  # Effective Batch Size = 4 (physical) * 4 (accum) = 16

optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=LR 
)

# 4. THE "LAB MANAGER" (EXPERIMENT TRACKER) -----------------------------------
class ExperimentTracker:
    def __init__(self, run_name, base_dir="saved_models/framing_training_runs_longformer"):
        # Create a unique timestamped folder for this run
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        self.run_dir = os.path.join(base_dir, f"{timestamp}_{run_name}")
        os.makedirs(self.run_dir, exist_ok=True)
        print(f" Experiment initialized. Saving to: {self.run_dir}")
        
        # Initialize a log dictionary
        self.history = {
            "config": {
                "model": "longformer-base-4096",
                "max_len": 2048,
                "batch_size": 4, 
                "accum_steps": ACCUMULATION_STEPS,
                "lr": LR
            },
            "epochs": []
        }
        
    def log_epoch(self, epoch_data):
        """Append epoch results to history and save immediately."""
        self.history["epochs"].append(epoch_data)
        self.save_history()
        
    def save_history(self):
        with open(os.path.join(self.run_dir, "metrics.json"), "w") as f:
            json.dump(self.history, f, indent=4)
            
    def save_model(self, model, name="model_state.bin"):
        torch.save(model.state_dict(), os.path.join(self.run_dir, name))
        print(f"Model saved: {name}")

    def save_report(self, df_report, name="classification_report.csv"):
        df_report.to_csv(os.path.join(self.run_dir, name))


Loading Longformer (this may take a moment)...


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to GPU.


In [20]:


# DEFINE weighted loss

num_positives = torch.tensor(labels_matrix.sum(axis=0), dtype=torch.float)
num_negatives = len(labels_matrix) - num_positives

# Calculate ratio: if we have 10x more negatives, we boost positives by 10x
pos_weight = (num_negatives / (num_positives + 1e-5)).to('cuda')

print(f"Calculated positive weights for {len(pos_weight)} classes.")

# 2. Define the Criterion
# We pass this into the training function
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)


Calculated positive weights for 15 classes.


### Training Run

In [21]:
# Initialize and name the run
tracker = ExperimentTracker(run_name="longformer_topic_expert_v1")

 Experiment initialized. Saving to: saved_models/framing_training_runs_longformer\20260121_0143_longformer_topic_expert_v1


In [None]:
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

# 3. TRAINING ENGINE ----------------------------------------------------------
def train_engine(model, train_loader, val_loader, optimizer, tracker, criterion):
    scaler = torch.amp.GradScaler('cuda') # Mixed Precision (Newer PyTorch syntax)
    
    for epoch in range(EPOCHS):
        print(f"\n======== EPOCH {epoch+1}/{EPOCHS} ========")
        
        # --- TRAINING PHASE ---
        model.train()
        train_loss = 0
        optimizer.zero_grad()
        
        loop = tqdm(train_loader, leave=True)
        for step, batch in enumerate(loop):
            # Move batch to device
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            global_attention_mask = batch['global_attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')
            
            # Forward Pass (Mixed Precision)
            with torch.amp.autocast('cuda'):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    global_attention_mask=global_attention_mask,
                    # We pass labels only to suppress warnings, we don't use internal loss
                    labels=labels 
                )
                
                # CUSTOM WEIGHTED LOSS
                loss = criterion(outputs.logits, labels)
                loss = loss / ACCUMULATION_STEPS
            
            # Backward Pass
            scaler.scale(loss).backward()
            
            if (step + 1) % ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            train_loss += loss.item() * ACCUMULATION_STEPS
            loop.set_postfix(loss=loss.item() * ACCUMULATION_STEPS)
            
        if len(train_loader) % ACCUMULATION_STEPS != 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
    
        avg_train_loss = train_loss / len(train_loader)
        
        # --- VALIDATION PHASE ---
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []
        
        print("Running Validation...")
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to('cuda')
                attention_mask = batch['attention_mask'].to('cuda')
                global_attention_mask = batch['global_attention_mask'].to('cuda')
                labels = batch['labels'].to('cuda')
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    global_attention_mask=global_attention_mask,
                    labels=labels
                )
                
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()
                
                probs = torch.sigmoid(outputs.logits)
                all_preds.append(probs.cpu().numpy())
                all_labels.append(labels.cpu().numpy())
        
        avg_val_loss = val_loss / len(val_loader)
        
        # --- METRICS ---
        all_preds_np = np.concatenate(all_preds)
        all_labels_np = np.concatenate(all_labels)

        # 1. Standard Monitor (Micro @ 0.5) - Good for overall accuracy
        temp_preds = (all_preds_np > 0.5).astype(int)
        val_f1_micro = f1_score(all_labels_np, temp_preds, average='micro')

        # 2. Minority Class Monitor (Macro @ 0.5) - Good for rare frames
        # This warns you if the model is just predicting the dominant classes
        val_f1_macro = f1_score(all_labels_np, temp_preds, average='macro')

        print(f" Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Micro-F1: {val_f1_micro:.4f} | Macro-F1: {val_f1_macro:.4f}")

        tracker.log_epoch({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
            "val_f1_micro": val_f1_micro,
            "val_f1_macro": val_f1_macro 
        })
        
        # Save per-epoch model
        tracker.save_model(model, name=f"model_ep{epoch+1}.bin")

    print("Training Complete.")
    tracker.save_model(model, name="final_model.bin")
    
    return all_preds, all_labels


In [None]:

# 4. EXECUTE TRAINING ---------------------------------------------------------
print("Starting Training Run...")
val_preds, val_targets = train_engine(model, train_loader, val_loader, optimizer, tracker, criterion)