# Fine-tuning GPT-2 for Satirical News Generation

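This notebook fine-tunes GPT-2 to generate satirical news articles (a headline followed by the article body) from a short, comma-separated list of topics extracted from each headline.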

In [1]:
"""Fine-tuning notebook for GPT-2 on satirical news generation.

This notebook:
1. Loads the processed satirical news dataset
2. Extracts keywords from headlines and articles using NLP
3. Formats data for fine-tuning with system/user prompts
4. Fine-tunes GPT-2 model
5. Saves checkpoints during training
"""

import sys
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import torch

# Locate the repository root so the notebook works from several launch directories.
_cwd = Path.cwd()
if _cwd.name == "LOL-LM" or (_cwd / "src" / "notebooks").exists():
    # Launched from the repository root itself
    PROJECT_ROOT = _cwd
elif (_cwd.parent / "src" / "notebooks").exists():
    # Launched one level below the repository root
    PROJECT_ROOT = _cwd.parent
elif '__file__' in globals():
    # Running as a script: walk three levels up from this file
    PROJECT_ROOT = Path(__file__).parent.parent.parent
else:
    # Last resort: assume the working directory is two levels below the root
    PROJECT_ROOT = _cwd.parent.parent

# Put src/ at the front of the import path so project-local modules resolve
SRC_DIR = PROJECT_ROOT / "src"
sys.path.insert(0, str(SRC_DIR))

from utils import env  # noqa: F401 - loads .env file
from logger import log

# Resolve the data and model directories relative to the project root
_data_root = PROJECT_ROOT / "data"
DATA_DIR = _data_root / "fake_news" / "processed"
MODEL_DIR = _data_root / "model"

# Only the model directory is created here; DATA_DIR is not created by this cell
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations, one per line
print(
    f"Project root: {PROJECT_ROOT}",
    f"Data dir: {DATA_DIR}",
    f"Model dir: {MODEL_DIR}",
    sep="\n",
)

[32m2025-12-15 11:54:33[0m | [34m[1mDEBUG[0m | Loaded environment variables from: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/.env


Project root: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM
Data dir: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/processed
Model dir: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/model


In [2]:
# Install required packages if not already installed
# Strategy: try the imports first; only when one of them fails is pip invoked
# (through the current interpreter) and the imports are retried.
try:
    import spacy
    from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
    from datasets import Dataset
except ImportError:
    print("Installing required packages...")
    import subprocess
    # NOTE(review): packages are installed without version pins, so the resolved
    # versions depend on when this cell is run.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers", "datasets", "accelerate", "spacy"])
    # Retry the imports now that the packages are installed
    import spacy
    from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
    from datasets import Dataset
    
    # Download spaCy model
    # This download only runs on the ImportError path of this cell.
    print("Downloading spaCy English model...")
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

# Printed on both paths: imports succeeded immediately, or after installation
print("‚úÖ All packages installed")

‚úÖ All packages installed


In [3]:
# Load the cached fine-tuning file when it exists, otherwise the processed base dataset
finetune_path = DATA_DIR / "babylonbee_finetune.csv"
raw_dataset_path = DATA_DIR / "babylonbee_processed.csv"

# Later cells consult this flag to decide whether topic extraction/formatting can be skipped
finetune_exists = finetune_path.exists()

# Guard: with no cache available, the base dataset must be present
if not finetune_exists and not raw_dataset_path.exists():
    raise FileNotFoundError(f"Dataset not found at {raw_dataset_path}. Please run prep_sarcasm.ipynb first.")

if finetune_exists:
    df = pd.read_csv(finetune_path)
    print(f"Loaded fine-tune dataset: {len(df)} rows from {finetune_path}")
else:
    df = pd.read_csv(raw_dataset_path)
    print(f"Loaded base dataset: {len(df)} rows from {raw_dataset_path}")
print(f"Columns: {df.columns.tolist()}")

print("\nSample data:")
df.head()

Loaded fine-tune dataset: 10793 rows from /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/processed/babylonbee_finetune.csv
Columns: ['Headline', 'Article', 'topics', 'system', 'user', 'result', 'training_text']

Sample data:


Unnamed: 0,Headline,Article,topics,system,user,result,training_text
0,Brave Adventurer Discovers Long-Lost Article H...,"MOAB, UT ‚Äî A historic discovery was made today...","Brave Adventurer, Long-Lost Article, Hidden Be...",You are a satirical news generator. When given...,"Generate an article on: Brave Adventurer, Long...",Headline: Brave Adventurer Discovers Long-Lost...,<system>You are a satirical news generator. Wh...
1,Drunk Irishmen Say They Understood Biden's Dub...,DUBLIN ‚Äî Despite claims from conservative medi...,"Drunk Irishmen, Biden's Dublin Speech, Dublin ...",You are a satirical news generator. When given...,"Generate an article on: Drunk Irishmen, Biden'...",Headline: Drunk Irishmen Say They Understood B...,<system>You are a satirical news generator. Wh...
2,John Leguizamo's Boycott Of Mario Movie Leads ...,"MANHATTAN, NY ‚Äî With The Super Mario Bros. Mov...","John Leguizamo's Boycott, Mario Movie, Sharp Rise",You are a satirical news generator. When given...,Generate an article on: John Leguizamo's Boyco...,Headline: John Leguizamo's Boycott Of Mario Mo...,<system>You are a satirical news generator. Wh...
3,Pentagon Leaker Kicking Himself For Not Just L...,"DIGHTON, MA ‚Äî Military police have arrested Ja...","Classified Documents, His Garage, Pentagon Lea...",You are a satirical news generator. When given...,"Generate an article on: Classified Documents, ...",Headline: Pentagon Leaker Kicking Himself For ...,<system>You are a satirical news generator. Wh...
4,Parents Just Relieved Teen Who Came Home Drunk...,"NEW BRITAIN, PA ‚Äî Local parents Tim and Julia ...","Just Relieved Teen, Bud Light",You are a satirical news generator. When given...,"Generate an article on: Just Relieved Teen, Bu...",Headline: Parents Just Relieved Teen Who Came ...,<system>You are a satirical news generator. Wh...


In [4]:
# Load spaCy model for NLP keyword extraction
try:
    nlp = spacy.load("en_core_web_sm")
    print("‚úÖ Loaded spaCy English model")
except OSError:
    # The model package is missing: download it with the current interpreter, then retry
    print("‚ö†Ô∏è spaCy model not found. Installing...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
    print("‚úÖ Loaded spaCy English model")

# Define stopwords (spaCy's stopwords + common irrelevant words).
# A plain set(...) copy is used instead of calling the parameterized alias
# set[str](...), which is non-idiomatic and fails on Python < 3.9. Copying also
# keeps the update below from mutating spaCy's own default stop-word set.
stopwords = set(nlp.Defaults.stop_words)
stopwords.update([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
    'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be', 'been',
    'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
    'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
    'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which',
    'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both',
    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very', 'just', 'now',
])

‚úÖ Loaded spaCy English model


In [5]:
from typing import List


def _phrases_from_doc(doc):
    phrases = [chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2]
    ents = [ent.text.strip() for ent in doc.ents if len(ent.text.strip()) > 2]
    return phrases + ents


def extract_topics(headline: str, article: str, max_keywords: int = 3) -> str:
    """Headline-first topic extraction using noun chunks + entities.

    Candidate phrases come from the headline. When the headline yields none,
    the first sentence of the article is used instead.

    Args:
        headline: Headline text. ``None``, NaN and other non-string values are
            treated as empty instead of raising.
        article: Article text, used only as a fallback. Non-string values are
            treated as empty.
        max_keywords: Maximum number of topics to return. Values <= 0 select
            no topics, so the generic fallback string is returned.

    Returns:
        Comma-separated topics, or ``"news, article"`` when nothing usable
        was found.
    """
    # Treat None / NaN / other non-string values as missing text rather than
    # crashing on .strip() (a pandas NaN is a truthy float).
    headline = headline.strip() if isinstance(headline, str) else ""
    article = article.strip() if isinstance(article, str) else ""

    candidates: List[str] = []
    if headline:
        candidates = _phrases_from_doc(nlp(headline))

    if not candidates and article:
        # Only the first sentence is mined for phrases. Note the whole article
        # is still passed through nlp() to obtain the sentence boundaries.
        first_sentence = next(nlp(article).sents, None)
        if first_sentence:
            candidates = _phrases_from_doc(first_sentence)

    filtered: List[str] = []
    seen = set()
    for cand in candidates:
        # Check the limit before appending so max_keywords <= 0 yields nothing
        if len(filtered) >= max_keywords:
            break
        norm = cand.lower()
        if norm in seen:
            # Case-insensitive duplicate (e.g. same text as noun chunk and entity)
            continue
        tokens = re.split(r"\s+", cand)
        if len(cand) < 3 or all(tok.lower() in stopwords for tok in tokens):
            # Too short, or made up entirely of stopwords
            continue
        seen.add(norm)
        filtered.append(cand)

    if not filtered:
        return "news, article"
    return ", ".join(filtered)

# Smoke test: run the extractor on one headline with no article fallback text
_test_headline = "Mom starting to fear son's web series closest thing she'll have to grandchild"
_sample_topics = extract_topics(_test_headline, "")
print("Extracted topics sample:", _sample_topics)

Extracted topics sample: Mom, son's web series closest thing


In [6]:
# Extract keywords from all headlines and articles (only if not already processed)
if finetune_exists:
    # The cached fine-tune file already carries a 'topics' column
    print(f"Skipping keyword extraction; using existing file at {finetune_path}")
    print("Top 5 rows from cached fine-tune dataset:")
    display(df.head())
else:
    print("Extracting topics from dataset (headline-first noun chunks/entities)...")

    def extract_topics_from_row(row):
        """Return the topic string for one dataset row.

        Missing cells (NaN/None) are passed on as empty strings. Calling
        ``str()`` on them directly would yield the literal text "nan"/"None",
        which would then be mined for topics as if it were a real headline.
        """
        def _cell_text(value):
            return '' if pd.isna(value) else str(value)

        headline = _cell_text(row.get('Headline', ''))
        article = _cell_text(row.get('Article', ''))
        return extract_topics(headline, article, max_keywords=3)

    # Register progress_apply on pandas objects so the row-wise pass shows a progress bar
    tqdm.pandas(desc="Extracting topics")
    df['topics'] = df.progress_apply(extract_topics_from_row, axis=1)

    print(f"\n‚úÖ Extracted topics for {len(df)} rows")
    print("\nSample topics:")
    print(df[['Headline', 'topics']].head(10))

Skipping keyword extraction; using existing file at /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/processed/babylonbee_finetune.csv
Top 5 rows from cached fine-tune dataset:


Unnamed: 0,Headline,Article,topics,system,user,result,training_text
0,Brave Adventurer Discovers Long-Lost Article H...,"MOAB, UT ‚Äî A historic discovery was made today...","Brave Adventurer, Long-Lost Article, Hidden Be...",You are a satirical news generator. When given...,"Generate an article on: Brave Adventurer, Long...",Headline: Brave Adventurer Discovers Long-Lost...,<system>You are a satirical news generator. Wh...
1,Drunk Irishmen Say They Understood Biden's Dub...,DUBLIN ‚Äî Despite claims from conservative medi...,"Drunk Irishmen, Biden's Dublin Speech, Dublin ...",You are a satirical news generator. When given...,"Generate an article on: Drunk Irishmen, Biden'...",Headline: Drunk Irishmen Say They Understood B...,<system>You are a satirical news generator. Wh...
2,John Leguizamo's Boycott Of Mario Movie Leads ...,"MANHATTAN, NY ‚Äî With The Super Mario Bros. Mov...","John Leguizamo's Boycott, Mario Movie, Sharp Rise",You are a satirical news generator. When given...,Generate an article on: John Leguizamo's Boyco...,Headline: John Leguizamo's Boycott Of Mario Mo...,<system>You are a satirical news generator. Wh...
3,Pentagon Leaker Kicking Himself For Not Just L...,"DIGHTON, MA ‚Äî Military police have arrested Ja...","Classified Documents, His Garage, Pentagon Lea...",You are a satirical news generator. When given...,"Generate an article on: Classified Documents, ...",Headline: Pentagon Leaker Kicking Himself For ...,<system>You are a satirical news generator. Wh...
4,Parents Just Relieved Teen Who Came Home Drunk...,"NEW BRITAIN, PA ‚Äî Local parents Tim and Julia ...","Just Relieved Teen, Bud Light",You are a satirical news generator. When given...,"Generate an article on: Just Relieved Teen, Bu...",Headline: Parents Just Relieved Teen Who Came ...,<system>You are a satirical news generator. Wh...


In [7]:
# Format data for fine-tuning
# System prompt: "You are a satirical news generator. When given a topic, generate a funny headline followed by the article."
# User prompt: "Generate an article on: {topics}"
# Generation format: "Headline: {headline}\nArticle: {article}"

SYSTEM_PROMPT = "You are a satirical news generator. When given a topic, generate a funny headline followed by the article."


def format_training_example(row):
    """Build the single-string training example for one row.

    The result concatenates, in order: a ``<system>`` segment with
    SYSTEM_PROMPT, a ``<user>`` segment asking for an article on the row's
    topics, an ``<assistant>`` segment with the headline and article, and the
    ``<|endoftext|>`` terminator.

    Args:
        row: Mapping-like object with 'topics', 'Headline' and 'Article' keys.

    Returns:
        str: The formatted training text.
    """
    segments = (
        f"<system>{SYSTEM_PROMPT}",
        f"<user>Generate an article on: {row['topics']}",
        f"<assistant>Headline: {str(row['Headline'])}\nArticle: {str(row['Article'])}",
        "<|endoftext|>",
    )
    return "".join(segments)

print("Formatting training examples...")

if not finetune_exists:
    # Fresh build: derive the prompt/response columns, then cache everything to CSV
    df['system'] = SYSTEM_PROMPT
    df['user'] = df['topics'].apply(lambda topic_list: f"Generate an article on: {topic_list}")
    df['result'] = df.apply(lambda rec: f"Headline: {rec['Headline']}\nArticle: {rec['Article']}", axis=1)
    df['training_text'] = df.apply(format_training_example, axis=1)

    df.to_csv(finetune_path, index=False)
    print(f"‚úÖ Saved fine-tune dataset to {finetune_path}")
else:
    # Cached file: validate its schema and only fill in training_text if it is absent
    print("Using existing fine-tune dataset; ensuring training_text is present")
    required_columns = {'Headline', 'Article', 'topics', 'system', 'user', 'result'}
    missing_columns = required_columns.difference(df.columns)
    if missing_columns:
        raise ValueError(f"Existing fine-tune file is missing columns: {missing_columns}")
    if 'training_text' not in df.columns:
        df['training_text'] = df.apply(format_training_example, axis=1)

print(f"‚úÖ Formatted {len(df)} training examples")
print("\nSample training text:")
# Preview at most the first 500 characters of the first example
print(df['training_text'].iloc[0][:500] + "...")

Formatting training examples...
Using existing fine-tune dataset; ensuring training_text is present
‚úÖ Formatted 10793 training examples

Sample training text:
<system>You are a satirical news generator. When given a topic, generate a funny headline followed by the article.<user>Generate an article on: Brave Adventurer, Long-Lost Article, Hidden Beneath Labyrinth<assistant>Headline: Brave Adventurer Discovers Long-Lost Article Hidden Beneath Labyrinth Of Ads, Pop-Ups, Privacy Policies
Article: MOAB, UT ‚Äî A historic discovery was made today as a brave adventurer uncovered an internet article long thought to be lost forever underneath layers upon layers ...


In [8]:
# Load the base GPT-2 checkpoint together with its tokenizer
model_name = "gpt2"
print(f"Loading {model_name} model and tokenizer...")

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Register the padding/boundary tokens and the role markers used in training_text
special_tokens = dict(
    pad_token='<|pad|>',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    additional_special_tokens=['<system>', '<user>', '<assistant>'],
)
tokenizer.add_special_tokens(special_tokens)

# Resize the embedding matrix to the enlarged tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"
device = torch.device(_device_name)

print(f"Using device: {device}")
model.to(device)

print(f"‚úÖ Loaded model with {model.num_parameters():,} parameters")
print(f"Vocabulary size: {len(tokenizer)}")

Loading gpt2 model and tokenizer...


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Using device: mps
‚úÖ Loaded model with 124,443,648 parameters
Vocabulary size: 50262


In [9]:
# Prepare dataset for training
from datasets import Dataset

# Convert to HuggingFace Dataset
train_texts = df['training_text'].tolist()
dataset = Dataset.from_dict({'text': train_texts})

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of training strings into fixed-length sequences.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)`` holding the
            raw strings under the 'text' key.

    Returns:
        The tokenizer output for the batch. Every sequence is truncated or
        padded to exactly 512 tokens.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        # 512 is a chosen cap, not the model limit: GPT-2 supports 1024 positions.
        # NOTE(review): presumably picked to limit memory/compute; confirm.
        # Longer examples are truncated and lose their trailing <|endoftext|>.
        max_length=512,
        padding='max_length',
        # No return_tensors here: Dataset.map stores the columns in its own
        # format, so building torch tensors per batch is unnecessary.
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],  # keep only the tokenizer outputs
    desc="Tokenizing"
)

# Split into train/validation (90/10); the fixed seed makes the split repeatable
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"‚úÖ Training samples: {len(train_dataset)}")
print(f"‚úÖ Validation samples: {len(eval_dataset)}")

Tokenizing dataset...


Tokenizing:   0%|          | 0/10793 [00:00<?, ? examples/s]

‚úÖ Training samples: 9713
‚úÖ Validation samples: 1080


In [10]:
# Set up training arguments
output_dir = MODEL_DIR / "gpt2-satirical-news"
output_dir.mkdir(parents=True, exist_ok=True)

import inspect

# Requested configuration. Keys the installed transformers version does not
# accept are dropped below instead of raising a TypeError.
training_kwargs = dict(
    output_dir=str(output_dir),
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    learning_rate=5e-5,
    fp16=device.type == "cuda",  # Mixed precision only on CUDA
    logging_steps=50,
    eval_steps=500,
    save_steps=500,  # Save checkpoint every 500 steps
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=5,  # Keep only last 5 checkpoints
    prediction_loss_only=True,
    report_to="none",  # Disable wandb/tensorboard
    dataloader_pin_memory=device.type == "cuda",
)

# Parameter names accepted by the installed TrainingArguments (inspected once, reused below)
supported = set(inspect.signature(TrainingArguments).parameters)

# Handle transformers naming differences: newer releases call this argument
# `eval_strategy` instead of `evaluation_strategy`.
if "evaluation_strategy" not in supported and "eval_strategy" in supported:
    training_kwargs["eval_strategy"] = training_kwargs.pop("evaluation_strategy")

# Filter out kwargs that are not supported by the installed transformers version
filtered_kwargs = {k: v for k, v in training_kwargs.items() if k in supported}
missing = set(training_kwargs) - supported
if missing:
    print(f"‚ö†Ô∏è Skipping unsupported TrainingArguments keys for this transformers version: {missing}")

# Ensure eval/save strategies align when load_best_model_at_end is requested
if filtered_kwargs.get("load_best_model_at_end"):
    # Fall back to "steps" if save_strategy was filtered out or left empty
    save_strategy = filtered_kwargs.get("save_strategy") or "steps"
    eval_key = next(
        (key for key in ("evaluation_strategy", "eval_strategy") if key in supported),
        None,
    )
    if eval_key:
        filtered_kwargs[eval_key] = save_strategy
    else:
        filtered_kwargs.pop("load_best_model_at_end", None)
        print("‚ö†Ô∏è load_best_model_at_end disabled because eval/save strategy key not supported in this transformers version")

training_args = TrainingArguments(**filtered_kwargs)

print(f"‚úÖ Training arguments configured")
print(f"   Output directory: {output_dir}")
if hasattr(training_args, "save_steps"):
    print(f"   Checkpoints will be saved every {training_args.save_steps} steps")

‚úÖ Training arguments configured
   Output directory: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/model/gpt2-satirical-news
   Checkpoints will be saved every 500 steps


In [11]:
# Collator for causal language modeling (mlm=False disables masked-LM behaviour)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the trainer from the model, arguments, data splits and collator
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**_trainer_kwargs)

# Step estimate: whole optimizer updates per epoch multiplied by the epoch count
_samples_per_update = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
_estimated_steps = len(train_dataset) // _samples_per_update * training_args.num_train_epochs

print("‚úÖ Trainer initialized")
print(f"   Total training steps: {_estimated_steps}")

‚úÖ Trainer initialized
   Total training steps: 1821


In [None]:
# Launch fine-tuning; the banner repeats where outputs go and how often checkpoints are written
print(
    "üöÄ Starting fine-tuning...",
    f"   Model will be saved to: {output_dir}",
    f"   Checkpoints will be saved every {training_args.save_steps} steps",
    sep="\n",
)

trainer.train()

print("‚úÖ Training completed!")

üöÄ Starting fine-tuning...
   Model will be saved to: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/model/gpt2-satirical-news
   Checkpoints will be saved every 500 steps


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


In [None]:
# Persist the trained model and its tokenizer side by side under <output_dir>/final
final_model_dir = output_dir / "final"
final_model_dir.mkdir(exist_ok=True)

_final_dir_str = str(final_model_dir)
trainer.save_model(_final_dir_str)
tokenizer.save_pretrained(_final_dir_str)

print(f"‚úÖ Final model saved to: {final_model_dir}")
print(f"   Model files:")
# List every top-level entry written to the final model directory
for file in final_model_dir.glob("*"):
    entry_name = file.name
    print(f"     - {entry_name}")

In [None]:
# Smoke-test the saved model by generating one article from a hand-written prompt
print("Testing the fine-tuned model...")

# Reload model and tokenizer from disk so the saved artifacts themselves are exercised
_saved_dir = str(final_model_dir)
test_model = GPT2LMHeadModel.from_pretrained(_saved_dir).to(device)
test_tokenizer = GPT2Tokenizer.from_pretrained(_saved_dir)

# Build the prompt in the training format, stopping right after the <assistant> marker
test_topics = "politics, election, candidate"
test_prompt = f"<system>{SYSTEM_PROMPT}<user>Generate an article on: {test_topics}<assistant>"

inputs = test_tokenizer.encode(test_prompt, return_tensors='pt').to(device)

# Sampling settings for a single generated sequence
_generation_kwargs = dict(
    max_length=300,
    num_return_sequences=1,
    temperature=0.8,
    do_sample=True,
    pad_token_id=test_tokenizer.pad_token_id,
    eos_token_id=test_tokenizer.eos_token_id,
)

# Generate without tracking gradients
with torch.no_grad():
    outputs = test_model.generate(inputs, **_generation_kwargs)

# Special tokens are kept in the decoded text so the role markers stay visible
generated_text = test_tokenizer.decode(outputs[0], skip_special_tokens=False)
print("\nGenerated text:")
print(generated_text)

In [None]:
# List all saved checkpoints
def _checkpoint_sort_key(path):
    """Sort key ordering ``checkpoint-<step>`` directories by numeric step.

    A plain sort on paths is lexicographic and would place checkpoint-1000
    before checkpoint-500. Names without a numeric suffix sort last, by name.
    """
    suffix = path.name.rpartition("-")[2]
    if suffix.isdigit():
        return (0, int(suffix), path.name)
    return (1, 0, path.name)


def _dir_size_mb(directory):
    """Return the total size in MiB of all regular files under ``directory``."""
    total_bytes = sum(f.stat().st_size for f in directory.rglob('*') if f.is_file())
    return total_bytes / (1024**2)


print("üìÅ Saved checkpoints:")
checkpoint_dirs = sorted(
    (d for d in output_dir.iterdir() if d.is_dir() and d.name.startswith('checkpoint')),
    key=_checkpoint_sort_key,
)
for checkpoint_dir in checkpoint_dirs:
    print(f"   {checkpoint_dir.name}")
    # Show checkpoint size
    print(f"     Size: {_dir_size_mb(checkpoint_dir):.2f} MB")

if final_model_dir.exists():
    print(f"\n   Final model: {_dir_size_mb(final_model_dir):.2f} MB")