# Headline Generation Training Experiment

This notebook demonstrates how to use the modular training pipeline for headline generation.

## Setup and Imports

In [None]:
# Install required packages if running in a new environment.
# Prefer %pip over !pip so the packages are installed into this kernel's environment.
# %pip install torch transformers datasets rouge-score nltk streamlit bitsandbytes trl peft flash-attn huggingface-hub pandas numpy

In [None]:
import sys
import os
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer

# Add src to path
sys.path.append('./src')

# Import our modules
from src import (
    load_dataset,
    train_headline_model,
    quick_train,
    load_trained_model,
    generate_headlines,
    setup_model_and_tokenizer
)

# Report the runtime environment: PyTorch build, and the GPU name when CUDA is usable.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name()}")

## Data Loading and Exploration

In [None]:
# Fetch the train/validation frames from the project loader and report their sizes.
print("Loading dataset...")
train_df, val_df = load_dataset()

print(
    f"Training set size: {len(train_df)}",
    f"Validation set size: {len(val_df)}",
    sep="\n",
)

# Peek at the first rows of the training split.
print("\nTraining data sample:")
print(train_df.head())

### Data Analysis

In [None]:
# Derive character- and word-count columns for articles and headlines.
# The columns are added to train_df in this fixed order.
length_features = {
    'article_length': train_df['article'].str.len(),
    'headline_length': train_df['headline'].str.len(),
    'article_words': train_df['article'].str.split().str.len(),
    'headline_words': train_df['headline'].str.split().str.len(),
}
for feature_name, feature_values in length_features.items():
    train_df[feature_name] = feature_values

# Summary statistics, one table per text field.
summaries = (
    ("Article statistics:", ['article_length', 'article_words']),
    ("\nHeadline statistics:", ['headline_length', 'headline_words']),
)
for heading, feature_columns in summaries:
    print(heading)
    print(train_df[feature_columns].describe())

In [None]:
# Visualize length distributions on a 2x2 grid of histograms.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel: (grid position, train_df column, bins, bar color, title, x-axis label).
# A color of None lets matplotlib pick its default color, as the first panel did originally.
histogram_panels = [
    ((0, 0), 'article_length', 50, None, 'Article Length Distribution (characters)', 'Length'),
    ((0, 1), 'headline_length', 50, 'orange', 'Headline Length Distribution (characters)', 'Length'),
    ((1, 0), 'article_words', 50, 'green', 'Article Word Count Distribution', 'Word Count'),
    ((1, 1), 'headline_words', 30, 'red', 'Headline Word Count Distribution', 'Word Count'),
]

# Draw every panel with the same recipe instead of four copy-pasted stanzas.
for position, column, bins, color, title, xlabel in histogram_panels:
    ax = axes[position]
    ax.hist(train_df[column], bins=bins, alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Show some example article-headline pairs.
# head(3) yields at most three rows, so a training frame with fewer than three
# rows prints what is available instead of raising IndexError from iloc.
print("Sample article-headline pairs:\n")
sample_rows = train_df.head(3)
sample_pairs = zip(sample_rows['article'], sample_rows['headline'])
for number, (article, headline) in enumerate(sample_pairs, start=1):
    print(f"Example {number}:")
    # Only the first 200 characters of the article are shown to keep output short.
    print(f"Article: {article[:200]}...")
    print(f"Headline: {headline}")
    print("-" * 80)

## Training Configuration and Execution

In [None]:
# Training parameters, assembled from themed groups.
# Merging with ** keeps the keys in the order the groups are listed.
model_settings = {
    "model_id": "Helsinki-NLP/opus-mt-en-mul",
    "output_dir": "./models/headline-generator-experiment",
    "hub_model_name": "headline-generator-opus-mt-en-mul-qlora-sft-v2",
}

optimization_settings = {
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 128,
    "learning_rate": 2.0e-05,
}

lora_settings = {
    "lora_r": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
}

runtime_settings = {
    "max_length": 256,
    "fp16": True,
    "seed": 42,
}

hub_settings = {
    "push_to_hub_after_training": False,  # Set to True to push to hub
}

training_config = {
    **model_settings,
    **optimization_settings,
    **lora_settings,
    **runtime_settings,
    **hub_settings,
}

# Echo the effective configuration, one "key: value" line per entry.
print("Training configuration:")
print("\n".join(f"  {key}: {value}" for key, value in training_config.items()))

In [None]:
# Start training
print("Starting training with the configured parameters...")

# Reset the results first: if this cell is re-run in a live kernel and training
# fails, `trainer` / `metrics` from an earlier successful run must not linger
# and be mistaken for the outcome of this attempt.
trainer, metrics = None, None

try:
    trainer, metrics = train_headline_model(**training_config)
except Exception as e:
    # Best-effort by design: report the failure with its traceback and let the
    # notebook continue; `trainer` and `metrics` stay None to mark the failure.
    import traceback

    print(f"Training failed with error: {e}")
    traceback.print_exc()
else:
    print("\nTraining completed successfully!")
    print(f"Final training metrics: {metrics}")

## Model Testing and Evaluation

In [None]:
# Load the trained model from the directory the training run wrote to.
model_path = training_config["output_dir"]

if not os.path.exists(model_path):
    # Stop here with a clear error. Merely printing would let later cells fail
    # with a NameError on `model`, or silently reuse a `model` object left over
    # from an earlier run in the same kernel.
    raise FileNotFoundError(
        f"Model path {model_path} does not exist. Training may have failed."
    )

print(f"Loading trained model from: {model_path}")
model, tokenizer = load_trained_model(model_path)
print("Model loaded successfully!")

In [None]:
# Compare generated headlines with the reference headlines for the first five validation rows.
val_sample = val_df.head(5)
test_articles = val_sample['article'].tolist()
true_headlines = val_sample['headline'].tolist()

print("Generating headlines for test articles...")
validation_decoding = dict(max_length=128, num_beams=5)
generated_headlines = generate_headlines(
    model=model,
    tokenizer=tokenizer,
    articles=test_articles,
    **validation_decoding,
)

print("\nGenerated vs True Headlines:")
print("=" * 100)
comparisons = zip(test_articles, true_headlines, generated_headlines)
for number, (article, true_headline, generated_headline) in enumerate(comparisons, start=1):
    print(f"\nExample {number}:")
    # Articles are truncated to 150 characters for readability.
    print(f"Article: {article[:150]}...")
    print(f"True headline: {true_headline}")
    print(f"Generated headline: {generated_headline}")
    print("-" * 50)

In [None]:
# Try the model on a few hand-written articles (finance, technology, public health).
custom_articles = [
    "The stock market reached an all-time high today as investors showed confidence in the technology sector. Major tech companies reported strong quarterly earnings, driving the NASDAQ index up by 3.2%.",
    "Scientists at MIT have developed a new type of battery that can charge electric vehicles in under 5 minutes. The breakthrough technology uses a novel lithium-metal composition that promises to revolutionize the EV industry.",
    "The World Health Organization announced new guidelines for global pandemic preparedness following lessons learned from COVID-19. The recommendations include improved early warning systems and international cooperation protocols.",
]

print("Testing model with custom articles...")
custom_decoding = dict(max_length=64, num_beams=3)
custom_headlines = generate_headlines(
    model=model,
    tokenizer=tokenizer,
    articles=custom_articles,
    **custom_decoding,
)

print("\nCustom Article Headlines:")
print("=" * 80)
for number, (article, headline) in enumerate(zip(custom_articles, custom_headlines), start=1):
    print(f"\nExample {number}:")
    print(f"Article: {article}")
    print(f"Generated headline: {headline}")
    print("-" * 50)

## Push to Hugging Face Hub (Optional)

In [None]:
# Uncomment and run this cell to push the model to Hugging Face Hub
# Make sure you're logged in to Hugging Face Hub first

# from huggingface_hub import notebook_login
# notebook_login()

# from src.training_utils import push_to_hub

# hub_model_name = training_config["hub_model_name"]
# print(f"Pushing model to Hugging Face Hub as: {hub_model_name}")

# try:
#     push_to_hub(model, tokenizer, hub_model_name)
#     print("Model successfully pushed to Hub!")
# except Exception as e:
#     print(f"Failed to push to hub: {e}")

## Experiment with Different Parameters

In [None]:
# Quick training experiment with different parameters
quick_config = {
    "output_dir": "./models/headline-generator-quick",
    "num_train_epochs": 1,
    "gradient_accumulation_steps": 64,  # Reduced for faster training
    "learning_rate": 1.0e-04,  # Higher learning rate
    "lora_r": 32,  # Smaller LoRA rank
    "max_length": 128,  # Shorter sequences
}

# Set to True to actually launch the quick run. It is off by default, so the
# cell no longer announces a training run that it does not perform.
RUN_QUICK_TRAINING = False

print("Quick training configuration:")
for key, value in quick_config.items():
    print(f"  {key}: {value}")

if RUN_QUICK_TRAINING:
    print("Running quick training experiment...")
    # The (trainer, metrics) unpacking mirrors the call that was previously
    # commented out here; confirm against quick_train's actual return value.
    trainer_quick, metrics_quick = quick_train(**quick_config)
    print(f"Quick training metrics: {metrics_quick}")

## Summary and Next Steps

This notebook demonstrates:

1. **Data Loading and Analysis**: How to load and explore the headline generation dataset
2. **Modular Training**: Using the structured training pipeline with configurable parameters
3. **Model Testing**: Testing the trained model with both validation data and custom examples
4. **Hub Integration**: Optionally pushing the trained model to the Hugging Face Hub for sharing

### Potential Next Steps:

- **Hyperparameter Tuning**: Experiment with different learning rates, LoRA parameters, and training schedules
- **Data Augmentation**: Try different data preprocessing techniques
- **Model Comparison**: Train multiple models with different base architectures
- **Evaluation Metrics**: Implement more comprehensive evaluation (ROUGE, BLEU, human evaluation)
- **Deployment**: Integrate the trained model into the Streamlit app

### Configuration Files:

For production use, consider creating configuration files (YAML/JSON) to manage training parameters instead of hardcoding them in notebooks.