In [None]:
!pip install torch torchvision torchaudio transformers datasets evaluate \
  rouge-score bert-score nltk --upgrade

In [None]:
!pip install --upgrade transformers
!pip install ipywidgets


In [3]:
from transformers import GPT2Tokenizer, GPT2Model
import os
import torch
import torch.nn as nn
import numpy as np
import random
import pickle
import functools
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from datasets import load_dataset
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import single_meteor_score
import concurrent.futures  # (Keep only once)

# Set device for GPU usage
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple MPS GPU")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("‚úÖ Using NVIDIA CUDA GPU")
else:
    device = torch.device("cpu")
    print("‚ö†Ô∏è Using CPU (no GPU available)")

‚úÖ Using NVIDIA CUDA GPU


In [4]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
import pickle

# ‚úÖ Set your NLTK data path explicitly
nltk.data.path.append('')

# ‚úÖ Load Punkt tokenizer manually
with open('', 'rb') as f:
    punkt_tokenizer = pickle.load(f)


In [5]:
# ‚úÖ Sentence splitting
def split_into_sentences(text):
    return punkt_tokenizer.tokenize(text)

# ‚úÖ Label sentences based on ROUGE-L
def label_sentences(sentences, reference, top_k=3):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(reference, sent)['rougeL'].fmeasure for sent in sentences]
    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return [1 if i in top_idxs else 0 for i in range(len(sentences))]

# ‚úÖ Device and Tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Important: GPT-2 has no pad_token by default

# ‚úÖ GPT-2 Extractive Summarizer Model
class GPT2ExtractiveSummarizer(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = GPT2Model.from_pretrained("gpt2")
        self.classifier = nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_rep = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_rep).squeeze(-1)
        return logits

# ‚úÖ Custom Dataset
class ExtractiveDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return {
            "input_ids": sample["input_ids"],
            "attention_mask": sample["attention_mask"],
            "label": torch.tensor(sample["label"], dtype=torch.float)
        }

# üì• Load CNN/DailyMail dataset
raw_dataset = load_dataset("cnn_dailymail", "3.0.0")
train_data = raw_dataset["train"]
val_data = raw_dataset["validation"]
test_data = raw_dataset["test"]

import concurrent.futures

# Only split sentences and label them in parallel
def process_article(i):
    if i >= len(train_articles):
        return None
    article = train_articles[i]['article']
    summary = train_articles[i]['highlights']
    sentences = split_into_sentences(article)
    if not sentences:
        return None
    labels = label_sentences(sentences, summary)
    return (sentences, labels)

print("üîÑ Preprocessing 100k training samples with checkpointing using 8 CPUs...")

samples = []
start_idx = 0
sample_ckpt_path = ""

# üîÑ Try to resume from checkpoint if exists
if os.path.exists(sample_ckpt_path):
    print("üîÅ Loading existing checkpoint...")
    samples = torch.load(sample_ckpt_path)
    start_idx = len(samples)  # how many samples already saved
    print(f"‚úÖ Loaded {start_idx} samples from checkpoint!")

print(f"üîÑ Preprocessing remaining {100000 - start_idx} training samples using 8 CPUs...")

train_articles = raw_dataset["train"]

with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
    for idx, result in enumerate(executor.map(process_article, range(start_idx, 100000)), start=start_idx):
        if result:
            sentences, labels = result
            tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
            for j in range(len(sentences)):
                samples.append({
                    "input_ids": tokenized['input_ids'][j],
                    "attention_mask": tokenized['attention_mask'][j],
                    "label": labels[j]
                })
        if (idx + 1) % 10000 == 0 or idx == 99999:
            print(f"‚úÖ Processed {idx+1} articles ‚Äî saving progress...")
            torch.save(samples, sample_ckpt_path)

torch.save(samples, "")
print("‚úÖ Preprocessing complete!")

üîÑ Preprocessing 100k training samples with checkpointing using 8 CPUs...
üîÅ Loading existing checkpoint...
‚úÖ Loaded 934112 samples from checkpoint!
üîÑ Preprocessing remaining -834112 training samples using 8 CPUs...
‚úÖ Preprocessing complete!


In [6]:
# üìä Check number of final training samples
final_samples = torch.load("")
print(f"‚úÖ Total preprocessed training samples: {len(final_samples)}")

‚úÖ Total preprocessed training samples: 934112


In [7]:
# üèãÔ∏è Training setup
train_dataset = ExtractiveDataset(samples)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# ‚úÖ Move model to device first
model = GPT2ExtractiveSummarizer().to(device)

# ‚úÖ Forcefully use 4 GPUs if available
if torch.cuda.device_count() > 1:
    print(f"‚úÖ Using {torch.cuda.device_count()} GPUs (DataParallel)")
    model = nn.DataParallel(model)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()

save_path = "best_gpt2_extractive.pt"
checkpoint_path = "gpt2_extractive_checkpoint.pt"

start_epoch = 0
best_rougel = 0.0
num_epochs = 3 

# üîÅ Resume training if checkpoint exists
if os.path.exists(checkpoint_path):
    print("üîÅ Loading training checkpoint...")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    
    # If using DataParallel, wrap model temporarily
    if isinstance(model, nn.DataParallel):
        model.module.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint['model_state_dict'])

    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_rougel = checkpoint['best_rougel']
    start_epoch = checkpoint['epoch'] + 1
    print(f"‚úÖ Resuming from epoch {start_epoch}")

# üß™ Validation Evaluation
def evaluate_rougel(model, val_data, tokenizer, device, max_samples=2000):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    total_score = 0.0
    with torch.no_grad():
        for i in range(min(max_samples, len(val_data))):
            article = val_data[i]['article']
            reference = val_data[i]['highlights']
            sentences = split_into_sentences(article)
            if not sentences:
                continue
            tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
            input_ids = tokenized['input_ids'].to(device)
            attention_mask = tokenized['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)
            topk = torch.topk(logits, k=min(3, len(sentences))).indices.tolist()
            pred_summary = " ".join([sentences[i] for i in topk])
            score = scorer.score(reference, pred_summary)['rougeL'].fmeasure
            total_score += score
    return total_score / max_samples

# üèÉ Training Loop
print("\nüöÄ Training...")
for epoch in range(start_epoch, num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"‚úÖ Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

    val_rougel = evaluate_rougel(model, val_data, tokenizer, device)
    print(f"üìà Validation ROUGE-L: {val_rougel:.4f}")

    if val_rougel > best_rougel:
        best_rougel = val_rougel
        # ‚úÖ Save .module.state_dict() because model is wrapped with DataParallel
        torch.save(model.module.state_dict(), save_path)
        print(f"üíæ Best model saved (ROUGE-L {val_rougel:.4f})")

    # ‚úÖ Save entire model checkpoint
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.module.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_rougel': best_rougel
    }, checkpoint_path)
    print(f"üìå Checkpoint saved at epoch {epoch+1}")

‚úÖ Using 4 GPUs (DataParallel)
üîÅ Loading training checkpoint...
‚úÖ Resuming from epoch 3

üöÄ Training...


In [None]:
import nltk

nltk.download('wordnet', download_dir='')
nltk.download('omw-1.4', download_dir='')

In [9]:
import os


os.environ["NLTK_DATA"] = ""

In [13]:
# üîç Final Test Evaluation
checkpoint = torch.load(save_path)

# ‚úÖ Correctly load for single GPU or multi-GPU
if isinstance(model, nn.DataParallel):
    model.module.load_state_dict(checkpoint)
else:
    model.load_state_dict(checkpoint)

model.eval()


def evaluate_on_test(model, dataset, tokenizer, device, max_samples=10000):
    model.eval()
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    references, predictions = [], []
    meteor_total, r1_total, r2_total, rl_total = 0, 0, 0, 0

    with torch.no_grad():
        for i in range(min(max_samples, len(dataset))):
            article = dataset[i]['article']
            reference = dataset[i]['highlights']
            sentences = split_into_sentences(article)
            if not sentences:
                continue

            tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
            input_ids = tokenized['input_ids'].to(device)    # [num_sentences, 128]
            attention_mask = tokenized['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)   # [num_sentences]
            topk = torch.topk(logits, k=min(3, len(sentences))).indices.tolist()

            pred_summary = " ".join([sentences[i] for i in topk])

            scores = rouge.score(reference, pred_summary)
            r1_total += scores['rouge1'].fmeasure
            r2_total += scores['rouge2'].fmeasure
            rl_total += scores['rougeL'].fmeasure
            meteor_total += single_meteor_score(reference.split(), pred_summary.split())

            references.append(reference)
            predictions.append(pred_summary)

    precision, recall, f1 = bert_score(predictions, references, lang='en', verbose=False)
    n = len(predictions)
    print(f"\nüìä Final Evaluation on {n} test samples")
    print(f"ROUGE-1 F1: {r1_total / n:.4f}")
    print(f"ROUGE-2 F1: {r2_total / n:.4f}")
    print(f"ROUGE-L F1: {rl_total / n:.4f}")
    print(f"METEOR:     {meteor_total / n:.4f}")
    print(f"BERTScore P/R/F1: {precision.mean().item():.4f} / {recall.mean().item():.4f} / {f1.mean().item():.4f}")


In [14]:
import pandas as pd
evaluate_on_test(model, test_data, tokenizer, device)

# ‚ú® Save 100 qualitative summaries
summary_pairs = []
with torch.no_grad():
    for i in range(100):
        article = test_data[i]['article']
        reference = test_data[i]['highlights']
        sentences = split_into_sentences(article)
        tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        input_ids = tokenized['input_ids'].to(device)
        attention_mask = tokenized['attention_mask'].to(device)
        logits = model(input_ids, attention_mask)
        topk = torch.topk(logits, k=min(3, len(sentences))).indices.tolist()
        pred_summary = " ".join([sentences[i] for i in topk])
        summary_pairs.append({
            "article": article[:500] + "...",
            "reference": reference,
            "predicted_summary": pred_summary
        })

pd.DataFrame(summary_pairs).to_csv("gpt2_extractive_summary_pairs.csv", index=False)
print("‚úÖ Summary pairs saved to gpt2_extractive_summary_pairs.csv")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìä Final Evaluation on 10000 test samples
ROUGE-1 F1: 0.3249
ROUGE-2 F1: 0.1158
ROUGE-L F1: 0.1969
METEOR:     0.2476
BERTScore P/R/F1: 0.8535 / 0.8590 / 0.8562
‚úÖ Summary pairs saved to gpt2_extractive_summary_pairs.csv
