In [None]:
!pip install torch torchvision torchaudio transformers datasets evaluate \
  rouge-score bert-score nltk --upgrade

In [None]:
!pip install --upgrade transformers
!pip install ipywidgets


In [8]:
from transformers import GPT2Tokenizer, GPT2Model
import os
import torch
import torch.nn as nn
import numpy as np
import random
import pickle
import functools
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from datasets import load_dataset
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import single_meteor_score
import concurrent.futures  # (Keep only once)

# Set device for GPU usage
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple MPS GPU")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("‚úÖ Using NVIDIA CUDA GPU")
else:
    device = torch.device("cpu")
    print("‚ö†Ô∏è Using CPU (no GPU available)")

‚úÖ Using NVIDIA CUDA GPU


In [9]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
import pickle

# ‚úÖ Set your NLTK data path explicitly
nltk.data.path.append('')

# ‚úÖ Load Punkt tokenizer manually
with open('', 'rb') as f:
    punkt_tokenizer = pickle.load(f)




In [None]:
!pip install scikit-learn

In [11]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# üì• Load dataset
raw_dataset = load_dataset("gigaword")
train_articles = raw_dataset["train"]
total_samples = 100000

# üîß File paths
save_dir = ""
os.makedirs(save_dir, exist_ok=True)
final_path = os.path.join(save_dir, "gpt2_final_samples.pt")
ckpt_path = os.path.join(save_dir, "gpt2_samples_progress.pt")

# ‚úÖ Sentence splitting
def split_into_sentences(text):
    return punkt_tokenizer.tokenize(text)

# ‚úÖ Label sentences with ROUGE-L
def label_sentences(sentences, reference, top_k=3):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(reference, sent)['rougeL'].fmeasure for sent in sentences]
    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return [1 if i in top_idxs else 0 for i in range(len(sentences))]

# ‚úÖ Process one article
def process_article(i):
    if i >= len(train_articles): return None
    article = train_articles[i]['document']
    summary = train_articles[i]['summary']
    sentences = split_into_sentences(article)
    if not sentences: return None
    labels = label_sentences(sentences, summary)
    return sentences, labels

# üîÅ Load checkpoint if exists
samples = []
start_idx = 0
if os.path.exists(ckpt_path):
    print("üîÅ Resuming from existing checkpoint...")
    samples = torch.load(ckpt_path)
    start_idx = len(samples)
    print(f"‚úÖ Loaded {start_idx} preprocessed samples")

# üõ†Ô∏è Parallel preprocessing
print(f"üîÑ Preprocessing {total_samples - start_idx} remaining articles using 8 CPUs...")

with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
    for idx, result in enumerate(executor.map(process_article, range(start_idx, total_samples)), start=start_idx):
        if result:
            sentences, labels = result
            tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
            for j in range(len(sentences)):
                samples.append({
                    "input_ids": tokenized['input_ids'][j],
                    "attention_mask": tokenized['attention_mask'][j],
                    "label": labels[j]
                })

        # üíæ Checkpoint every 10k
        if (idx + 1) % 10000 == 0 or (idx + 1) == total_samples:
            print(f"‚úÖ Processed {idx + 1} articles ‚Äî saving checkpoint...")
            torch.save(samples, ckpt_path)

# üíæ Save final result
torch.save(samples, final_path)
print("‚úÖ Preprocessing complete and saved:", final_path)

In [14]:
# üìä Check number of final training samples
final_samples = torch.load("")
print(f"‚úÖ Total preprocessed training samples: {len(final_samples)}")

‚úÖ Total preprocessed training samples: 100313


In [20]:

# ‚úÖ Sentence splitting
def split_into_sentences(text):
    return punkt_tokenizer.tokenize(text)

# ‚úÖ Label sentences based on ROUGE-L
def label_sentences(sentences, reference, top_k=3):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(reference, sent)['rougeL'].fmeasure for sent in sentences]
    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return [1 if i in top_idxs else 0 for i in range(len(sentences))]

# ‚úÖ GPT-2 Extractive Summarizer Model
class GPT2ExtractiveSummarizer(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = GPT2Model.from_pretrained("gpt2")
        self.classifier = nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_rep = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_rep).squeeze(-1)
        return logits

# ‚úÖ Custom Dataset
class ExtractiveDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return {
            "input_ids": sample["input_ids"],
            "attention_mask": sample["attention_mask"],
            "label": torch.tensor(sample["label"], dtype=torch.float)
        }

# ‚úÖ Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# ‚úÖ Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# üì• Load full validation split from Gigaword
raw_dataset = load_dataset("gigaword")
val_data_full = raw_dataset["validation"]

# ‚úÖ Use Hugging Face's built-in split method
split = val_data_full.train_test_split(test_size=0.5, seed=42)
val_data = split["train"]
test_data = split["test"]

# üíæ Save to disk
val_data.save_to_disk("")
test_data.save_to_disk("")

print(f"‚úÖ Validation Samples: {len(val_data):,}")
print(f"‚úÖ Test Samples      : {len(test_data):,}")

# üîÅ Load training samples
samples = torch.load("")

# üèãÔ∏è Training setup
train_dataset = ExtractiveDataset(samples)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

model = GPT2ExtractiveSummarizer().to(device)

if torch.cuda.device_count() > 1:
    print(f"‚úÖ Using {torch.cuda.device_count()} GPUs (DataParallel)")
    model = nn.DataParallel(model)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()

save_path = "best_gpt2_extractive_gigaword.pt"
checkpoint_path = "gpt2_extractive_checkpoint_gigaword.pt"

start_epoch = 0
best_rougel = 0.0
num_epochs = 3

if os.path.exists(checkpoint_path):
    print("üîÅ Loading training checkpoint...")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    if isinstance(model, nn.DataParallel):
        model.module.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    best_rougel = checkpoint['best_rougel']
    start_epoch = checkpoint['epoch'] + 1
    print(f"‚úÖ Resuming from epoch {start_epoch}")

# ‚úÖ ROUGE Evaluation Function
def evaluate_rougel(model, val_data, tokenizer, device, max_samples=2000):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    total_score = 0.0
    with torch.no_grad():
        for i in range(min(max_samples, len(val_data))):
            article = val_data[i]['document']
            reference = val_data[i]['summary']
            sentences = split_into_sentences(article)
            if not sentences:
                continue
            tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
            input_ids = tokenized['input_ids'].to(device)
            attention_mask = tokenized['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)
            topk = torch.topk(logits, k=min(3, len(sentences))).indices.tolist()
            pred_summary = " ".join([sentences[i] for i in topk])
            score = scorer.score(reference, pred_summary)['rougeL'].fmeasure
            total_score += score
    return total_score / max_samples

# üöÄ Training Loop
print("\nüöÄ Training...")
for epoch in range(start_epoch, num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"‚úÖ Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

    val_rougel = evaluate_rougel(model, val_data, tokenizer, device)
    print(f"üìà Validation ROUGE-L: {val_rougel:.4f}")

    if val_rougel > best_rougel:
        best_rougel = val_rougel
        torch.save(model.module.state_dict(), save_path)
        print(f"üíæ Best model saved (ROUGE-L {val_rougel:.4f})")

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.module.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_rougel': best_rougel
    }, checkpoint_path)
    print(f"üìå Checkpoint saved at epoch {epoch+1}")

Saving the dataset (0/1 shards):   0%|          | 0/94825 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/94826 [00:00<?, ? examples/s]

‚úÖ Validation Samples: 94,825
‚úÖ Test Samples      : 94,826


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

‚úÖ Using 4 GPUs (DataParallel)

üöÄ Training...
‚úÖ Epoch 1 Avg Loss: 0.0209
üìà Validation ROUGE-L: 0.2544
üíæ Best model saved (ROUGE-L 0.2544)
üìå Checkpoint saved at epoch 1
‚úÖ Epoch 2 Avg Loss: 0.0002
üìà Validation ROUGE-L: 0.2545
üíæ Best model saved (ROUGE-L 0.2545)
üìå Checkpoint saved at epoch 2
‚úÖ Epoch 3 Avg Loss: 0.0001
üìà Validation ROUGE-L: 0.2544
üìå Checkpoint saved at epoch 3


In [None]:
import nltk

nltk.download('wordnet', download_dir='')
nltk.download('omw-1.4', download_dir='')

In [22]:
import os


os.environ["NLTK_DATA"] = ""

In [23]:
# üîç Final Test Evaluation
checkpoint = torch.load(save_path)

# ‚úÖ Correctly load for single GPU or multi-GPU
if isinstance(model, nn.DataParallel):
    model.module.load_state_dict(checkpoint)
else:
    model.load_state_dict(checkpoint)

model.eval()


def evaluate_on_test(model, dataset, tokenizer, device, max_samples=10000):
    model.eval()
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    references, predictions = [], []
    meteor_total, r1_total, r2_total, rl_total = 0, 0, 0, 0

    with torch.no_grad():
        for i in range(min(max_samples, len(dataset))):
            article = dataset[i]['document']       # ‚úÖ updated
            reference = dataset[i]['summary'] 
            sentences = split_into_sentences(article)
            if not sentences:
                continue

            tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
            input_ids = tokenized['input_ids'].to(device)    # [num_sentences, 128]
            attention_mask = tokenized['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)   # [num_sentences]
            topk = torch.topk(logits, k=min(3, len(sentences))).indices.tolist()

            pred_summary = " ".join([sentences[i] for i in topk])

            scores = rouge.score(reference, pred_summary)
            r1_total += scores['rouge1'].fmeasure
            r2_total += scores['rouge2'].fmeasure
            rl_total += scores['rougeL'].fmeasure
            meteor_total += single_meteor_score(reference.split(), pred_summary.split())

            references.append(reference)
            predictions.append(pred_summary)

    precision, recall, f1 = bert_score(predictions, references, lang='en', verbose=False)
    n = len(predictions)
    print(f"\nüìä Final Evaluation on {n} test samples")
    print(f"ROUGE-1 F1: {r1_total / n:.4f}")
    print(f"ROUGE-2 F1: {r2_total / n:.4f}")
    print(f"ROUGE-L F1: {rl_total / n:.4f}")
    print(f"METEOR:     {meteor_total / n:.4f}")
    print(f"BERTScore P/R/F1: {precision.mean().item():.4f} / {recall.mean().item():.4f} / {f1.mean().item():.4f}")


In [24]:
import pandas as pd
evaluate_on_test(model, test_data, tokenizer, device)

# ‚ú® Save 100 qualitative summaries
summary_pairs = []
with torch.no_grad():
    for i in range(100):
        article = test_data[i]['document']      # ‚úÖ updated
        reference = test_data[i]['summary']     # ‚úÖ updated
        sentences = split_into_sentences(article)
        tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        input_ids = tokenized['input_ids'].to(device)
        attention_mask = tokenized['attention_mask'].to(device)
        logits = model(input_ids, attention_mask)
        topk = torch.topk(logits, k=min(3, len(sentences))).indices.tolist()
        pred_summary = " ".join([sentences[i] for i in topk])
        summary_pairs.append({
            "article": article[:500] + "...",
            "reference": reference,
            "predicted_summary": pred_summary
        })

pd.DataFrame(summary_pairs).to_csv("gpt2_extractive_summary_pairs.csv", index=False)
print("‚úÖ Summary pairs saved to gpt2_extractive_summary_pairs.csv")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìä Final Evaluation on 10000 test samples
ROUGE-1 F1: 0.2904
ROUGE-2 F1: 0.1029
ROUGE-L F1: 0.2520
METEOR:     0.3926
BERTScore P/R/F1: 0.8348 / 0.9011 / 0.8664
‚úÖ Summary pairs saved to gpt2_extractive_summary_pairs.csv
