In [None]:
! pip install transformers datasets torch evaluate

In [None]:
! pip install rouge_score

In [None]:
!pip install evaluate

In [None]:
# Create the .kaggle directory
!mkdir -p ~/.kaggle

# Copy kaggle.json to the .kaggle directory
!cp kaggle.json ~/.kaggle/

# Set permissions for the Kaggle API token
!chmod 600 ~/.kaggle/kaggle.json

# Confirm Kaggle API setup
!kaggle datasets list -s "bbc-news-summary"

In [4]:
!kaggle datasets download -d pariza/bbc-news-summary

Dataset URL: https://www.kaggle.com/datasets/pariza/bbc-news-summary
License(s): CC0-1.0
Downloading bbc-news-summary.zip to /content
 79% 7.00M/8.91M [00:01<00:00, 8.47MB/s]
100% 8.91M/8.91M [00:01<00:00, 6.21MB/s]


In [None]:
!unzip bbc-news-summary.zip -d bbc-news-summary

In [1]:
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, pipeline
from sklearn.model_selection import train_test_split
from transformers import Seq2SeqTrainingArguments
from torch.utils.data import Dataset
import torch

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load articles and summaries
def load_bbc_dataset(articles_dir, summaries_dir):
    articles = []
    summaries = []

    # Iterate through categories
    for category in os.listdir(articles_dir):
        article_path = os.path.join(articles_dir, category)
        summary_path = os.path.join(summaries_dir, category)

        if os.path.isdir(article_path):
            for file in os.listdir(article_path):
                article_file = os.path.join(article_path, file)
                summary_file = os.path.join(summary_path, file)

                if os.path.exists(article_file) and os.path.exists(summary_file):
                    with open(article_file, 'r', encoding='utf-8', errors='ignore') as af, \
                         open(summary_file, 'r', encoding='utf-8', errors='ignore') as sf:
                        articles.append(af.read().strip())
                        summaries.append(sf.read().strip())

    return articles, summaries

# Paths to articles and summaries
articles_dir = "/content/bbc-news-summary/BBC News Summary/News Articles/"
summaries_dir = "/content/bbc-news-summary/BBC News Summary/Summaries/"

# Load data
articles, summaries = load_bbc_dataset(articles_dir, summaries_dir)

# Split into train, validation, and test sets
train_articles, val_test_articles, train_summaries, val_test_summaries = train_test_split(
    articles, summaries, test_size=0.2, random_state=42
)
val_articles, test_articles, val_summaries, test_summaries = train_test_split(
    val_test_articles, val_test_summaries, test_size=0.5, random_state=42
)

# Load T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Preprocessing function
def preprocess_data(articles, summaries, tokenizer, max_input_length=512, max_target_length=128):
    model_inputs = tokenizer(articles, max_length=max_input_length, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(summaries, max_length=max_target_length, truncation=True, padding="max_length", return_tensors="pt").input_ids

    # Replace padding token ids in labels with -100 to ignore during training
    labels[labels == tokenizer.pad_token_id] = -100

    model_inputs["labels"] = labels
    return model_inputs

# Prepare datasets
train_data = preprocess_data(train_articles, train_summaries, tokenizer)
val_data = preprocess_data(val_articles, val_summaries, tokenizer)
test_data = preprocess_data(test_articles, test_summaries, tokenizer)

# Move tensors to GPU
train_data = {key: val.to(device) for key, val in train_data.items()}
val_data = {key: val.to(device) for key, val in val_data.items()}

# Load T5 model and move to GPU
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Define training arguments for summarization
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
    save_total_limit=2,
    eval_steps=500,
    do_train=True,
    do_eval=True,
)

class SummarizationDataset(Dataset):
    def __init__(self, data):
        self.input_ids = data["input_ids"]
        self.attention_mask = data["attention_mask"]
        self.labels = data["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

# Create datasets
train_dataset = SummarizationDataset(train_data)
val_dataset = SummarizationDataset(val_data)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Load trained model for evaluation
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Generate summaries for test articles
generated_summaries = [
    summarizer(article, max_length=128, min_length=30, do_sample=False)[0]["summary_text"]
    for article in test_articles
]


rouge = load_metric("rouge")
results = rouge.compute(predictions=generated_summaries, references=test_summaries, use_stemmer=True)

# Print ROUGE results
for key in results:
    print(f"{key}: {results[key].mid}")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.570953
2,No log,0.534389
3,0.831200,0.527357


Token indices sequence length is longer than the specified maximum sequence length for this model (621 > 512). Running this sequence through the model will result in indexing errors
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


ImportError: cannot import name 'load_metric' from 'datasets' (/usr/local/lib/python3.10/dist-packages/datasets/__init__.py)

In [6]:
len(generated_summaries)

223

In [12]:
from evaluate import load

# Load the ROUGE metric
rouge = load("rouge")

# Compute ROUGE scores
results = rouge.compute(predictions=generated_summaries, references=test_summaries, use_stemmer=True)

# Print ROUGE results
for key in results:
    print(f"{key}: {results[key]}")


rouge1: 0.3755542558833496
rouge2: 0.29366889632031407
rougeL: 0.2944316313768526
rougeLsum: 0.29539844632681955
