Load the required libraies in Python

In [None]:
!pip install transformers datasets torch tqdm numpy py7zr rouge_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manyli

# Preparing Data

## Load the Dataset

Choose a smaller dataset suitable for text summarization and load it from hugging_face

In [None]:
from datasets import load_dataset

dataset = load_dataset("samsum")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

samsum.py:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


corpus.7z:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
dataset["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

## Load Tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

## Preprocess Data

## Get tokenized dataset

The pretrained tokenizer is applied to tokenize the natural language text input (i.e., 'dialogue') and target (i.e., 'summary') into tensors with a sequence of token IDs, which is obtained by predefined vocabulary to prepare for training.

In [None]:
def tokenize_input(example):
  # Format the input text with start and end prompts or prefix, which helps the model focus on summarization task.
  start_prompt = "Summarize the conversation. \n\n"
  end_prompt = "\n\nSummary: "
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

  # Tokenize both input and labels data
  inputs = tokenizer(prompt, padding="max_length", truncation=True, return_tensors='pt', max_length=50)
  labels = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors='pt', max_length=50)

  inputs["labels"] = labels["input_ids"]

  return inputs

# Retrieve pad_token_id
pad_index = tokenizer.pad_token_id

# Adjust padding token to end_of_sentence(eos)
tokenizer.pad_token = tokenizer.eos_token

tokenized_datasets = dataset.map(tokenize_input, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['dialogue', 'summary', 'id'])

small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["test"]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

## Get tokenized and padded dataloader

In [None]:
from torch.utils.data import DataLoader
import torch

def get_collate_fn(pad_index):
    def collate_fn(batch):
        # Extract input_ids and labels as lists of tensors
        input_ids = [torch.tensor(example['input_ids'], dtype=torch.long) for example in batch]
        labels = [torch.tensor(example['labels'], dtype=torch.long) for example in batch]

        # Pad input_ids and labels into same length
        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_index)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=pad_index)

        # Return the padded batch as a dictionary
        return {
            'input_ids': input_ids,
            'labels': labels
        }
    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    # Get the collate function with the specified padding index
    collate_fn = get_collate_fn(pad_index)

    # Create and return the DataLoader
    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )
    return data_loader

batch_size = 32
train_dataloader = get_data_loader(small_train_dataset, batch_size, pad_index, shuffle=True)
eval_dataloader = get_data_loader(small_eval_dataset, batch_size, pad_index, shuffle=False)

# Build the Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import tqdm
import random
import numpy as np


class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        assert (src >= 0).all() and (src < input_dim).all(), "Index out of range in src!"
        embedded = self.dropout(self.embedding(src))

        outputs, (hidden, cell) = self.lstm(embedded)

        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # Ensure to decode one token at a time
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Obtain the prediction from linear layer
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0, :]

        # Decide whether use teacher forcing during training (i.e., uses the true target tokens as the next input rather than the predicted ones)
        for t in range(1, trg_length):

            output, hidden, cell = self.decoder(input, hidden, cell)

            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)

            input = trg[t] if teacher_force else top1

        return outputs

    def generate(self, src, max_length=50):
        batch_size = src.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_length, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = torch.zeros(batch_size).long().to(self.device)  # Assuming <sos> token is 0

        for t in range(max_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            top1 = output.argmax(1)

            input = top1

        return outputs

# Train the model

In [None]:
input_dim = 40000
output_dim = 40000
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 64
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(encoder, decoder, device).to(device)

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = optim.Adam(model.parameters())

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(40000, 256)
    (rnn): LSTM(256, 64, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(40000, 256)
    (rnn): LSTM(256, 64, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=64, out_features=40000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

## training loop

In [None]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device, pad_index
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["input_ids"].to(device)
        trg = batch["labels"].to(device)

        optimizer.zero_grad()

        output = model(src, trg, teacher_forcing_ratio)

        # shift and reshape the output for loss
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        # clip gradiaents to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

## model training

In [None]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_dataloader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
        pad_index
    )

    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")


 10%|█         | 1/10 [02:50<25:38, 170.95s/it]

	Train Loss:   4.354 | Train PPL:  77.761


 20%|██        | 2/10 [05:37<22:26, 168.28s/it]

	Train Loss:   3.902 | Train PPL:  49.488


 30%|███       | 3/10 [08:23<19:31, 167.41s/it]

	Train Loss:   3.848 | Train PPL:  46.879


 40%|████      | 4/10 [11:09<16:41, 166.86s/it]

	Train Loss:   3.828 | Train PPL:  45.990


 50%|█████     | 5/10 [13:56<13:54, 166.86s/it]

	Train Loss:   3.802 | Train PPL:  44.768


 60%|██████    | 6/10 [16:44<11:08, 167.17s/it]

	Train Loss:   3.791 | Train PPL:  44.309


 70%|███████   | 7/10 [19:30<08:20, 166.67s/it]

	Train Loss:   3.788 | Train PPL:  44.149


 80%|████████  | 8/10 [22:16<05:33, 166.51s/it]

	Train Loss:   3.778 | Train PPL:  43.728


 90%|█████████ | 9/10 [25:02<02:46, 166.43s/it]

	Train Loss:   3.769 | Train PPL:  43.356


100%|██████████| 10/10 [27:48<00:00, 166.87s/it]

	Train Loss:   3.766 | Train PPL:  43.206





# Evaluation

In [None]:
from rouge_score import rouge_scorer

def evaluate_fn(model, data_loader, tokenizer, device):
  model.eval()
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
  all_scores = []

  with torch.no_grad():
    for batch in data_loader:
      src = batch["input_ids"].to(device)

      generated_outputs = model.generate(src)

      generated_summaries = tokenizer.batch_decode(generated_outputs.argmax(2), skip_special_tokens=True)
      ground_truth_summaries = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

      for generated_summary, ground_truth_summary in zip(generated_summaries, ground_truth_summaries):
          scores = scorer.score(ground_truth_summary, generated_summary)
          all_scores.append(scores)

  avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in all_scores])
  avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in all_scores])
  avg_rougeL = np.mean([score['rougeL'].fmeasure for score in all_scores])

  return avg_rouge1, avg_rouge2, avg_rougeL

In [None]:
avg_rouge1, avg_rouge2, avg_rougeL = evaluate_fn(model, eval_dataloader, tokenizer, device)
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")

Average ROUGE-1: 0.0854
Average ROUGE-2: 0.0002
Average ROUGE-L: 0.0749
