In [3]:
!pip install sentencepiece
!pip install pytorch-lightning
!pip install transformers
# Change runtime to GPU (Google Colab)
  
import random
import json
from pathlib import Path
import os
from typing import Tuple

import pytorch_lightning as pl
import pandas as pd

import torch

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer)

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 23.6 MB/s eta 0:00:01[K     |▌                               | 20 kB 19.0 MB/s eta 0:00:01[K     |▉                               | 30 kB 10.2 MB/s eta 0:00:01[K     |█                               | 40 kB 8.5 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.6 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.4 MB/s eta 0:00:01[K     |██                              | 71 kB 5.5 MB/s eta 0:00:01[K     |██▏                             | 81 kB 5.5 MB/s eta 0:00:01[K     |██▍                             | 92 kB 6.1 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.3 MB/s eta 0:00:01[K     |███                             | 112 kB 5.3 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.3 MB/s eta 0:00:01[K     |███▌       

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 37.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.11.5 transformers-4.16.2


# Task 1

## Data Preparation

In [4]:
def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    The file is expected to have the layout
    ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"question": ...,
    "answers": [...]}]}]}]}``.  One entry is emitted per *answer*, so a
    question with several reference answers contributes several rows and a
    question with no answers contributes none.

    Args:
        path: Location of the JSON file (string or path-like).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        ``answers`` holds the raw answer dicts taken from the file.
    """
    with Path(path).open("rb") as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in payload["data"]:
        for paragraph in article["paragraphs"]:
            passage_text = paragraph["context"]
            for qa_entry in paragraph["qas"]:
                question_text = qa_entry["question"]
                gold_answers = qa_entry["answers"]
                # Repeat the context/question once per reference answer so
                # the three lists stay aligned index by index.
                contexts.extend(passage_text for _ in gold_answers)
                questions.extend(question_text for _ in gold_answers)
                answers.extend(gold_answers)
    return contexts, questions, answers

def add_end_idx(answers, contexts):
    """Add an ``answer_end`` character offset to every answer dict, in place.

    The recorded ``answer_start`` is tried as-is and then shifted left by one
    and by two characters; the first position where the context slice equals
    the answer text wins, and ``answer_start`` is corrected to that position.

    If no probe matches, ``answer_start`` is left untouched and ``answer_end``
    is set to ``answer_start + len(text)`` so that every answer always carries
    the key (callers index ``answer["answer_end"]`` unconditionally).

    Args:
        answers: List of dicts with ``"text"`` and ``"answer_start"`` keys.
            Each dict is mutated.
        contexts: List of context strings, aligned with ``answers``.

    Returns:
        None. ``answers`` is modified in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        for shift in (0, 1, 2):
            lo, hi = start_idx - shift, end_idx - shift
            # A negative `lo` would make the slice wrap around to the end of
            # the context and could "match" at a bogus negative offset.
            if lo >= 0 and context[lo:hi] == gold_text:
                answer['answer_start'] = lo
                answer['answer_end'] = hi
                break
        else:
            # No alignment found: fall back to the nominal span so that the
            # 'answer_end' key is always present.
            answer['answer_end'] = end_idx

def _squad_frame(path) -> pd.DataFrame:
    """Read one SQuAD JSON file and return it as a flat, one-row-per-answer frame."""
    contexts, questions, answers = read_squad(path)
    add_end_idx(answers, contexts)
    # Every answer dict is expected to carry 'answer_end' after add_end_idx.
    return pd.DataFrame({
        "context": contexts,
        "question": questions,
        "answer": [ans["text"] for ans in answers],
        "answer_start": [ans["answer_start"] for ans in answers],
        "answer_end": [ans["answer_end"] for ans in answers],
    })


def get_SQUAD2_data(
    train_path="/content/train-v1.1.json",
    val_path="/content/dev-v1.1.json",
) -> Tuple[pd.DataFrame, pd.DataFrame] :
    """Load the SQuAD train and dev splits as pandas DataFrames.

    Note that, despite the function name, the default paths point at the
    SQuAD **v1.1** JSON files.

    Args:
        train_path: Path of the training JSON file.
        val_path: Path of the validation (dev) JSON file.

    Returns:
        ``(df_trn, df_val)``; each frame has the columns ``context``,
        ``question``, ``answer``, ``answer_start`` and ``answer_end`` with one
        row per reference answer.
    """
    df_trn = _squad_frame(train_path)
    df_val = _squad_frame(val_path)
    return df_trn, df_val

# Load the train/validation frames once at notebook level for the exploration cells below.
df_trn, df_val = get_SQUAD2_data()

In [5]:
# Report (rows, columns) of each split; rows are counted per answer, not per question.
print(df_trn.shape)
print(df_val.shape)

(87599, 5)
(34726, 5)


In [6]:
# Preview the first rows of the training frame.
df_trn.head()

Unnamed: 0,context,question,answer,answer_start,answer_end
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,420
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,126


In [7]:
# Show one full training example (context, question, answer) by positional row index.
p = 500
print("CONTEXT:", df_trn.iloc[p]["context"])
print("QUESTION:", df_trn.iloc[p]["question"])
print("ANSWER:", df_trn.iloc[p]["answer"])

CONTEXT: In 2011, documents obtained by WikiLeaks revealed that Beyoncé was one of many entertainers who performed for the family of Libyan ruler Muammar Gaddafi. Rolling Stone reported that the music industry was urging them to return the money they earned for the concerts; a spokesperson for Beyoncé later confirmed to The Huffington Post that she donated the money to the Clinton Bush Haiti Fund. Later that year she became the first solo female artist to headline the main Pyramid stage at the 2011 Glastonbury Festival in over twenty years, and was named the highest-paid performer in the world per minute.
QUESTION: Who did Beyonce donate the money to earned from her shows?
ANSWER: Clinton Bush Haiti Fund


## Tokenization, Summarization and Translation with T5

In [8]:
# Load the pretrained tokenizer and model used for the exploration cells in this section.
# NOTE(review): this cell loads "t5-base", while the fine-tuning and inference
# cells further down load "t5-small" — confirm the mismatch is intended.
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [9]:
# Display the configuration of the loaded model.
t5.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": tru

In [10]:
# Display the concrete class of the loaded tokenizer.
type(tokenizer)

transformers.models.t5.tokenization_t5.T5Tokenizer

# Task 2

In [1]:
# Mount Google Drive; the inference cell loads its checkpoint from /content/drive/MyDrive/...
# NOTE(review): this cell carries execution count In [1], i.e. it was run before the
# import cell at the top — on a fresh kernel it must still be run before inference.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fine-Tuning T5 on SQuAD 1.1

In [11]:
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
class QADataset(torch.utils.data.Dataset) :
    """Torch dataset that turns (question, context, answer) rows into T5 inputs.

    Each item encodes ``question`` + ``context`` as the source sequence and
    ``answer`` as the target sequence, both padded/truncated to fixed lengths.
    """

    def __init__(
        self,
        data : pd.DataFrame,
        tokenizer : T5Tokenizer,
        source_max_token_len : int = 512,
        target_max_token_len : int = 64
    ) :
        """
        Args:
            data: Frame with ``question``, ``context`` and ``answer`` columns.
            tokenizer: Tokenizer used for both source and target text.
            source_max_token_len: Fixed length of the encoded source.
            target_max_token_len: Fixed length of the encoded target.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self) :
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index : int) :
        """Encode the row at positional ``index``.

        Returns:
            dict with the raw ``question``, ``context`` and ``answer_text``
            strings plus flattened ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        """
        data_row = self.data.iloc[index]

        # Question and context are encoded as a pair; only the context
        # (second segment) is truncated when the pair is too long.
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            data_row["answer"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        labels = target_encoding["input_ids"]
        # Replace padding positions with -100 so the loss ignores them.
        # Use the tokenizer's own pad id instead of a hard-coded 0 (they are
        # the same for T5, whose pad_token_id is 0).
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            question=data_row["question"],
            context=data_row["context"],
            answer_text=data_row["answer"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten()
        )

In [13]:
class QADataModule(pl.LightningDataModule) :
    """Lightning data module wrapping the train and evaluation QA frames.

    The same evaluation frame (``test_df``) backs both the validation and the
    test dataloaders.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 512,
        target_max_token_len: int = 64
    ) :
        """
        Args:
            train_df: Frame used for training.
            test_df: Frame used for validation and testing.
            tokenizer: Tokenizer handed to every ``QADataset``.
            batch_size: Batch size of the training dataloader.
            source_max_token_len: Fixed length of encoded inputs.
            target_max_token_len: Fixed length of encoded targets.
        """
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None) :
        """Build the train and evaluation datasets.

        ``stage`` is accepted (and ignored) because Lightning invokes this
        hook as ``setup(stage=...)`` from ``Trainer.fit``/``test``; without
        the parameter that call raises ``TypeError``.  Calling ``setup()``
        with no argument keeps working.
        """
        self.train_dataset = QADataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )
        self.test_dataset = QADataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self) :
        """Shuffled training loader using ``self.batch_size``."""
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def test_dataloader(self) :
        """Unshuffled loader over the evaluation set, one example per batch."""
        return torch.utils.data.DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )

    def val_dataloader(self) :
        """Validation loader; reuses the evaluation set, one example per batch."""
        return torch.utils.data.DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )

In [14]:
class T5LightningModel(pl.LightningModule) :
    """Lightning wrapper around ``T5ForConditionalGeneration`` for QA fine-tuning.

    Args:
        model_name: Pretrained checkpoint name passed to ``from_pretrained``.
            Defaults to ``"t5-small"``.
        learning_rate: Learning rate handed to the AdamW optimizer.
            Defaults to ``0.001``.
    """

    def __init__(self, model_name: str = "t5-small", learning_rate: float = 0.001) :
        super().__init__()
        # Store the constructor arguments in the checkpoint so that
        # load_from_checkpoint can rebuild a model trained with non-default
        # values; checkpoints without stored hparams fall back to the defaults.
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name,
            return_dict=True
        )

    def forward(
        self,
        input_ids,
        attention_mask,
        labels=None
    ) :
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _shared_step(self, batch, log_name) :
        """Compute the loss for one batch and log it under ``log_name``."""
        loss, _ = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["labels"]
        )
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx) :
        """Training loss for one batch, logged as ``train_loss``."""
        return self._shared_step(batch, "train_loss")

    def test_step(self, batch, batch_idx) :
        """Test loss for one batch, logged as ``test_loss``."""
        return self._shared_step(batch, "test_loss")

    def validation_step(self, batch, batch_idx) :
        """Validation loss for one batch, logged as ``val_loss``."""
        return self._shared_step(batch, "val_loss")

    def configure_optimizers(self) :
        """Return an AdamW optimizer over all parameters."""
        return AdamW(self.parameters(), lr=self.learning_rate)

In [15]:
def trainer(
    NUM_EPOCHS : int = 1,
    NUM_BATCHES : int = 16,
    NUM_GPUS : int = 1
) -> None :
    """Fine-tune ``T5LightningModel`` on the SQuAD frames and checkpoint the best epoch.

    NOTE(review): a later cell defines ``trainer`` again with different
    defaults; that later definition shadows this one when both cells run.

    Args:
        NUM_EPOCHS: Maximum number of training epochs.
        NUM_BATCHES: Training batch size (passed as ``batch_size``).
        NUM_GPUS: Number of GPUs handed to the Lightning ``Trainer``.

    Side effects:
        Creates ``./models`` and writes the checkpoint with the lowest
        ``val_loss`` there; writes TensorBoard logs under ``tb_logs``.
    """
    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    trn_df, val_df = get_SQUAD2_data()

    data_module = QADataModule(
        trn_df,
        val_df,
        tokenizer,
        batch_size=NUM_BATCHES,
        source_max_token_len=512,
        target_max_token_len=64
    )
    data_module.setup()

    model = T5LightningModel()

    checkpoints_path = "./models"
    os.makedirs(checkpoints_path, exist_ok=True)

    # NOTE(review): the file name says "squad_v2" although the data files
    # loaded above are v1.1 — confirm the intended name.
    model_name = "t5-small-squad_v2-best"
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=checkpoints_path,
        filename=model_name,
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )

    logger = pl.loggers.TensorBoardLogger("tb_logs", name=model_name)

    # Named `pl_trainer` so the local does not shadow this function's name.
    # ModelCheckpoint instances must be passed through `callbacks=`
    # (`checkpoint_callback=<instance>` is rejected by Lightning >= 1.3), and
    # `distributed_backend=` was replaced by `strategy=` (Lightning >= 1.5).
    pl_trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        max_epochs=NUM_EPOCHS,
        gpus=NUM_GPUS,
        progress_bar_refresh_rate=30,
        precision=32,
        strategy="dp",
        logger=logger,
    )

    pl_trainer.fit(model, data_module)
    return None

In [16]:
def trainer(
    NUM_EPOCHS : int = 3,
    NUM_BATCHES : int = 20,
    NUM_GPUS : int = 2
) -> None :
    """Fine-tune ``T5LightningModel`` on the SQuAD frames and checkpoint the best epoch.

    NOTE(review): this is the second definition of ``trainer`` in the
    notebook; it differs from the earlier one only in its defaults and
    replaces it once this cell has run.

    Args:
        NUM_EPOCHS: Maximum number of training epochs.
        NUM_BATCHES: Training batch size (passed as ``batch_size``).
        NUM_GPUS: Number of GPUs handed to the Lightning ``Trainer``.

    Side effects:
        Creates ``./models`` and writes the checkpoint with the lowest
        ``val_loss`` there; writes TensorBoard logs under ``tb_logs``.
    """
    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    trn_df, val_df = get_SQUAD2_data()

    data_module = QADataModule(
        trn_df,
        val_df,
        tokenizer,
        batch_size=NUM_BATCHES,
        source_max_token_len=512,
        target_max_token_len=64
    )
    data_module.setup()

    model = T5LightningModel()

    checkpoints_path = "./models"
    os.makedirs(checkpoints_path, exist_ok=True)

    # NOTE(review): the file name says "squad_v2" although the data files
    # loaded above are v1.1 — confirm the intended name.
    model_name = "t5-small-squad_v2-best"
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=checkpoints_path,
        filename=model_name,
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )

    logger = pl.loggers.TensorBoardLogger("tb_logs", name=model_name)

    # Named `pl_trainer` so the local does not shadow this function's name.
    # ModelCheckpoint instances must be passed through `callbacks=`
    # (`checkpoint_callback=<instance>` is rejected by Lightning >= 1.3), and
    # `distributed_backend=` was replaced by `strategy=` (Lightning >= 1.5).
    pl_trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        max_epochs=NUM_EPOCHS,
        gpus=NUM_GPUS,
        progress_bar_refresh_rate=30,
        precision=32,
        strategy="dp",
        logger=logger,
    )

    pl_trainer.fit(model, data_module)
    return None

## Evaluating the trained model

In [17]:
def inference(
    N : int,
    checkpoint_path : str = "/content/drive/MyDrive/ML/t5-small-squad_v1-best.ckpt"
) -> None :
    """Print model answers for ``N`` randomly drawn validation examples.

    Loads a fine-tuned ``T5LightningModel`` checkpoint, moves it to the
    module-level ``device``, and for each sampled row prints the question,
    context, reference answer and the beam-search prediction.

    Args:
        N: Number of validation examples to sample (with replacement).
        checkpoint_path: Lightning checkpoint to load. Defaults to the path
            on the mounted Google Drive used by this notebook.
    """
    trained_model = T5LightningModel.load_from_checkpoint(checkpoint_path)
    trained_model.freeze()
    trained_model.to(device)

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    if N <= 0:
        return None

    # The validation frame does not change between samples, so read and
    # parse the JSON files once instead of once per iteration.
    _, df_val = get_SQUAD2_data()
    row_positions = range(df_val.shape[0])

    for _ in range(N) :
        # random.choice on a range picks an index without materialising a list.
        r = random.choice(row_positions)
        question = df_val.iloc[r]

        source_encoding = tokenizer(
            question["question"],
            question["context"],
            max_length=512,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        generated_ids = trained_model.model.generate(
            input_ids=source_encoding["input_ids"].to(device),
            attention_mask=source_encoding["attention_mask"].to(device),
            num_beams=5,
            max_length=64,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            use_cache=True
        )

        preds = [
            tokenizer.decode(
                generated_id,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            for generated_id in generated_ids
        ]

        print("QUESTION:", question["question"])
        print("CONTEXT:", question["context"])
        print("ANSWER:", question["answer"])
        print("PREDICTED ANSWER :", "".join(preds))
        print("~~~~~~~")
    return None

In [19]:
# Print predictions for 10 randomly chosen validation examples.
inference(10)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

QUESTION: Who is the current President and the High Representative for Foreign and Security Policy?
CONTEXT: The European Commission is the main executive body of the European Union. Article 17(1) of the Treaty on European Union states the Commission should "promote the general interest of the Union" while Article 17(3) adds that Commissioners should be "completely independent" and not "take instructions from any Government". Under article 17(2), "Union legislative acts may only be adopted on the basis of a Commission proposal, except where the Treaties provide otherwise." This means that the Commission has a monopoly on initiating the legislative procedure, although the Council is the "de facto catalyst of many legislative initiatives". The Parliament can also formally request the Commission to submit a legislative proposal but the Commission can reject such a suggestion, giving reasons. The Commission's President (currently an ex-Luxembourg Prime Minister, Jean-Claude Juncker) sets t