In [15]:
!pip install --quiet transformers==4.1.1
!pip install --quiet pytorch-lightning=1.1.2
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94

In [2]:
import argparse
import glob
import os
import logging
import json
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from torch import nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torch.utils.data import Dataset,DataLoader, random_split
from torchvision import transforms
import pytorch_lightning as pl

from transformers import(
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [3]:
pl.seed_everything(3)

Global seed set to 3


3

In [4]:
question = {'context': 'In 1799 The Manhattan Company is founded. The Manhattan Company, JPMorgan Chase earliest predecessor institution, is chartered by the New York State legislature to supply "pure and wholesome" drinking water to the city\'s growing population.  Among its founders are Alexander Hamilton and Aaron Burr. A provision in the charter allows The Manhattan Company to use its surplus capital for banking operations.  Within five months, The Bank of The Manhattan Company opens for business, becoming the second commercial bank in New York City after Hamilton’s Bank of New York.  With his banking monopoly broken, Hamilton severs his association with the water company',
            'question': 'When was The Manhattan Company founded ? ',
            'answer':'1799'}

In [5]:
datarows = []
datarows.append({
    "question": question['question'],
    "context" : question['context'],
    "answer_text": question['answer']
})
datarows.append({
    "question": 'Who founded The Manhattan Company ?',
    "context" : question['context'],
    "answer_text": 'Alexander Hamilton and Aaron Burr'
})
datarows.append({
    "question": 'What happened in 1799 ?',
    "context" : question['context'],
    "answer_text": question['context']
})
df = pd.DataFrame(datarows)
df

Unnamed: 0,question,context,answer_text
0,When was The Manhattan Company founded ?,In 1799 The Manhattan Company is founded. The ...,1799
1,Who founded The Manhattan Company ?,In 1799 The Manhattan Company is founded. The ...,Alexander Hamilton and Aaron Burr
2,What happened in 1799 ?,In 1799 The Manhattan Company is founded. The ...,In 1799 The Manhattan Company is founded. The ...


In [6]:
m_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(m_name)

In [7]:
# sample_question = df.iloc[0]

In [8]:
# encoding = tokenizer(
#     sample_question['question'],
#     sample_question['context'],
#     max_length=396,
#     padding="max_length",
#     truncation="only_second",
#     return_attention_mask=True,
#     add_special_tokens=True,
#     return_tensors="pt"
# )

In [9]:
# answer_encoding = tokenizer( 
#     sample_question['answer_text'],
#     max_length=32,
#     padding="max_length",
#     truncation=True,
#     return_attention_mask=True,
#     add_special_tokens=True,
#     return_tensors="pt"
# )

In [10]:
# model = T5ForConditionalGeneration.from_pretrained(m_name, return_dict=True)

In [11]:
# output = model(
#     input_ids=encoding["input_ids"],
#     attention_mask=encoding["attention_mask"],
#     labels = answer_encoding["input_ids"]
# )

In [12]:
# output.logits.shape, output.loss

In [13]:
class QADataset(Dataset):
  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: T5Tokenizer,
      source_max_token_length: int = 396,
      target_max_token_length: int = 32
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_length = source_max_token_length
    self.target_max_token_length = target_max_token_length

  def __len__(self):
    return len(self.data)

  def __getitem__(self,index: int):
    data_row = self.data.iloc[index]
  
    source_encoding = tokenizer(
      data_row['question'],
      data_row['context'],
      max_length=self.source_max_token_length,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
    )

    target_encoding = tokenizer(
      data_row['answer_text'],
      max_length=self.target_max_token_length,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
    )

    labels = target_encoding["input_ids"]
    labels[labels == 0] = -100

    return dict(
        question=data_row["question"],
        context=data_row["context"],
        answer_text=data_row["answer_text"],
        input_ids=source_encoding["input_ids"].flatten(),
        labels=labels.flatten()
    )


In [14]:
class QADataModel(pl.LightningDataModule):

  def __init__(
      self,
      train_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int=8,
      source_max_token_length = 396,
      target_max_token_length = 32
  ):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.train_dataset = None
    self.tokenizer = tokenizer
    self.source_max_token_length = source_max_token_length
    self.target_max_token_length = target_max_token_length

  def setup(self):
    self.train_dataset = QADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_length,
        self.target_max_token_length
    )
  
  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=2
    )
    

In [15]:
BATCH_SIZE = 1
EPOCHS = 5
data_module = QADataModel(df,tokenizer,batch_size=BATCH_SIZE)
data_module.setup()

In [16]:
class QAModel(pl.LightningModule):

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(m_name,return_dict = True)
  
  def forward(self, input_ids, attention_mask, labels=None):
    output = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    return output.loss, output.logits
  
  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    # attention_mask = batch['attention_mask']
    labels = batch["labels"]
    loss, outputs = self(input_ids,None,labels)
    self.log("val_loss",loss,prog_bar=True,logger=True)
    return loss
  
  def configure_optimizers(self):
    return AdamW(self.parameters(),lr=0.001)


In [17]:
model = QAModel()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode="min"
)



In [19]:
trainer = Trainer(
    checkpoint_callback=checkpoint_callback,
    max_epochs=EPOCHS,
    progress_bar_refresh_rate=30
)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores


In [20]:
# %load_ext tensorboard

In [21]:
# %tensorboard --logdir ./lightning_logs

In [22]:
!rm -r lightning_logs

In [36]:
trainer.fit(model, data_module)


  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Epoch 0, global step 2: val_loss reached 4.76596 (best 4.76596), saving model to "/content/checkpoints/best-checkpoint-v1.ckpt" as top 1
Epoch 1, global step 5: val_loss reached 2.99939 (best 2.99939), saving model to "/content/checkpoints/best-checkpoint-v1.ckpt" as top 1
Epoch 2, global step 8: val_loss reached 1.13788 (best 1.13788), saving model to "/content/checkpoints/best-checkpoint-v1.ckpt" as top 1
Epoch 3, global step 11: val_loss reached 0.87239 (best 0.87239), saving model to "/content/checkpoints/best-checkpoint-v1.ckpt" as top 1
Epoch 4, global step 14: val_loss reached 0.49230 (best 0.49230), saving model to "/content/checkpoints/best-checkpoint-v1.ckpt" as top 1





1

In [23]:
ls checkpoints

best-checkpoint.ckpt  best-checkpoint-v1.ckpt


In [24]:
trained_model = QAModel.load_from_checkpoint("checkpoints/best-checkpoint-v1.ckpt")
trained_model.freeze()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
def generate_answer(question):
  source_encoding = tokenizer(
      question["question"],
      question["context"],
      max_length=396,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  generated_ids = trained_model.model.generate(
      input_ids=source_encoding["input_ids"],
      attention_mask=source_encoding["attention_mask"],
      num_beams=3,
      max_length=80,
      repetition_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )

  preds = [
           tokenizer.decode(generated_id,skip_special_tokens=True,clean_up_tokenization_spaces=True)
           for generated_id in generated_ids
  ]

  return "".join(preds)

In [73]:
datarows = []
datarows.append({
    "question": 'when was The Manhattan Company founded ?',
    "context" : question['context']
})
datarows.append({
    "question": 'What happened in 1799 ?',
    "context" : question['context']
})
datarows.append({
    "question": 'where did Chase open its first environmentally friendly branch ?',
    "context" : 'In 2007, Chase opens its first environmentally friendly branch in Denver.'
})
datarows.append({
    "question": 'When did Chase open its first environmentally friendly branch ?',
    "context" : 'In 2007, Chase opens its first environmentally friendly branch in Denver.'
})
datarows.append({
    "question": 'What happened in the year 1998 ?',
    "context" : 'In the year 1998, Banc One merges with First Chicago NBD, The new firm, retaining the name Bank One Corporation, chooses Chicago as its headquarters and becomes the fourth largest bank in the U.S. and the world\'s largest Visa credit card issuer.'
})
datarows.append({
    "question": 'When did Banc One merge with First Chicago NBD ?',
    "context" : 'In the year 1998, Banc One merges with First Chicago NBD, The new firm, retaining the name Bank One Corporation, chooses Chicago as its headquarters and becomes the fourth largest bank in the U.S. and the world\'s largest Visa credit card issuer.'
})
datarows.append({
    "question": 'Where is the headquaters located ?',
    "context" : 'In the year 1998, Banc One merges with First Chicago NBD, The new firm, retaining the name Bank One Corporation, chooses Chicago as its headquarters and becomes the fourth largest bank in the U.S. and the world\'s largest Visa credit card issuer.'
})
v_df = pd.DataFrame(datarows)

In [74]:
sample = v_df.iloc[1]
sample

question                              What happened in 1799 ?
context     In 1799 The Manhattan Company is founded. The ...
Name: 1, dtype: object

In [75]:
generate_answer(sample)

'1799'