In [26]:
from collections import defaultdict
import statistics
import pandas as pd
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl

In [2]:
# Load the pretrained "t5-small" tokenizer and conditional-generation model
# used by the toy translation sanity check in the following cells.
# NOTE: the name `model` is rebound later in this notebook to a FinetuneT5 instance.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [3]:
# Values passed as `max_length` (with truncation=True) when tokenizing the
# source sequences and the target sequences respectively.
max_source_length = 512
max_target_length = 128

In [4]:
# Define two toy English -> French training pairs (source sentence and its target).
input_sequence_1 = "Welcome to NYC"
output_sequence_1 = "Bienvenue à NYC"

input_sequence_2 = "HuggingFace is a company"
output_sequence_2 = "HuggingFace est une entreprise"

In [5]:
# Task prefix that is prepended to every source sentence before tokenization.
task_prefix = "translate English to French: "
# Source sentences of the toy batch, in the same order as the target sentences.
input_sequences = [input_sequence_1, input_sequence_2]

In [6]:
# Tokenize the prefixed source sentences as one batch: pad to the longest
# sequence in the batch, truncate at max_source_length, return PyTorch tensors.
encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

In [7]:
# Pull the token ids and the matching attention mask out of the batch encoding.
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

In [8]:
# Inspect the padded token ids and the attention mask of the toy batch.
print(input_ids)
print(attention_mask)

tensor([[13959,  1566,    12,  2379,    10,  5242,    12, 13465,     1,     0,
             0,     0,     0,     0],
        [13959,  1566,    12,  2379,    10, 11560,  3896,   371,  3302,    19,
             3,     9,   349,     1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
# Tokenize the target sentences the same way as the sources, but truncated
# at max_target_length.
target_encoding = tokenizer(
    [output_sequence_1, output_sequence_2],
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)

# The target token ids serve as the labels passed to the model.
labels = target_encoding.input_ids

In [10]:
# Inspect the padded label ids (before the padding ids are replaced by -100).
print(labels)

tensor([[10520, 15098,     3,    85, 13465,     1,     0,     0],
        [11560,  3896,   371,  3302,   259,   245, 11089,     1]])


In [11]:
print(tokenizer.pad_token_id) # id the tokenizer uses for padding positions

0


In [12]:
# Overwrite (in place) every padding id in the labels with -100 so that padded
# positions are excluded from the loss.
labels[labels == tokenizer.pad_token_id] = -100

In [13]:
# Forward pass with labels; the model output exposes the loss as `.loss`.
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
# Display the loss as a plain Python number.
loss.item()

0.18801377713680267

In [14]:
# load dataset
def get_parallel_corpus(ip_df, story_df):
    """Build aligned story / answer / question lists from a QA table.

    Args:
        ip_df: DataFrame with columns 'source_title', 'cor_section' (a
            comma-separated string of section numbers), 'answer' and 'question'.
        story_df: DataFrame with columns 'source_title', 'cor_section' and
            'text', one row per story section.

    Returns:
        A tuple ``(story, answer, question)`` of three lists with one entry per
        row of ``ip_df``. Each story entry is the concatenation (without any
        separator) of the texts of the sections listed in that row.

    Raises:
        KeyError: if a referenced title/section pair is not present in ``story_df``.
    """
    # Index the section texts as {source_title: {cor_section: text}}.
    sections_by_title = defaultdict(dict)
    for _, section_row in story_df.iterrows():
        sections_by_title[section_row['source_title']][section_row['cor_section']] = section_row['text']

    story, answer, question = [], [], []
    for _, qa_row in ip_df.iterrows():
        section_ids = qa_row['cor_section'].split(',')
        # Section numbers arrive as strings and are cast to int for the lookup.
        story.append(''.join(
            sections_by_title[qa_row['source_title']][int(section_id)]
            for section_id in section_ids
        ))
        answer.append(qa_row['answer'])
        question.append(qa_row['question'])

    return story, answer, question

In [15]:
# Source texts: one row per story section (consumed by get_parallel_corpus).
story_file = '../../data/original/source_texts.csv'
story_df = pd.read_csv(story_file)
# Train-Val split: the QA rows of the training and validation sets.
train_file = '../../data/train_val_split_csv/train.csv'
train_df = pd.read_csv(train_file)
val_file = '../../data/train_val_split_csv/val.csv'
val_df = pd.read_csv(val_file)

# Aligned story / answer / question lists for each split.
train_story, train_answer, train_question = get_parallel_corpus(train_df, story_df)
val_story, val_answer, val_question = get_parallel_corpus(val_df, story_df)

In [16]:
def get_stats(story, answer, question):
    """Print the mean length (in characters) of the stories, answers and questions.

    Args:
        story: sequence of story strings.
        answer: sequence of answer strings.
        question: sequence of question strings.

    Raises:
        statistics.StatisticsError: if any of the sequences is empty.
    """
    report = (
        ('Average story length:', story),
        ('Average answer length:', answer),
        ('Average question length:', question),
    )
    for caption, texts in report:
        print(caption, statistics.mean([len(text) for text in texts]))

In [17]:
# Print the average character lengths for the training and validation sets.
print('Train Set')
get_stats(train_story, train_answer, train_question)

print('Valid Set')
get_stats(val_story, val_answer, val_question)

Train Set
Average story length: 1015.8474604496253
Average answer length: 36.77901748542881
Average question length: 51.96319733555371
Valid Set
Average story length: 987.5924796747968
Average answer length: 33.672764227642276
Average question length: 48.9369918699187


In [18]:
# Construct T5 input
def construct_t5_input(story, answer):
    """Build one T5 prompt string per (story, answer) pair.

    Args:
        story: iterable of story strings.
        answer: iterable of answer strings, aligned with ``story``.

    Returns:
        A list of prompts of the form
        ``<prefix> The story is <story> The answer is <answer>``; pairing stops
        at the shorter of the two inputs.
    """
    prefix = 'Generate question from story and answer: '
    return [
        prefix + ' The story is ' + story_text + ' The answer is ' + answer_text
        for story_text, answer_text in zip(story, answer)
    ]

In [19]:
# Build the model input prompts (story + answer) for both splits.
train_inps = construct_t5_input(train_story, train_answer)
val_inps = construct_t5_input(val_story, val_answer)

In [20]:
def get_t5_encoding(t5_inputs, answer, max_source_length=512, max_target_length=128, tokenizer=None):
    """Tokenize model inputs and target texts into tensors for T5 training.

    Args:
        t5_inputs: list of input prompt strings.
        answer: list of target strings, aligned with ``t5_inputs``. The name is
            kept for backward compatibility; whatever is passed here becomes
            the labels the model is trained to generate.
        max_source_length: truncation length for the inputs (default 512).
        max_target_length: truncation length for the targets (default 128).
        tokenizer: optional tokenizer to reuse. When None, the "t5-small"
            tokenizer is loaded, which matches the previous behaviour.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of PyTorch tensors.
        Inputs and targets are each padded to the longest sequence of their
        own batch; padding positions in ``labels`` are set to -100.
    """
    if tokenizer is None:
        tokenizer = T5Tokenizer.from_pretrained("t5-small")

    inp_encoding = tokenizer(
        t5_inputs,
        padding='longest',
        max_length=max_source_length,
        truncation=True,
        return_tensors="pt",
    )
    input_ids, attention_mask = inp_encoding.input_ids, inp_encoding.attention_mask

    target_encoding = tokenizer(
        answer,
        padding='longest',
        max_length=max_target_length,
        truncation=True,
        return_tensors="pt",
    )
    labels = target_encoding.input_ids

    # Padding positions must not contribute to the loss: mark them with -100.
    labels[labels == tokenizer.pad_token_id] = -100

    return input_ids, attention_mask, labels

In [21]:
# The task is question generation: the prompt already contains the story and
# the answer, so the targets (labels) must be the questions, not the answers.
train_input_ids, train_attention_mask, train_labels = get_t5_encoding(train_inps, train_question)
val_input_ids, val_attention_mask, val_labels = get_t5_encoding(val_inps, val_question)

In [34]:
class FairyDataset(Dataset):
    """Dataset that serves pre-tokenized examples as keyword-ready dicts.

    Each item is a dict with the keys 'input_ids', 'attention_mask' and
    'labels', taken from the same index of the three containers given at
    construction time.
    """

    def __init__(self, input_ids, attn_masks, labels):
        """Store the three index-aligned containers without copying them."""
        self.input_ids, self.attn_masks, self.labels = input_ids, attn_masks, labels

    def __len__(self):
        """Return the number of examples, measured on ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the example at ``index`` as a dict of model inputs."""
        return {
            'input_ids': self.input_ids[index],
            'attention_mask': self.attn_masks[index],
            'labels': self.labels[index],
        }

In [35]:
# Wrap the encoded tensors of each split in a FairyDataset.
train_dataset = FairyDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = FairyDataset(val_input_ids, val_attention_mask, val_labels)

In [36]:
def get_dataloader(batch_size, dataset, shuffle=True):
    """Create a DataLoader over ``dataset``.

    Args:
        batch_size: number of examples per batch.
        dataset: the dataset to iterate over.
        shuffle: whether to reshuffle the data every epoch. Defaults to True
            (the previous hard-coded behaviour); pass False for evaluation
            data, where a fixed order is preferable.

    Returns:
        A ``torch.utils.data.DataLoader`` for ``dataset``.
    """
    return DataLoader(dataset=dataset, shuffle=shuffle, batch_size=batch_size)

In [38]:
batch_size = 8
# Training batches are shuffled (get_dataloader always shuffles by default).
train_dataloader = get_dataloader(batch_size, train_dataset)
# Validation data is iterated in a fixed order: shuffling brings no benefit for
# evaluation and Lightning warns about shuffled validation dataloaders.
valid_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)

# T5 Model with PyTorch Lightning

In [39]:
class FinetuneT5(pl.LightningModule):
    """LightningModule that fine-tunes a pretrained "t5-small" model.

    The constructor arguments are stored through ``save_hyperparameters`` and
    read back from ``self.hparams`` in ``configure_optimizers``. The train and
    validation dataloaders are the notebook-level ``train_dataloader`` and
    ``valid_dataloader`` objects, which must exist before training starts.

    Args:
        lr: learning rate handed to AdamW.
        num_train_epochs: number of epochs used to size the LR schedule.
        warmup_steps: number of warmup steps of the linear schedule.
    """

    def __init__(self, lr=5e-5, num_train_epochs=5, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
        # Makes lr / num_train_epochs / warmup_steps available as self.hparams.*
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped T5 model and return its output object."""
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def common_step(self, batch, batch_idx):
        """Return the model loss for one batch (a dict of forward() kwargs)."""
        return self(**batch).loss

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log it as "training_loss"."""
        train_loss = self.common_step(batch, batch_idx)
        self.log("training_loss", train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it as "validation_loss"."""
        val_loss = self.common_step(batch, batch_idx)
        # on_epoch=True requests epoch-level logging of this metric.
        self.log("validation_loss", val_loss, on_epoch=True)
        return val_loss

    def test_step(self, batch, batch_idx):
        """Compute and return the test loss (nothing is logged here)."""
        return self.common_step(batch, batch_idx)

    def configure_optimizers(self):
        """Create the AdamW optimizer and a per-step linear warmup schedule."""
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        # `train_dataloader` here is the notebook-level DataLoader (not the
        # method below): total steps = epochs * batches per epoch.
        total_steps = self.hparams.num_train_epochs * len(train_dataloader)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=total_steps,
        )
        scheduler_config = {
            'scheduler': scheduler,
            'name': 'learning_rate',
            'interval': 'step',
            'frequency': 1,
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return valid_dataloader


In [41]:
import wandb, os

# Set the notebook name in the environment before logging in
# (presumably read by wandb to label the run's notebook — confirm in W&B docs).
os.environ['WANDB_NOTEBOOK_NAME'] = 'FinetuneT5'

# Log in to Weights & Biases; the call's return value is shown as the cell output.
wandb.login()

True

In [42]:
# Instantiate the Lightning module with its default hyperparameters.
# NOTE: this rebinds `model`, which earlier held the raw T5ForConditionalGeneration.
model = FinetuneT5()

In [None]:
# Training code
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# Send metrics to the Weights & Biases project 'Quest_Gen_Challenge'.
wandb_logger = WandbLogger(name='FinetuneT5', project='Quest_Gen_Challenge')

# Stop when 'validation_loss' has not improved for 3 validation checks.
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)

# Record the learning rate at every optimizer step.
lr_monitor = LearningRateMonitor(logging_interval='step')


# max_epochs is tied to the module's num_train_epochs: the linear LR schedule
# is sized for exactly that many epochs and reaches 0 at its end, so training
# past that point would run with a zero learning rate.
trainer = Trainer(gpus=0,
                  max_epochs=model.hparams.num_train_epochs,
                  default_root_dir="./Checkpoints",
                  logger=wandb_logger,
                  callbacks=[early_stop_callback, lr_monitor])


trainer.fit(model)

In [None]:
save_directory = './Checkpoints'
# `model` is the FinetuneT5 LightningModule, which has no save_pretrained();
# the Hugging Face model that provides it is stored on its `.model` attribute.
model.model.save_pretrained(save_directory)