In [23]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '6'
import importlib
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

### Data Loader

In [24]:
from datasets import load_dataset
# from torch.utils.data import Dataset, DataLoader

In [15]:
dataset = load_dataset("McGill-NLP/FaithDial")
dataset['train']

No config specified, defaulting to: faith_dial/plain_text
Found cached dataset faith_dial (/home/csgrad/jayashok/.cache/huggingface/datasets/McGill-NLP___faith_dial/plain_text/1.0.0/70568c8ab3bbc83b603bce58fa593ab27e7f0d0cde51034e1c2073ff3e14189a)
100%|██████████| 7/7 [00:00<00:00, 862.82it/s]


Dataset({
    features: ['dialog_idx', 'response', 'original_response', 'history', 'knowledge', 'BEGIN', 'VRM'],
    num_rows: 18357
})

In [2]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

#### Using Custom Pipeline for custom training loop

In [5]:
from torch.utils.data import DataLoader

In [4]:
import CustomDataset
from CustomDataset import Dataset

In [8]:
train_set = Dataset(dataset['train'], tokenizer, CustomDataset.DatasetMap.faithdial)

100%|██████████| 18357/18357 [00:00<00:00, 20474.52it/s]


In [9]:
validation_set = Dataset(dataset['validation'], tokenizer, CustomDataset.DatasetMap.faithdial)

100%|██████████| 3417/3417 [00:00<00:00, 20166.91it/s]


In [10]:
import random
print(random.choice(train_set))
print(random.choice(validation_set))

('The piano is an acoustic, stringed musical instrument invented in Italy by Bartolomeo Cristofori around the year 1700 (the exact year is uncertain), in which the strings are struck by hammers.', 'I play the piano.  Leaning more about it would be great.', 'The piano was invented in Italy by Bartolomeo Cristofori around the year 1700.')
('Comic-Con International also produces two other conventions, WonderCon, held in Anaheim, and the Alternative Press Expo (APE), held in San Francisco.', "I guess I'll have to see if I get lucky. Does Comic-Con have any other conventions?", "In fact it does, there is WonderCon and APE or Alternative Press Expo. They're held at Anaheim and San Francisco respectively.")


In [11]:
BATCH_SIZE = 64

In [12]:
my_trainset_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE,
                                    num_workers=16, collate_fn=lambda data: train_set.pack_minibatch(data))
my_validation_dataloader = DataLoader(validation_set, batch_size=BATCH_SIZE,
                                        num_workers=16, collate_fn=lambda data: validation_set.pack_minibatch(data))

### Model Initialization

In [5]:
# Initialize the T5 model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Resize the model's embeddings to accommodate the new tokens (No New Tokens used yet)
# model.resize_token_embeddings(len(tokenizer))

### Training

In [25]:
import torch
from tqdm import tqdm

In [26]:
DEVICE = 'cuda:3'
NUM_TRAIN_EPOCHS = 15
MAX_INPUT_LENGTH = 256

In [7]:
model.train()
model.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
for epoch in range(NUM_TRAIN_EPOCHS):
    epoch_train_loss = 0.
    ### Training Piece
    for contexts,questions,answers in tqdm(my_trainset_dataloader):
        optimizer.zero_grad()

        inputs = list(map(lambda tuple: f"question: {tuple[0]}  context: {tuple[1]}", zip(questions,contexts)))
        encoded_inputs = tokenizer(
                                inputs,
                                padding="longest",
                                max_length=MAX_INPUT_LENGTH,
                                truncation=True,
                                return_tensors="pt",
                            )
        encoded_targets = tokenizer(
                                answers,
                                padding="longest",
                                max_length=MAX_INPUT_LENGTH,
                                truncation=True,
                                return_tensors="pt",
                            )

        input_ids, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
        encoded_targets = encoded_targets.input_ids

        # replace padding target token id's of the labels by -100, crossEntropy skip target label == -100
        encoded_targets[encoded_targets == tokenizer.pad_token_id] = -100

        input_ids = input_ids.to(DEVICE)
        encoded_targets = encoded_targets.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=encoded_targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item() * BATCH_SIZE
    print(f"epoch={epoch + 1}/{NUM_TRAIN_EPOCHS}")
    print(f"\t Train loss = {epoch_train_loss/len(train_set):.4f}")

    ## Validation Piece
    model.eval()
    epoch_val_loss = 0.
    with torch.no_grad():
        # model_predictions_encoded = []
        # target_encoded = []
        for contexts, questions, answers in tqdm(my_validation_dataloader):
            inputs = list(map(lambda tuple: f"question: {tuple[0]}  context: {tuple[1]}", zip(questions,contexts)))
            encoded_inputs = tokenizer(
                inputs,
                padding="longest",
                max_length=MAX_INPUT_LENGTH,
                truncation=True,
                return_tensors="pt",
            )
            encoded_targets = tokenizer(
                answers,
                padding="longest",
                max_length=MAX_INPUT_LENGTH,
                truncation=True,
                return_tensors="pt",
            )
            encoded_inputs, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
            encoded_targets = encoded_targets.input_ids

            encoded_inputs = encoded_inputs.to(DEVICE)
            encoded_targets = encoded_targets.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            outputs = model(input_ids=encoded_inputs, attention_mask=attention_mask, labels=encoded_targets)
            loss = outputs.loss
            epoch_val_loss += loss.item() * BATCH_SIZE
    print(f"\t Val loss = {epoch_val_loss/len(validation_set):.4f}")
    model.train()

In [20]:
model.save_pretrained("t5_finetuned_faithdial_qa_ep15_seqlen256")

### Inference

In [27]:
model.from_pretrained("t5_finetuned_faithdial_qa_ep15_seqlen256")
model.to(DEVICE)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [32]:
test_sample = dataset['test'][40]

In [33]:
test_sample

{'dialog_idx': 9,
 'response': "I'm unable to solve that but you may be curious to know that it has 3000 years of history.",
 'original_response': 'Best medicine is to just take up a hobby and get involved. But the concept of heartbreak dates back at least 3,000 years.',
 'history': ['My ex girlfriend broke my heart. What are some ways to deal with heartbreak?'],
 'knowledge': 'The concept is cross-cultural, often cited with reference to a desired or lost lover, and dates back at least 3,000 years.',
 'BEGIN': ['Hallucination', 'Entailment'],
 'VRM': ['Disclosure', 'Advisement']}

In [34]:
def infer(model, prompt, knowledge, max_input_length, max_output_length, device):
    # question = "What is 42?"
    # context = "42 is the answer to life, the universe and everything"
    input = f"question: {prompt} context: {knowledge}"
    encoded_input = tokenizer([input],
                                return_tensors='pt',
                                max_length=max_input_length,
                                truncation=True)
    inp = encoded_input.input_ids.to(device)
    attn_mask = encoded_input.attention_mask.to(device)
    output = model.generate(input_ids = inp,
                                attention_mask = attn_mask, max_length=max_output_length)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    # print(output)
    return output

In [36]:
resp = infer(model, prompt=test_sample['history'][-1], knowledge=test_sample['knowledge'],
             max_input_length=MAX_INPUT_LENGTH, max_output_length=100, device=DEVICE)

In [38]:
# epoch 10 seqlen 256
print('p:', test_sample['history'][-1])
print('k:', test_sample['knowledge'])
print('res:', resp)
print('org:', test_sample['original_response'])
print('gt:', test_sample['response'])

p: My ex girlfriend broke my heart. What are some ways to deal with heartbreak?
k: The concept is cross-cultural, often cited with reference to a desired or lost lover, and dates back at least 3,000 years.
res: I'm not sure, but the concept of heartbreak is cross-cultural, often cited with reference to a desired or lost lover.
org: Best medicine is to just take up a hobby and get involved. But the concept of heartbreak dates back at least 3,000 years.
gt: I'm unable to solve that but you may be curious to know that it has 3000 years of history.


In [27]:
# epoch 10 seqlen 256
print('p:', test_sample['history'][-1])
print('k:', test_sample['knowledge'])
print('res:', res)
print('org:', test_sample['original_response'])
print('gt:', test_sample['response'])

p: Yes it is. It is also the color of Emeralds, and sometimes the color of camouflage.
k: It is evoked by light which has a dominant wavelength of roughly 495570 nm.
res: Oh, I see. It's evoked by light, which has a dominant wavelength of 495570 nm.
org: I have read it has a dominant wavelength of roughly 495570 nm.
gt: The dominant wavelength of green is around 495570 nm.


In [37]:
# Epoch 10 seqlen 512
print('p:', test_sample['history'][-1])
print('k:', test_sample['knowledge'])
print('res:', res)
print('org:', test_sample['original_response'])
print('gt:', test_sample['response'])

p: Yes it is. It is also the color of Emeralds, and sometimes the color of camouflage.
k: It is evoked by light which has a dominant wavelength of roughly 495570 nm.
res: It is evoked by light which has a dominant wavelength of about 495570 nm.
org: I have read it has a dominant wavelength of roughly 495570 nm.
gt: The dominant wavelength of green is around 495570 nm.


In [None]:
## Get Results on entire_test set

In [9]:
model.from_pretrained("t5_finetuned_faithdial_qa_ep15_seqlen256")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [11]:
MAX_INPUT_LENGTH, DEVICE

(256, 'cuda:3')

In [17]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("./t5_finetuned_faithdial_qa_ep15_seqlen256")
model.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [18]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [19]:
from tqdm import tqdm

In [None]:
# edit-response

In [20]:
all_results = []
for test_sample in tqdm(dataset['test'], total=len(dataset['test'])):
    input_ = {'knowledge': test_sample['knowledge'],
            'prompt': test_sample['history'][-1]}
    
    resp = infer(model, max_input_length=MAX_INPUT_LENGTH, max_output_length=100, device=DEVICE, **input_)
    all_results.append([input_['knowledge'], input_['prompt'], resp,
                         test_sample['original_response'], test_sample['response']])

# clean_all-results
all_results = [[cell if cell is not None else '' for cell in row] for row in all_results]
data_dump = "\n".join(['|'.join(res) for res in all_results])

with open("T5_edit_FaithDial_khorr.txt", 'w') as f:
    f.write(data_dump)

100%|██████████| 3539/3539 [07:17<00:00,  8.10it/s]


TypeError: sequence item 3: expected str instance, NoneType found

In [21]:
all_results = [[cell if cell is not None else '' for cell in row] for row in all_results]

In [22]:
data_dump = "\n".join(['|'.join(res) for res in all_results])

with open("T5_gen_FaithDial.txt", 'w') as f:
    f.write(data_dump)