In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import importlib
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Run configuration -------------------------------------------------------
import torch

# Hugging Face checkpoint that gets fine-tuned.
PRETRAINED_MODEL_NAME = "t5-small"

# Alternative checkpoint worth trying:
# PRETRAINED_MODEL_NAME = "MaRiOrOsSi/t5-base-finetuned-question-answering"

BATCH_SIZE = 64

# The original run pinned GPU index 2. Fall back to the default CUDA device, or
# to the CPU, when that GPU does not exist so the notebook still runs on other
# machines instead of failing at `model.to(DEVICE)`.
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    DEVICE = 'cuda:2'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

NUM_TRAIN_EPOCHS = 15
MAX_INPUT_LENGTH = 256

# Derived from the settings above so the directory name cannot drift from them
# (evaluates to "t5_finetuned_wow_qa_ep15_seqlen256" with the current values).
CKPT_SAVE_PATH = f"t5_finetuned_wow_qa_ep{NUM_TRAIN_EPOCHS}_seqlen{MAX_INPUT_LENGTH}"

### Data Loader

In [3]:
from datasets import load_dataset
# from torch.utils.data import Dataset, DataLoader

In [4]:
dataset = load_dataset("McGill-NLP/FaithDial")
dataset['train']

No config specified, defaulting to: faith_dial/plain_text
Found cached dataset faith_dial (/home/csgrad/jayashok/.cache/huggingface/datasets/McGill-NLP___faith_dial/plain_text/1.0.0/70568c8ab3bbc83b603bce58fa593ab27e7f0d0cde51034e1c2073ff3e14189a)
100%|██████████| 7/7 [00:00<00:00, 480.16it/s]


Dataset({
    features: ['dialog_idx', 'response', 'original_response', 'history', 'knowledge', 'BEGIN', 'VRM'],
    num_rows: 18357
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

#### Using a custom data pipeline for the custom training loop

In [6]:
from torch.utils.data import DataLoader

In [7]:
import CustomDataset
from CustomDataset import Dataset

In [8]:
train_set = Dataset(dataset['train'], tokenizer, CustomDataset.DatasetMap.wow)

100%|██████████| 18357/18357 [00:00<00:00, 20236.74it/s]


In [9]:
validation_set = Dataset(dataset['validation'], tokenizer, CustomDataset.DatasetMap.wow)

100%|██████████| 3417/3417 [00:00<00:00, 20224.91it/s]


In [10]:
import random
print(random.choice(train_set))
print(random.choice(validation_set))

("Frank B. Adams (December 19, 1847 – after January 1923), commonly known as Yank Adams, was a professional carom billiards player who specialized in finger billiards, in which a player directly manipulates the balls with his or her hands, instead of using an implement such as a cue stick, often by twisting the ball between one's thumb and middle finger.", "There are different sizes of pool tables. I enjoy playing on the 7-foot, or what I like to call the ''bar table'' because most bars have the smaller tables to fit.", 'I think Yank Adams is my favorite billiard champ of all time.')
('Calvin Cordozar Broadus, Jr. (born October 20, 1971), known professionally as Snoop Dogg, is an American rapper, singer, songwriter, record producer, television personality, and actor.', 'Snoop dog is my idol, he is my favorite artist in the world.', 'You are right, he is multitalented, he is not only a rapper, he is also an actor and a record producer.')


In [11]:
# Training batches are reshuffled every epoch so consecutive rows of the dataset
# do not always land in the same minibatch; validation order is left fixed.
# The datasets' own `pack_minibatch` is passed directly as the collate function
# (the lambda wrappers added nothing and cannot be pickled by spawn-based workers).
my_trainset_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True,
                                    num_workers=16, collate_fn=train_set.pack_minibatch)
my_validation_dataloader = DataLoader(validation_set, batch_size=BATCH_SIZE, shuffle=False,
                                      num_workers=16, collate_fn=validation_set.pack_minibatch)

### Model Initialization

In [12]:
# Initialize the T5 model
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_MODEL_NAME)

# Resize the model's embeddings to accommodate the new tokens (No New Tokens used yet)
# model.resize_token_embeddings(len(tokenizer))


### Training

In [13]:
import torch
from tqdm import tqdm

In [14]:
model.train()
model.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [15]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [16]:
def process_data(tokenizer, questions, contexts, answers, max_input_length, device,
                 max_target_length=None):
    """Tokenize a minibatch of (question, context, answer) strings for seq2seq training.

    Args:
        tokenizer: Callable tokenizer; it is invoked with `padding`, `max_length`,
            `truncation` and `return_tensors="pt"` and must expose `pad_token_id`.
        questions: Sequence of question strings.
        contexts: Sequence of context strings, paired element-wise with `questions`.
        answers: Sequence of target strings.
        max_input_length: Truncation length for the encoded prompts.
        device: Device the returned tensors are moved to.
        max_target_length: Truncation length for the encoded answers. Defaults to
            `max_input_length`, which is what the function always used before this
            parameter existed.

    Returns:
        Tuple `(input_ids, attention_mask, labels)` on `device`, where every padding
        position in `labels` has been replaced by -100.
    """
    if max_target_length is None:
        max_target_length = max_input_length

    # NOTE(review): this template has two spaces before "context:" whereas `infer`
    # uses one — confirm the tokenizer normalizes whitespace so both encode alike.
    prompts = [
        f"question: {question}  context: {context}"
        for question, context in zip(questions, contexts)
    ]

    encoded_inputs = tokenizer(
        prompts,
        padding="longest",
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    )
    encoded_targets = tokenizer(
        answers,
        padding="longest",
        max_length=max_target_length,
        truncation=True,
        return_tensors="pt",
    )

    # Replace padding ids in the labels by -100 so the loss skips those positions.
    labels = encoded_targets.input_ids
    labels[labels == tokenizer.pad_token_id] = -100

    return (
        encoded_inputs.input_ids.to(device),
        encoded_inputs.attention_mask.to(device),
        labels.to(device),
    )

In [18]:
# Fine-tune for NUM_TRAIN_EPOCHS epochs; each epoch is one full pass over the
# training loader followed by one full pass over the validation loader.
for epoch in range(NUM_TRAIN_EPOCHS):
    ### Training loop
    epoch_train_loss = 0.0
    model.train()
    # The loader yields (contexts, questions, answers); note that process_data
    # takes them in the order (questions, contexts, answers).
    for contexts,questions,answers in tqdm(my_trainset_dataloader):
        # Debug output, disabled:
        # print("*"*20)
        # print('Answers:')
        # print(answers)

        encoded_inputs, attention_mask, encoded_targets = process_data(tokenizer, questions, contexts, answers, max_input_length=MAX_INPUT_LENGTH, device=DEVICE)
        optimizer.zero_grad()
        # Passing `labels` makes the model return the training loss on `outputs.loss`.
        outputs = model(input_ids=encoded_inputs, attention_mask=attention_mask, labels=encoded_targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that dividing by
        # len(train_set) below gives a per-example average even when the last
        # batch is smaller. (Presumably `loss` is a per-batch mean — TODO confirm.)
        epoch_train_loss += loss.item() * encoded_inputs.shape[0]

    ## Validation loop
    model.eval()
    epoch_val_loss = 0.0
    # No gradients are needed for validation.
    with torch.no_grad():
        # Leftovers from collecting predictions, disabled:
        # model_predictions_encoded = []
        # target_encoded = []
        for contexts, questions, answers in tqdm(my_validation_dataloader):
            encoded_inputs, attention_mask, encoded_targets = process_data(tokenizer, questions, contexts, answers, max_input_length=MAX_INPUT_LENGTH, device=DEVICE)
            outputs = model(input_ids=encoded_inputs, attention_mask=attention_mask, labels=encoded_targets)
            loss = outputs.loss
            # Same batch-size weighting as in the training pass.
            epoch_val_loss += loss.item() * encoded_inputs.shape[0]

    # Per-epoch report: accumulated losses divided by the dataset sizes.
    print("*"*20)
    print(f"epoch={epoch + 1}/{NUM_TRAIN_EPOCHS}")
    print(f"\t Train loss = {epoch_train_loss/len(train_set):.4f}")
    print(f"\t Val loss = {epoch_val_loss/len(validation_set):.4f}")

100%|██████████| 287/287 [01:01<00:00,  4.66it/s]
100%|██████████| 54/54 [00:04<00:00, 11.22it/s]


********************
epoch=1/15
	 Train loss = 2.8588
	 Val loss = 2.6314


100%|██████████| 287/287 [01:02<00:00,  4.56it/s]
100%|██████████| 54/54 [00:04<00:00, 11.26it/s]


********************
epoch=2/15
	 Train loss = 2.7478
	 Val loss = 2.5987


100%|██████████| 287/287 [01:03<00:00,  4.55it/s]
100%|██████████| 54/54 [00:04<00:00, 11.01it/s]


********************
epoch=3/15
	 Train loss = 2.6947
	 Val loss = 2.5792


100%|██████████| 287/287 [01:02<00:00,  4.56it/s]
100%|██████████| 54/54 [00:04<00:00, 11.11it/s]


********************
epoch=4/15
	 Train loss = 2.6513
	 Val loss = 2.5634


100%|██████████| 287/287 [01:03<00:00,  4.53it/s]
100%|██████████| 54/54 [00:04<00:00, 10.87it/s]


********************
epoch=5/15
	 Train loss = 2.6139
	 Val loss = 2.5561


100%|██████████| 287/287 [01:03<00:00,  4.55it/s]
100%|██████████| 54/54 [00:05<00:00, 10.43it/s]


********************
epoch=6/15
	 Train loss = 2.5801
	 Val loss = 2.5490


100%|██████████| 287/287 [01:05<00:00,  4.41it/s]
100%|██████████| 54/54 [00:05<00:00, 10.71it/s]


********************
epoch=7/15
	 Train loss = 2.5515
	 Val loss = 2.5459


100%|██████████| 287/287 [01:04<00:00,  4.45it/s]
100%|██████████| 54/54 [00:05<00:00,  9.99it/s]


********************
epoch=8/15
	 Train loss = 2.5231
	 Val loss = 2.5405


100%|██████████| 287/287 [01:03<00:00,  4.51it/s]
100%|██████████| 54/54 [00:05<00:00, 10.30it/s]


********************
epoch=9/15
	 Train loss = 2.4960
	 Val loss = 2.5407


100%|██████████| 287/287 [01:03<00:00,  4.54it/s]
100%|██████████| 54/54 [00:04<00:00, 11.14it/s]


********************
epoch=10/15
	 Train loss = 2.4714
	 Val loss = 2.5393


100%|██████████| 287/287 [01:03<00:00,  4.53it/s]
100%|██████████| 54/54 [00:04<00:00, 10.95it/s]


********************
epoch=11/15
	 Train loss = 2.4483
	 Val loss = 2.5417


100%|██████████| 287/287 [01:03<00:00,  4.53it/s]
100%|██████████| 54/54 [00:04<00:00, 11.27it/s]


********************
epoch=12/15
	 Train loss = 2.4255
	 Val loss = 2.5444


100%|██████████| 287/287 [01:03<00:00,  4.52it/s]
100%|██████████| 54/54 [00:04<00:00, 10.98it/s]


********************
epoch=13/15
	 Train loss = 2.4042
	 Val loss = 2.5438


100%|██████████| 287/287 [01:02<00:00,  4.56it/s]
100%|██████████| 54/54 [00:04<00:00, 11.23it/s]


********************
epoch=14/15
	 Train loss = 2.3826
	 Val loss = 2.5452


100%|██████████| 287/287 [01:03<00:00,  4.55it/s]
100%|██████████| 54/54 [00:05<00:00, 10.59it/s]

********************
epoch=15/15
	 Train loss = 2.3630
	 Val loss = 2.5472





In [20]:
model.save_pretrained(CKPT_SAVE_PATH)

### Inference

In [30]:
test_sample = dataset['test'][30]

In [31]:
test_sample

{'dialog_idx': 6,
 'response': 'Yes, of course, she is the founder and namesake of the Chanel brand!',
 'original_response': 'Yes of course she is the founder and namesake of the chanel brand!',
 'history': ['I like wearing cardigans, they make me feel fancy. Do you know anything cool about them?',
  "One thing I know is that there is a modern version that doesn't have any buttons.",
  "I didn't know there was a modern version and an old version, what was the old version like?",
  "I don't really know, but it's a knitted sweater with an open front.",
  'I wonder who came up with the idea of a cardigan.',
  'I know that Coco Chanel is credited with popularizing cardigans for women.',
  "Oh interesting, I'm not really familiar with Coco Chanel, can you tell me more about it?"],
 'knowledge': 'She was the founder and namesake of the Chanel brand.',
 'BEGIN': ['Entailment'],
 'VRM': ['Edification']}

In [32]:
def infer(model, prompt, knowledge, max_input_length, max_output_length, device, tokenizer=None):
    """Generate a response for a single prompt/knowledge pair.

    Args:
        model: Model exposing `generate`, `eval`, `train` and the `training` flag.
        prompt: Text placed after "question:" in the model input.
        knowledge: Text placed after "context:" in the model input.
        max_input_length: Truncation length for the encoded input.
        max_output_length: `max_length` passed to `model.generate`.
        device: Device the encoded input is moved to before generation.
        tokenizer: Tokenizer used for encoding and decoding. When omitted, the
            notebook-level `tokenizer` variable is used, as the function always
            did before this parameter existed.

    Returns:
        The decoded generation as a string, with special tokens stripped.

    Raises:
        KeyError: If `tokenizer` is omitted and no module-level `tokenizer` exists.
    """
    if tokenizer is None:
        # Backward-compatible fallback to the global the function used to rely on.
        tokenizer = globals()["tokenizer"]

    # Example: "question: What is 42? context: 42 is the answer to life, ..."
    model_input = f"question: {prompt} context: {knowledge}"
    encoded_input = tokenizer([model_input],
                              return_tensors='pt',
                              max_length=max_input_length,
                              truncation=True)
    input_ids = encoded_input.input_ids.to(device)
    attn_mask = encoded_input.attention_mask.to(device)

    # Generate in eval mode so train-only layers such as dropout are inactive,
    # then put the model back in whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        generated = model.generate(input_ids=input_ids,
                                   attention_mask=attn_mask,
                                   max_length=max_output_length)
    finally:
        model.train(was_training)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [33]:
res = infer(model, prompt=test_sample['history'][-1], knowledge=test_sample['knowledge'],
             max_input_length=MAX_INPUT_LENGTH, max_output_length=100, device=DEVICE)

In [29]:
# Hallucinated Response
# NOTE(review): this cell's execution count (29) precedes the cell that selects
# dataset['test'][30], and its recorded output shows a different sample — the
# output is stale and will not be reproduced by a top-to-bottom run.
print('p:', test_sample['history'][-1])
print('k:', test_sample['knowledge'])
print('res:', res)
print('org:', test_sample['original_response'])
print('gt:', test_sample['response'])

p: Yes! I enjoy making sports and strategy games. I love the way players caninteract on a sports game in a simulated environment.
k: Increasingly, elements and principles of game design are also applied to other interactions, particularly virtual ones (see gamification).
res: I love how games are used to interact with virtual ones.
org: Same and Increasingly, elements and principles of game design are also applied to other interactions
gt: Nice, did you know that elements of game design can also be applied to other interactions?


In [34]:
# Faithful Response
print('p:', test_sample['history'][-1])
print('k:', test_sample['knowledge'])
print('res:', res)
print('org:', test_sample['original_response'])
print('gt:', test_sample['response'])

p: Oh interesting, I'm not really familiar with Coco Chanel, can you tell me more about it?
k: She was the founder and namesake of the Chanel brand.
res: I know she was the founder and namesake of the Chanel brand.
org: Yes of course she is the founder and namesake of the chanel brand!
gt: Yes, of course, she is the founder and namesake of the Chanel brand!


In [25]:
# Knowledge copied as-is
# NOTE(review): this cell's execution count (25) precedes the cell that selects
# dataset['test'][30], and its recorded output shows a different sample — the
# output is stale and will not be reproduced by a top-to-bottom run.
print('p:', test_sample['history'][-1])
print('k:', test_sample['knowledge'])
print('res:', res)
print('org:', test_sample['original_response'])
print('gt:', test_sample['response'])

p: Yes it is. It is also the color of Emeralds, and sometimes the color of camouflage.
k: It is evoked by light which has a dominant wavelength of roughly 495570 nm.
res: It is evoked by light which has a dominant wavelength of roughly 495570 nm.
org: I have read it has a dominant wavelength of roughly 495570 nm.
gt: The dominant wavelength of green is around 495570 nm.


In [39]:
# Retrieve results for all samples

In [40]:
# Generate a response for every test example and keep
# [knowledge, prompt, response] triples.
all_results = []
# A dedicated loop variable is used so the notebook-level `test_sample`
# inspected by other cells is not overwritten.
for sample in tqdm(dataset['test'], total=len(dataset['test'])):
    input_ = {'knowledge': sample['knowledge'],
              'prompt': sample['history'][-1]}

    resp = infer(model, max_input_length=MAX_INPUT_LENGTH, max_output_length=100, device=DEVICE, **input_)
    all_results.append([input_['knowledge'], input_['prompt'], resp])

# One example per line, fields separated by '|'.
# NOTE(review): fields are not escaped, so a '|' or newline inside the text
# would make a line ambiguous to parse — confirm the consumer tolerates this.
data_dump = "\n".join('|'.join(row) for row in all_results)

# Explicit UTF-8: the text contains non-ASCII characters, and the platform
# default encoding is not guaranteed to handle them.
with open("T5_gen_WoW.txt", 'w', encoding='utf-8') as f:
    f.write(data_dump)


100%|██████████| 3539/3539 [07:40<00:00,  7.68it/s]


In [41]:
all_results

[["Dylan's Candy Bar is a chain of boutique candy shops and candy supplier currently located in New York City; East Hampton, New York; Los Angeles, Chicago and Miami Beach, as well as in wholesale venues around the globe.",
  "I love candy, what's a good brand?",
  "I love Dylan's Candy Bar. It's a chain of boutique candy shops and candy supplier currently located in New York City"],
 ["Dylan's Candy Bar is a chain of boutique candy shops and candy supplier currently located in New York City; East Hampton, New York; Los Angeles, Chicago and Miami Beach, as well as in wholesale venues around the globe.",
  'Oh, they do? What kind of candy do they sell?',
  "I'm not sure but Dylan's Candy Bar is a chain of boutique candy shops and candy supplier currently located in New York City"],
 ['It stocks 7,000 candies from around the world.',
  'Oh I see, what kind of candy do they offer?',
  'They stock 7,000 candies from around the world.'],
 ['It is owned by Dylan Lauren, daughter of fashion d

#### Evaluation

In [36]:
import importlib
import numpy as np

In [37]:
import sys
sys.path.append("./baseline/")

In [38]:
import task1_infer

ModuleNotFoundError: No module named 'spacy'

In [35]:
# Evaluation set will have the gold response
test_set = Dataset(dataset['test'], tokenizer, CustomDataset.DatasetMap.faithdial)

100%|██████████| 3539/3539 [00:00<00:00, 20038.64it/s]


In [None]:
# Iterate over the held-out test split built in the previous cell (the original
# code wrapped `validation_set` here, leaving `test_set` unused).
test_dataloader = DataLoader(test_set, batch_size=1,
                             num_workers=1, collate_fn=test_set.pack_minibatch)

In [None]:
# Score every generated response with the baseline hallucination predictor.
scores = []
for knowledge, history, gt_response in tqdm(test_dataloader):
    # Each loader item is a minibatch (sequences of strings, as consumed by
    # `process_data` during training), while `infer` formats one example at a
    # time — so walk the batch element by element instead of passing the whole
    # sequences into the prompt.
    for knowledge_text, history_text in zip(knowledge, history):
        pred_resp = infer(model, prompt=history_text, knowledge=knowledge_text,
                          max_input_length=MAX_INPUT_LENGTH, max_output_length=100, device=DEVICE)
        # NOTE(review): assumes predict_hallucination takes single strings in
        # (knowledge, history, response) order — confirm against baseline/task1_infer.
        score = task1_infer.predict_hallucination(task1_infer.model, knowledge_text, history_text, pred_resp)
        scores.append(score)

In [None]:
np.mean(scores)

In [2]:
from datasets import load_dataset

In [3]:
faithdial_dataset = load_dataset("McGill-NLP/FaithDial")

No config specified, defaulting to: faith_dial/plain_text
Found cached dataset faith_dial (/home/csgrad/jayashok/.cache/huggingface/datasets/McGill-NLP___faith_dial/plain_text/1.0.0/70568c8ab3bbc83b603bce58fa593ab27e7f0d0cde51034e1c2073ff3e14189a)
100%|██████████| 7/7 [00:00<00:00, 639.81it/s]


In [4]:
def critic_preprocess(dataset):
    """Flatten dialogue examples into per-response records for the critic.

    Each input item must provide "knowledge", "history" (a list of strings),
    "response", "original_response" and "BEGIN". For every item this emits:

    * one record for `original_response` when it is not None, labelled "yes"
      if "Hallucination" appears in `BEGIN` and "no" otherwise;
    * one record for `response`, always labelled "no".

    Returns:
        A list of dicts with the keys "knowledge", "response", "hallucination",
        "history" (turns joined by a space) and "all"
        (history, knowledge and response joined by " <eos> ").
    """
    def _make_record(example, response, hallucination):
        # Build one output row; key order is kept stable for the JSON dump.
        history_text = " ".join(example["history"])
        return {
            "knowledge": example["knowledge"],
            "response": response,
            "hallucination": hallucination,
            "history": history_text,
            "all": history_text + " <eos> " + example["knowledge"] + " <eos> " + response,
        }

    records = []
    for example in dataset:
        # Original response: only present for some examples.
        original = example["original_response"]
        if original is not None:
            label = "yes" if "Hallucination" in example["BEGIN"] else "no"
            records.append(_make_record(example, original, label))

        # The edited response is always labelled as not hallucinated.
        records.append(_make_record(example, example["response"], "no"))
    return records

In [5]:
import json

def dump_as_json(dataset, filename):
    """Write an iterable of dicts to `filename`, one JSON object per line.

    This is the line-delimited layout the notebook intends to feed to torchtext.
    The parent directory of `filename` is created when it does not exist, so the
    function does not depend on a separate `mkdir` step having been run.

    Args:
        dataset: Iterable of JSON-serializable dicts.
        filename: Destination path; an existing file is overwritten.
    """
    import os

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(record) + "\n" for record in dataset)


In [6]:
!mkdir "critic_data/"

In [7]:
# Export each FaithDial split, in the same order as before, to its own JSON-lines file.
for split_name in ("test", "train", "validation"):
    critic_records = critic_preprocess(faithdial_dataset[split_name])
    dump_as_json(critic_records, f"critic_data/faithdial_dataset_{split_name}.json")