In [1]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm.auto import tqdm
from transformers import Adafactor
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import numpy as np

In [4]:
from tqdm import tqdm
from typing import Set, List
import re


class KB:
    """Base class for a knowledge base that lives at a file path.

    Creating an instance stores ``kb_path`` and immediately calls
    :meth:`load_kb`, keeping its three results as the ``entities``,
    ``relations`` and ``triplets`` attributes.
    """

    def __init__(self, kb_path: str):
        self.kb_path = kb_path
        loaded = self.load_kb()
        self.entities, self.relations, self.triplets = loaded

    def load_kb(self) -> (Set[str], Set[str], List[List[str]]):
        """Return ``(entities, relations, triplets)``.

        The base implementation always raises ``NotImplementedError``;
        subclasses provide the actual loading logic.
        """
        raise NotImplementedError


class MetaQAKB(KB):
    """Knowledge base read from a text file with one ``e1|relation|e2`` triplet per line.

    When ``add_reverse_rel`` is true, every triplet ``(e1, r, e2)`` is also
    stored as ``(e2, r + '_reverse', e1)`` and the reverse relation name is
    added to the relation set.
    """
    SPECIAL_CHAR = 'SPC'

    def __init__(self, kb_path: str, add_reverse_rel: bool = True):
        # Must be assigned before super().__init__(), which calls load_kb()
        # and therefore already reads this flag.
        self.add_reverse_rel = add_reverse_rel
        # self.regex = re.compile("[^a-zA-Z0-9\\s*.!?',_\\-]")
        super().__init__(kb_path)

    # def normalize_chars(self, strl: List[str]) -> List[str]:
    #     return [self.regex.sub(self.SPECIAL_CHAR, x) for x in strl]

    def load_kb(self) -> (Set, Set, List):
        """
        Loads the knowledge base from ``self.kb_path``.

        Blank lines (including the single empty string produced by an empty
        file) are skipped. Only the first three ``|``-separated fields of a
        line are used; a non-blank line with fewer than three fields still
        raises ``IndexError``.

        :return: set of entities, set of relations, list of triplets
        """
        entities = set()
        relations = set()
        triplets = []

        with open(self.kb_path, 'r') as f:
            raw_lines = f.read().strip().split('\n')

        # Skip blank lines so an empty file or a stray empty line does not
        # crash on triplet[2].
        lines = [line for line in raw_lines if line.strip()]

        for line in tqdm(lines):
            triplet = line.split('|')
            # e1, e2 = self.normalize_chars([triplet[0], triplet[2]])
            e1, r, e2 = triplet[0], triplet[1], triplet[2]

            triplets.append([e1, r, e2])
            if self.add_reverse_rel:
                rel = r + '_reverse'
                triplets.append([e2, rel, e1])
                relations.add(rel)

            entities.add(e1)
            entities.add(e2)
            relations.add(r)

        print(f"loaded {len(triplets)} triplets with {len(entities)} entities and {len(relations)} relations")
        return entities, relations, triplets

In [5]:
import os
import re

import pandas as pd
from typing import Dict
# from knowledge_handler.prolog import PrologDA
# from knowledge_handler.kb import MetaQAKB
import random

import argparse


class MetaQADataLoader:
    """Loads the MetaQA question/answer files for the 1-, 2- and 3-hop subsets.

    After construction, ``dataset`` maps ``'1hop'``, ``'2hop'`` and ``'3hop'``
    to a DataFrame with ``question`` and ``answer`` columns read from
    ``<base_path>/<hop>/qa_<split>.txt``.
    """

    def __init__(self, base_path, split='test'):
        self.base_path = base_path
        # Knowledge-base / Prolog wiring is currently disabled:
        # self.prolog_da = PrologDA()
        # kb_path = os.path.join(base_path, 'kb.txt')
        # self.kb = MetaQAKB(kb_path)
        # self.prolog_da.register_kb(self.kb)
        self.dataset = self.load_question_answers(base_path, split)

    def load_question_answers(self, base_path, split='test') -> Dict:
        """Read the QA file of every hop directory for the given split.

        Each non-blank line must be ``question<TAB>answer1|answer2|...`` and the
        question must contain its topic entity in square brackets; the first
        ``[`` and the first ``]`` are removed from the question text.

        For ``split == "train"`` each (question, answer) pair becomes its own
        row, so ``answer`` holds a single string. For any other split there is
        one row per question and ``answer`` holds the list of all answers.

        Blank lines (including an entirely empty file) are skipped and yield
        no rows.

        :param base_path: directory containing the ``1hop``/``2hop``/``3hop`` folders
        :param split: file suffix used in ``qa_<split>.txt``
        :return: dict mapping hop name to a DataFrame with ``question``/``answer`` columns
        """
        multi_hop_paths = ['1hop', '2hop', '3hop']
        dataset = {}

        for multi_hop_path in multi_hop_paths:
            questions_path = os.path.join(base_path, multi_hop_path, f'qa_{split}.txt')
            questions = []
            answers = []
            # Collected only for the disabled variant below that also stores
            # the topic entity with every example.
            question_concepts = []

            with open(questions_path, 'r') as f:
                lines = f.read().strip().split('\n')

            for line in lines:
                if not line.strip():
                    # An empty file splits into [''], which cannot be
                    # unpacked into (question, answer).
                    continue
                q, a = line.split('\t')
                question_concept = re.findall(r'\[(.+)\]', q)[0]
                # question_concept_cleaned = self.kb.regex.sub(self.kb.SPECIAL_CHAR, question_concept)
                # q = q.replace(question_concept, 'ENT')
                question_concepts.append(question_concept)
                q = q.replace("[", "", 1)
                q = q.replace("]", "", 1)

                candidate_answers = a.split('|')
                if split == "train":
                    # One training row per gold answer.
                    questions.extend([q] * len(candidate_answers))
                    answers.extend(candidate_answers)
                else:
                    questions.append(q)
                    answers.append(candidate_answers)

            df = pd.DataFrame(list(zip(questions, answers)), columns=['question', 'answer'])

            # dataset[multi_hop_path] = list(zip(questions, answers, question_concepts))
            dataset[multi_hop_path] = df

        return dataset

In [6]:
# Build one loader per split from the same MetaQA root directory; the loaders
# are created in the order test, train, dev.
test_data_loader, train_data_loader, dev_data_loader = (
    MetaQADataLoader("/workspace/tanu/BTP-2/exp/knowledge infusion/metaqa/", split_name)
    for split_name in ("test", "train", "dev")
)


In [7]:
hop = "2hop"
# Pick the frame for the chosen hop count out of each split's loader. These
# are the loader-owned DataFrames themselves, not copies.
test_data, train_data, dev_data = (
    loader.dataset[hop]
    for loader in (test_data_loader, train_data_loader, dev_data_loader)
)

In [8]:
test_data_loader.dataset[hop][:4]

Unnamed: 0,question,answer
0,which person directed the movies starred by Jo...,"[Nancy Meyers, Sam Mendes, George Clooney, Ken..."
1,who are movie co-directors of Delbert Mann,"[Franco Zeffirelli, Cary Fukunaga, Lewis Miles..."
2,what are the primary languages in the movies d...,[German]
3,the screenwriter Mimsy Farmer co-wrote movies ...,[Barbet Schroeder]


In [9]:
len(train_data)

739782

In [10]:
train_data_loader.dataset[hop][:10]

Unnamed: 0,question,answer
0,which person wrote the films directed by Yuriy...,Sergei Kozlov
1,which movies have the same director of Just Cause,The Mambo Kings
2,what genres do the movies written by Maureen M...,Drama
3,what were the release years of the movies acte...,1998
4,what were the release years of the movies acte...,1993
5,what are the movies that have the same screenw...,Gone with the Wind
6,what are the movies that have the same screenw...,Raffles
7,what are the movies that have the same screenw...,Elmer Gantry
8,what are the movies that have the same screenw...,Cass Timberlane
9,what are the movies that have the same screenw...,Arrowsmith


In [11]:
dev_data.head()

Unnamed: 0,question,answer
0,what are the languages spoken in the films dir...,[Greek]
1,the films acted by Sharon Tate were released i...,"[1967, 1968]"
2,when did the films written by Anthony Mann rel...,"[1949, 1947]"
3,what genres do the movies directed by Clark Gr...,"[Drama, Comedy]"
4,when did the movies written by Emir Kusturica ...,"[1998, 1995, 2007, 1988, 1981]"


In [12]:
train_data_loader.dataset[hop].iloc[4]["question"]

'what were the release years of the movies acted by Todd Field'

In [13]:
def prompt(row):
    """Build the model input text for one row: its question followed by the answer sentinel."""
    question = row["question"]
    return f"Question: {question} Answer: <extra_id_0>"
def get_label(row):
    """Build the training target for one row: the sentinel, the row's answer, then a full stop."""
    answer = row["answer"]
    return f"<extra_id_0> {answer}."
def get_eval_label(row):
    """Build one target string per accepted answer of the row (``row["answer"]`` is iterated)."""
    targets = []
    for candidate in row["answer"]:
        targets.append(f"<extra_id_0> {candidate}.")
    return targets

In [14]:
# Add the model-ready columns to the training frame in place: "input" is the
# prompt text and "label" the single-answer target string.
# NOTE: train_data is the same object as train_data_loader.dataset[hop], so the
# loader's frame gains these columns as well.
train_data["input"] = train_data.apply(prompt, axis = 1)
train_data["label"] = train_data.apply(get_label, axis = 1)

In [15]:
# Same in-place column setup for the dev and test frames. Here "label" is a
# list of target strings (one per accepted answer), built by get_eval_label.
dev_data["input"] = dev_data.apply(prompt, axis = 1)
dev_data["label"] = dev_data.apply(get_eval_label, axis = 1)
test_data["input"] = test_data.apply(prompt, axis = 1)
test_data["label"] = test_data.apply(get_eval_label, axis = 1)

In [16]:
test_data.iloc[2]

question    what are the primary languages in the movies d...
answer                                               [German]
input       Question: what are the primary languages in th...
label                                  [<extra_id_0> German.]
Name: 2, dtype: object

In [17]:
# def em_metric(preds, refs):
#     total = len(refs)  #TODO: should it be preds instead
#     correct = set(preds).intersection(set(refs))
#     return correct/total

# def em_metric(preds, refs):
#     total  =  0.0
#     for pred,ref in zip(preds,refs):
#         if pred in ref:
#             total += 1/len(ref)
#     return total/len(preds)

def em_metric(preds, refs):
    """Fraction of predictions that are contained in their paired reference.

    ``preds[i]`` counts as a hit when ``preds[i] in refs[i]``: list membership
    when the reference is a list of accepted answers, substring containment
    when it is a plain string. Pairs are formed with ``zip``, so predictions
    without a paired reference count as misses.

    :param preds: sequence of predicted answers
    :param refs: sequence of references, aligned with ``preds``
    :return: hits divided by ``len(preds)``; ``0.0`` when ``preds`` is empty
    """
    if len(preds) == 0:
        # Avoid ZeroDivisionError on an empty batch.
        return 0.0
    hits = sum(1 for pred, ref in zip(preds, refs) if pred in ref)
    return hits / len(preds)

In [18]:
train_data.iloc[0][["input", "label"]] 

input    Question: which person wrote the films directe...
label                          <extra_id_0> Sergei Kozlov.
Name: 0, dtype: object

In [19]:
# The tokenizer is the stock "t5-large" one, while the model weights are loaded
# from a local checkpoint directory.
# NOTE(review): presumably the checkpoint uses the same vocabulary as t5-large — confirm.
tokenizer = T5Tokenizer.from_pretrained("t5-large")
# model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/BTP-2/exp/knowledge infusion/trained models/KGinfusedLM/6") 
# model = T5ForConditionalGeneration.from_pretrained("t5-large") 
# workspace/tanu/aviation/trained models/KGinfusedLM_ankush_c4/20
model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/aviation/trained models/KGinfusedLM_ankush_c4/20") 

# model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/trained models/metaqa-finetuned-using-KGLM20/2hop/9") 
# # model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/trained models/metaqa-finetuned-using-KGLM20/1hop/1") 
# model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/trained models/metaqa-finetuned-using-KGLM20/1hop/9") 

# tanu/BTP-2/exp/knowledge infusion/trained models/KGinfusedLM/19
# model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/trained models/metaqa-finetuned/1hop/9") 

#replace with the trained model dir


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [20]:
def tokenize(text):
    """Encode ``text`` with the notebook-level ``tokenizer`` and return the whole encoding.

    Padding and truncation are enabled and PyTorch tensors are requested.
    """
    encoding_options = dict(return_tensors="pt", padding=True, truncation=True)
    return tokenizer(text, **encoding_options)
def tokenize_target(text):
    """Encode ``text`` with the notebook-level ``tokenizer`` and return only its ``input_ids``.

    Uses the same options as ``tokenize``: PyTorch tensors, padding and truncation.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    return encoded.input_ids

In [21]:
import os
# Set the CUDA device-ordering and device-visibility environment variables for
# this process.
# NOTE(review): presumably these must be set before the first CUDA call to take
# effect — confirm nothing touches the GPU earlier in the notebook.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})


In [22]:
# Adafactor with a fixed learning rate (relative_step is switched off).
# NOTE(review): scale_parameter is left at its default; the transformers docs
# recommend scale_parameter=False when a manual lr is supplied — confirm this
# is intended.
optimizer = Adafactor(model.parameters(), lr=1e-3, relative_step = False)
num_epochs = 20  #TODO : unspecified num epochs for finetuning
batch_size = 16
# Number of full batches over all epochs; in the visible cells this is only
# referenced by the commented-out lr scheduler.
num_training_steps = num_epochs * (train_data.shape[0] // batch_size )
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )
# Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)

In [23]:
# device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
# device

In [24]:
# Use CUDA when torch reports it as available, otherwise the CPU, and move the
# model there. The last line displays the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [25]:
# P = []
# R = []

In [26]:
def run_data(model, data, batch_size, optimizer, tokenizer, device, eval_mode):
    """Run one pass over ``data`` in batches, either training or evaluating.

    Training (``eval_mode`` false): ``data['label']`` holds one target string
    per row. Each batch is encoded, padding positions in the targets are set
    to -100, and one optimizer step is taken on the model loss.

    Evaluation (``eval_mode`` true): ``data['label']`` holds a list of accepted
    target strings per row. The model generates an output per input, spans of
    the form ``<...>`` or ``[...]`` are removed from both the outputs and the
    targets, and the batch is scored with ``em_metric``.

    Rows are taken by position, so ``data`` may carry any index. Texts are
    encoded with the ``tokenizer`` argument.

    :param model: sequence-to-sequence model providing ``generate`` and a ``.loss`` output
    :param data: DataFrame with ``input`` and ``label`` columns
    :param batch_size: number of rows per batch
    :param optimizer: optimizer stepped in training mode (unused in eval mode)
    :param tokenizer: tokenizer used for encoding and decoding
    :param device: device the encoded tensors are moved to
    :param eval_mode: True to evaluate with exact match, False to train
    :return: ``(model, optimizer, avg_loss)`` where ``avg_loss`` is the
        row-weighted mean of the per-batch training loss, or of the per-batch
        exact-match score in eval mode (0 when ``data`` is empty)
    """
    # Matches "<...>" and "[...]" spans; the pattern text is unchanged, it is
    # only written as a raw string and compiled once.
    special_span = re.compile(r"[\<\[].*?[\>\]]")

    num_rows = data.shape[0]
    iters = int(np.ceil(num_rows / batch_size))
    avg_loss = 0
    weighted_total = 0.0
    rows_seen = 0

    p_bar = tqdm(total=iters, position=0, leave=True, desc='Running through data')
    try:
        for row_idx in range(0, num_rows, batch_size):
            # Positional slice: independent of the frame's index labels.
            batch = data.iloc[row_idx:row_idx + batch_size]
            labels = batch['label'].tolist()
            inputs = batch['input'].tolist()

            tokenized_input = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
            input_ids = tokenized_input["input_ids"].to(device)
            attention_mask = tokenized_input["attention_mask"].to(device)

            if eval_mode:
                with torch.no_grad():
                    sequence_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask)
                pred = tokenizer.batch_decode(sequence_ids)
                labels = [[special_span.sub("", x).strip() for x in label] for label in labels]
                pred = [special_span.sub("", x).strip() for x in pred]
                loss_item = em_metric(pred, labels)
            else:
                target_ids = tokenizer(labels, return_tensors="pt", padding=True, truncation=True).input_ids
                target_ids = target_ids.to(device)
                # -100 marks padding positions in the targets passed to the model.
                target_ids[target_ids == tokenizer.pad_token_id] = -100
                loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids).loss
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss_item = loss.item()

            # Weight each batch by its row count so a short final batch does
            # not count as much as a full one.
            batch_rows = len(inputs)
            weighted_total += loss_item * batch_rows
            rows_seen += batch_rows
            avg_loss = weighted_total / rows_seen

            p_bar.set_postfix(avg_loss=avg_loss)
            p_bar.update(1)
    finally:
        # Close the bar even when a batch raises.
        p_bar.close()

    return model, optimizer, avg_loss

In [27]:
dev_data.iloc[0]["input"]

'Question: what are the languages spoken in the films directed by Joel Zwick Answer: <extra_id_0>'

In [28]:
torch.cuda.get_device_name()

'NVIDIA A100-SXM4-80GB'

In [29]:
hop

'2hop'

In [30]:
# Score the current model on the dev frame (no training step is taken in eval
# mode); only the average score returned by run_data is kept.
model.eval()
_, _, avg_eval_loss = run_data(
    model, dev_data, batch_size, optimizer, tokenizer, device, eval_mode=True
)


Running through data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 930/930 [03:32<00:00,  4.37it/s, avg_loss=0.0185]


In [31]:
# NOTE(review): `P` is not assigned by any active code in the visible cells (the
# `P = []` cell is commented out), so this raises NameError on a fresh kernel.
x = P[0][0]
x

NameError: name 'P' is not defined

In [None]:
# Remove "<...>" / "[...]" spans from x and trim whitespace. Raw string: same
# pattern text, without the invalid "\<" / "\>" string escapes.
re.sub(r"[\<\[].*?[\>\]]", "", x).strip()

In [None]:
# NOTE(review): neither `P` nor `R` is assigned by any active code in the
# visible cells (their initialisation cell is commented out).
P[0][:10], R[0][:10]

In [None]:
# device = torch.device("cpu")

In [None]:
train_losses = []
eval_losses = []

# Dev-set score of the loaded model before any training in this run. It is not
# appended to eval_losses.
model.eval()
_, _, avg_eval_loss = run_data(model, dev_data, batch_size,
                               optimizer, tokenizer, device, eval_mode=True)

# NOTE(review): epochs start at 10 — presumably resuming an earlier run; confirm.
for epoch in range(10, num_epochs):
    # Reshuffle every epoch. drop=True gives the shuffled frame a fresh 0..n-1
    # index for batching without adding a stray "index" column.
    shuffled_train_data = train_data.sample(frac=1).reset_index(drop=True)

    model.train()
    optimizer.zero_grad()
    model, optimizer, avg_train_loss = run_data(model, shuffled_train_data, batch_size,
                                                optimizer, tokenizer, device, eval_mode=False)

    model.eval()
    _, _, avg_eval_loss = run_data(model, dev_data, batch_size,
                                   optimizer, tokenizer, device, eval_mode=True)

    train_losses.append(avg_train_loss)
    # In eval mode run_data returns the average em_metric score, so the value
    # printed as "Eval loss" is that score rather than a loss.
    eval_losses.append(avg_eval_loss)
    print(f'Epoch {epoch}:\tTrain loss: {avg_train_loss}\t\t Eval loss: {avg_eval_loss}')
    # One checkpoint directory per epoch, relative to the working directory.
    model.save_pretrained(f"trained models/metaqa-finetuned-using-C4_20/{hop}/{epoch}")



Running through data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 930/930 [03:35<00:00,  4.31it/s, avg_loss=0.0185]
Running through data:   2%|██▊                                                                                                                               | 1012/46237 [07:51<5:37:38,  2.23it/s, avg_loss=2.69]

In [33]:
eval_losses, train_losses

([0.11122311827956989], [1.552494299562091])

In [None]:
eval_losses, train_losses

In [None]:
# NOTE(review): `mt`, `pred` and `res` are not defined at notebook level in any
# visible cell; this looks like leftover scratch and fails on a fresh kernel.
mt [0], pred, res

In [None]:
exm = "Question: the films acted by Shaun White were in which genres? Answer: <extra_id_0> "

In [None]:
exm = "Question: what movies did Temuera Morrison act in Answer: <extra_id_0>"


In [None]:
# Encode the single example prompt and move it to the device the model lives
# on; the model was moved to `device` earlier, so CPU input ids would fail.
input_ids = tokenizer(exm, return_tensors="pt").input_ids.to(model.device)
sequence_ids = model.generate(input_ids)
# Decoded text still contains the special tokens (e.g. the sentinel).
sequences = tokenizer.batch_decode(sequence_ids)
sequences

In [None]:
from evaluate import load
exact_match_metric = load("exact_match")


In [None]:
predictions = ["abc"]
references = ["abc"]
results = exact_match_metric.compute(predictions=predictions, references=references)
results