### Extract target answer type

In [1]:
import torch
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import pandas as pd

device = 'cuda' if torch.cuda.is_available() else 'cpu'

import warnings
warnings.filterwarnings("ignore")

In [2]:
# examples extracted from train
class baseline_LLM:
    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          device_map="auto",
                                                          torch_dtype=torch.bfloat16)
        self.template ="""
You are tasked to extract the answer entity type corresponding to a question.

Here are some examples:
who wrote films that share actors with the film [Anastasia] => writer
which person directed the films acted by the actors in [Jawbreaker] => director
what languages are the films that share directors with [The Age of Innocence] in => language
who starred in the films whose directors also directed [The Decline of the American Empire] => actor
when did the movies release whose actors also appear in the movie [Little Big Man] => year
which person wrote the movies directed by the director of [Incognito] => writer
who are the directors of the movies written by the writer of [The Green Mile] => director
what types are the films directed by the director of [The Conspirator] => genre
what genres are the movies written by [The Beast] writers => genre
who acted in the films directed by the director of [Terms of Endearment] => actor
the films that share actors with the film [Dil Chahta Hai] were released in which years => year
the movies written by the screenwriter of [The Science of Sleep] starred who => actor
who directed the movies written by the writer of [A Sunday in the Country] => director
the films written by the screenwriter of [Dracula 2000] were directed by who => director
who is listed as director of the movies starred by [Our Modern Maidens] actors => director
when did the movies written by [Europa] writers release => year
what types are the movies written by the writer of [The Green Hornet] => genre

No explanation required. Output one of: 'actor', 'director', 'genre', 'language', 'writer', 'year'.

{question} =>
"""
        self.prompt_template = PromptTemplate.from_template(self.template)

    def predict(self, question):
        formatted_prompt = self.prompt_template.format(question = question)

        chat = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": formatted_prompt},
        ]

        tokenized_chat = self.tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(device)

        outputs = self.model.generate(tokenized_chat, max_new_tokens=100, pad_token_id=self.tokenizer.eos_token_id).cpu()
        tokenized_chat = tokenized_chat.cpu() ###
        del tokenized_chat ###

        return self.tokenizer.decode(outputs[0]).split("<|end_header_id|>")[-1].strip().split("<|eot_id|>")[0]

In [3]:
# load model
llm = baseline_LLM("/scratch/users/nus/e1329380/models/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# testing
qn = "the films that share actors with the film [Dil Chahta Hai] were released in which years"
llm.predict(qn)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


'year'

In [5]:
# iterate through train (first 5000)
df = pd.read_csv('../Datasets/MetaQA_dataset/vanilla 3-hop/qa_train.txt', sep='\t', header=None, names=['question', 'answer'])
with open('../Datasets/MetaQA_dataset/processed/train_ans_type.txt', 'w') as f:
    for idx in tqdm(range(5000)):
        f.write(llm.predict(df.question.iloc[idx]))
        f.write("\n")

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [06:17<00:00, 13.25it/s]


In [6]:
# check against groundtruth
gt_types = []
with open('../Datasets/MetaQA_dataset/vanilla 3-hop/qa_train_qtype.txt') as f:
    for line in f:
        gt_types.append(line.strip().split("_")[-1])
gt_types = gt_types[:5000]

pred_types = []
with open('../Datasets/MetaQA_dataset/processed/train_ans_type.txt') as f:
    for line in f:
        if line:
            pred_types.append(line.strip())

In [13]:
# sanity check
sum([1 for (i, j) in zip(gt_types, pred_types) if i!=j])

0

In [9]:
# iterate through train (first 1000)
df = pd.read_csv('../Datasets/MetaQA_dataset/vanilla 3-hop/qa_dev.txt', sep='\t', header=None, names=['question', 'answer'])
with open('../Datasets/MetaQA_dataset/processed/dev_ans_type.txt', 'w') as f:
    for idx in tqdm(range(1000)):
        f.write(llm.predict(df.question.iloc[idx]))
        f.write("\n")

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:15<00:00, 13.26it/s]


In [14]:
# check against groundtruth
gt_types = []
with open('../Datasets/MetaQA_dataset/vanilla 3-hop/qa_dev_qtype.txt') as f:
    for line in f:
        gt_types.append(line.strip().split("_")[-1])

pred_types = []
with open('../Datasets/MetaQA_dataset/processed/dev_ans_type.txt') as f:
    for line in f:
        if line:
            pred_types.append(line.strip())

In [15]:
# sanity check
sum([1 for (i, j) in zip(gt_types, pred_types) if i!=j])

0

In [10]:
# iterate through train (first 1000)
df = pd.read_csv('../Datasets/MetaQA_dataset/vanilla 3-hop/qa_test.txt', sep='\t', header=None, names=['question', 'answer'])
with open('../Datasets/MetaQA_dataset/processed/test_ans_type.txt', 'w') as f:
    for idx in tqdm(range(1000)):
        f.write(llm.predict(df.question.iloc[idx]))
        f.write("\n")

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:15<00:00, 13.25it/s]


In [16]:
# check against groundtruth
gt_types = []
with open('../Datasets/MetaQA_dataset/vanilla 3-hop/qa_test_qtype.txt') as f:
    for line in f:
        gt_types.append(line.strip().split("_")[-1])

pred_types = []
with open('../Datasets/MetaQA_dataset/processed/test_ans_type.txt') as f:
    for line in f:
        if line:
            pred_types.append(line.strip())

In [17]:
# sanity check
sum([1 for (i, j) in zip(gt_types, pred_types) if i!=j])

0

### Extract node entity type

In [42]:
from functions_modified import *
from torch.utils.data import DataLoader

path_to_node_embed = '../Datasets/MetaQA_dataset/processed/node2vec _embeddings/ud_node2vec_embeddings.txt'
path_to_idxes = '../Datasets/MetaQA_dataset/processed/idxes.json'
path_to_qa = '../Datasets/MetaQA_dataset/vanilla 3-hop/qa_train.txt'
path_to_ans_types = '../Datasets/MetaQA_dataset/processed/train_ans_type.txt'
data = KGQADataset(path_to_node_embed, path_to_idxes, path_to_qa, path_to_ans_types, train = True)

dataloader = DataLoader(data, batch_size=16, collate_fn=collate_fn, shuffle=True)

for batched_subgraphs, question_embeddings, stacked_labels, node_maps, labels, answer_types in dataloader:
    break

In [74]:
i = 7
sum(np.array(labels[i])[np.where(np.array(batched_subgraphs[i].node_types) == answer_types[i])[0]])

9

In [75]:
sum(labels[i])

tensor(9)

In [76]:
# total nodes to calculate loss on or evaluate metric on
len(np.array(labels[i])[np.where(np.array(batched_subgraphs[i].node_types) == answer_types[i])[0]])

21

In [77]:
answer_types[i]

'genre'

In [84]:
type(batched_subgraphs.node_types)

list