In [2]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
import pandas as pd
import numpy as np
import torch

In [3]:
checkpoint_path = "output/example_train"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)
summarizer

<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x238d9769430>

In [6]:
from sentence_transformers import SentenceTransformer
sim_model = SentenceTransformer('hiiamsid/sentence_similarity_hindi')
sim_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [7]:
# input_query = ["what are the languages spoken in the movies whose directors also directed [Son of Dracula]",
#          "what are the languages spoken in the movies whose directors also directed [Son of Dracula]"]
# input_query_num = 50
input_query = pd.read_csv("./data/test_339/example_test.csv")["text"]
# input_query = input_query[:input_query_num].to_list()
input_query = input_query.to_list()
golden_path_target = pd.read_csv("./data/test_339/example_test.csv")["summary"]
golden_path_target = golden_path_target.to_list()
input_query

['What is the rating of Movie1?',
 'What is the IMDB rating of Movie1?',
 'The cast of Movie1.',
 'Running time of Movie1.',
 "When is Movie1's release date?",
 'The color of Movie1.',
 'Person1 specialty.',
 'Height of Person1.',
 "Person1's birthday.",
 "Person1's death date.",
 "What is Person1's nickname?",
 'The trivia of Person1.',
 'The trade mark of Person1.',
 'Who acted Movie1?',
 "Who is Movie1's director?",
 'Who wrote Movie1?',
 'The editor of Movie1.',
 'Who compose Movie1??',
 'The cinematographer of Movie1.',
 'The production-designer of Movie1.',
 'What movie are performed by Person1?',
 'what is the name of the movie directed by Person1?',
 'What movie did Person1 write?',
 "What movie's editor is Person1?",
 'A movie created by Person1.',
 "What movie's cinematographer is Person1?",
 'what is the name of the movie designed by Person1?',
 'what movies are producted by C1?',
 'The movie distributed by C1.',
 'what is the name of the company produce Movie1??',
 'Company

In [8]:
# obtain the embedding representation of the path targets
df = pd.read_csv("./data/test_339/example_train.csv")
df = df.drop_duplicates(subset=['summary'])
path_targets = df["summary"].to_list()
path_target_embedding = sim_model.encode(path_targets)
path_target_embedding.shape

(339, 768)

In [9]:
# obtain the representation of the output of the query
output_query = summarizer(input_query, max_length=23, min_length=8)
output_query_list = []
for i in range(len(output_query)):
    output_query_list.append(output_query[i]["summary_text"])
output_query_embedding = sim_model.encode(output_query_list)
output_query_embedding.shape

  next_indices = next_tokens // vocab_size


(339, 768)

In [10]:
def cos_sim(a: torch.Tensor, b: torch.Tensor):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))

In [11]:
# print(path_targets[0],output_query_list[0])
# print(path_target_embedding[0],output_query_embedding[0])
# cos_sim(path_target_embedding[0],output_query_embedding[0]).item()

In [12]:
# for i in range(5):
#     print(output_query_list[i], path_targets[i])

In [13]:
# # test
# transfer_output = []
# for i in range(len(output_query_list)):
#     target_path_index = 0
#     max_score = 0.0
#     for j in range(len(path_targets)):
#         score = cos_sim(output_query_embedding[i], path_target_embedding[j]).item()
#         if score > max_score:
#             max_score = score
#             target_path_index = j
#         else:
#             max_score = max_score
# #   print(f"{input_query[i]}; {output_query[i]['summary_text']}; G:{path_targets[target_path_index]}; A:{path_targets[i]}; Score:{max_score:.6f}")
#     transfer_output.append(path_targets[target_path_index])

In [20]:
import heapq
# test
hop_number = 10
transfer_output = []

for i in range(len(output_query_list)):
    target_path_index = 0
    score_list = []
    for j in range(len(path_targets)):
        score = cos_sim(output_query_embedding[i], path_target_embedding[j]).item()
        score_list.append(score)
    max_score_l = heapq.nlargest(hop_number,score_list)
    max_score_index = []
    for i in max_score_l:
        max_score_index.append(score_list.index(i))
    # print(f"{input_query[i]}; {output_query[i]['summary_text']}; G:{path_targets[target_path_index]}; A:{path_targets[i]}; Score:{max_score:.6f}")
    transfer_output.append([path_targets[max_score_index[i]] for i in range(len(max_score_index))])
    # print(transfer_output)

In [15]:
# for i in range(len(transfer_output)):
#     print(f"{transfer_output[i]} -> {golden_path_target[i]}")

In [16]:
df = pd.read_csv(checkpoint_path + "/339_example_pred.csv")
df["question"] = input_query
df["generated_chain"] = transfer_output
df["actual_chain"] = path_targets
df.to_csv("./data/test_339/example_test_pred.csv", index=False, sep=",")

FileNotFoundError: [Errno 2] No such file or directory: 'output/example_train/339_example_pred.csv'

In [21]:
# evaluate the t5 seq2seq
assert len(transfer_output) == len(golden_path_target)
acc = 0
# # Hop1
# for i in range(len(transfer_output)):
#     if transfer_output[i] == golden_path_target[i]:
#         acc += 1
# print(f"Accuracy: {acc/len(transfer_output)}")

# Hop_n
for i in range(len(transfer_output)):
    if golden_path_target[i] in transfer_output[i]:
        acc += 1
print(f"Accuracy: {acc/len(transfer_output)}")

Accuracy: 0.8466076696165191
