In [71]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
import pandas as pd
import numpy as np
import torch

In [72]:
# Load the T5 weights and the matching tokenizer from "checkpoint/t5-base"
# (presumably a local checkpoint directory — confirm), then wrap both in a
# summarization pipeline used later to generate path strings.
model = T5ForConditionalGeneration.from_pretrained("checkpoint/t5-base")
tokenizer = AutoTokenizer.from_pretrained("checkpoint/t5-base")
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
summarizer

<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x1f43862f550>

In [73]:
# Example of a single query string (kept from an earlier hard-coded experiment):
#   "what are the languages spoken in the movies whose directors also directed [Son of Dracula]"
# Number of leading evaluation rows to use.
input_query_num = 50
# Parse the evaluation CSV once and take both columns from the same frame, so
# queries and gold paths are guaranteed to come from the same rows.
eval_df = pd.read_csv("./data/eval_csv.csv")
input_query = eval_df["text"][:input_query_num].to_list()
golden_path_target = eval_df["summary"][:input_query_num].to_list()

In [74]:
# obtain the last hidden states of the fine-tuned t5-base model
# Tokenize all queries as one padded batch of PyTorch tensors.
input_tokenzier = tokenizer(
    input_query,
    return_tensors="pt",
    padding=True,
    truncation=True
)
# Inference only: no_grad() stops autograd from recording the forward pass, so
# the resulting embeddings do not keep the whole encoder graph alive in memory.
with torch.no_grad():
    input_query_embedding = model.encoder(
        input_ids=input_tokenzier["input_ids"],
        attention_mask=input_tokenzier["attention_mask"],
        return_dict=True
    ).last_hidden_state
input_query_embedding.shape # (batch_size, seq_len, hidden_size)

In [75]:
# obtain the embedding representation of the path targets
# Candidate target paths are the distinct "summary" values of the training CSV;
# duplicates are dropped so each path is encoded (and later compared) once.
df = pd.read_csv("./data/train_csv.csv").drop_duplicates(subset=['summary'])
path_targets = df["summary"].to_list()
path_tokenzier = tokenizer(
    path_targets,
    return_tensors="pt",
    padding=True,
    truncation=True
)
# Inference only: no_grad() stops autograd from recording the forward pass, so
# the resulting embeddings do not keep the whole encoder graph alive in memory.
with torch.no_grad():
    path_target_embedding = model.encoder(
        input_ids=path_tokenzier["input_ids"],
        attention_mask=path_tokenzier["attention_mask"],
        return_dict=True
    ).last_hidden_state
path_target_embedding.shape

torch.Size([15, 18, 768])

In [76]:
# obtain the representation of the output of the query
# Generate one path string per query with the summarization pipeline.
output_query = summarizer(input_query, max_length=20, min_length=18)
# The pipeline returns one dict per input; keep only the generated text.
output_query_list = [generated["summary_text"] for generated in output_query]
output_query_tokenzier = tokenizer(
    output_query_list,
    return_tensors="pt",
    padding=True,
    truncation=True
)
# Re-encode the generated strings with the same encoder. Inference only:
# no_grad() stops autograd from recording the forward pass, so the embeddings
# do not keep the whole encoder graph alive in memory.
with torch.no_grad():
    output_query_embedding = model.encoder(
        input_ids=output_query_tokenzier["input_ids"],
        attention_mask=output_query_tokenzier["attention_mask"],
        return_dict=True
    ).last_hidden_state
output_query_embedding.shape

  next_indices = next_tokens // vocab_size


torch.Size([50, 20, 768])

In [77]:
# calculate the vector similarity
def vector_similarity(v1, v2, mask1=None, mask2=None):
    """Cosine similarity between the mean-pooled vectors of two sequences.

    Each input is reduced to a single sentence vector by averaging its rows
    (axis 0), and the cosine similarity of the two sentence vectors is
    returned.

    Args:
        v1: 2-D tensor of per-token vectors, one row per position.
        v2: 2-D tensor of per-token vectors, one row per position. Its row
            count may differ from ``v1``; the row width must match.
        mask1: Optional 1-D tensor with one entry per row of ``v1``
            (non-zero = keep, 0 = ignore, e.g. a tokenizer attention mask).
            When given, ignored rows such as padding do not contribute to
            the mean. ``None`` (default) averages every row.
        mask2: Same as ``mask1``, for ``v2``.

    Returns:
        A 0-dim tensor holding the cosine similarity.

    Raises:
        ValueError: If either input has no rows.
    """
    # define the sentence vector
    def sentence_vector(v, mask=None):
        if v.shape[0] == 0:
            raise ValueError("cannot pool an empty sequence into a sentence vector")
        if mask is None:
            # Plain mean over positions, done as one tensor reduction.
            return v.sum(dim=0) / v.shape[0]
        weights = mask.to(dtype=v.dtype, device=v.device).unsqueeze(-1)
        # clamp() avoids a 0/0 when a mask happens to select no position.
        return (v * weights).sum(dim=0) / weights.sum().clamp(min=1.0)

    pooled1 = sentence_vector(v1, mask1)
    pooled2 = sentence_vector(v2, mask2)
    return torch.cosine_similarity(pooled1, pooled2, dim=0, eps=1e-6)

In [78]:
# Sanity check: compare the first target path with the first generated output.
print(path_targets[0], output_query_list[0], sep="\n")
v1, v2 = path_target_embedding[0], output_query_embedding[0]
vector_similarity(v1,v2)

movie_to_actor_to_movie_to_year
movie_to_writer_to­movie_to _____director .


tensor(0.8872, grad_fn=<DivBackward0>)

In [79]:
# replace the output with the high similarity target path
# For every generated string, score it against each known target path and keep
# the best-scoring target as the final prediction.
transfer_output = []
# Scores are only compared and printed, never back-propagated.
with torch.no_grad():
    for i in range(len(output_query_list)):
        print(f"{output_query[i]['summary_text']}")
        target_path_index = 0
        # Cosine similarity can be negative, so start below every possible
        # score instead of at 0.0; otherwise a row with only non-positive
        # scores would silently report index 0 with a made-up confidence.
        max_score = float("-inf")
        for j in range(len(path_targets)):
            # .item() yields a plain float, detached from any autograd graph.
            score = vector_similarity(output_query_embedding[i], path_target_embedding[j]).item()
            print(f"{path_targets[j]} score={score:.6f}")
            # Strict '>' keeps the first target when several tie for the best score.
            if score > max_score:
                max_score = score
                target_path_index = j
        print(f"Results: {output_query[i]['summary_text']}-> {path_targets[target_path_index]} with confidence {max_score:.6f}")
        transfer_output.append(path_targets[target_path_index])

movie_to_writer_to­movie_to _____director .
movie_to_actor_to_movie_to_year score=0.887226
movie_to_writer_to_movie_to_director score=0.943432
movie_to_actor_to_movie_to_director score=0.914048
movie_to_director_to_movie_to_genre score=0.905170
movie_to_actor_to_movie_to_writer score=0.929436
movie_to_director_to_movie_to_language score=0.907837
movie_to_writer_to_movie_to_actor score=0.935946
movie_to_director_to_movie_to_actor score=0.912251
movie_to_actor_to_movie_to_language score=0.892608
movie_to_director_to_movie_to_year score=0.904358
movie_to_actor_to_movie_to_genre score=0.885195
movie_to_director_to_movie_to_writer score=0.935709
movie_to_writer_to_movie_to_genre score=0.930343
movie_to_writer_to_movie_to_year score=0.925986
movie_to_writer_to_movie_to_language score=0.928867
Results: movie_to_writer_to­movie_to _____director .-> movie_to_writer_to_movie_to_director with confidence 0.943432
movie_to_director_to-movie_to_______actor.
movie_to_actor_to_movie_to_year score=0.93

In [80]:
# Show the target path that each evaluated query was mapped to.
transfer_output

['movie_to_writer_to_movie_to_director',
 'movie_to_director_to_movie_to_actor',
 'movie_to_director_to_movie_to_genre',
 'movie_to_director_to_movie_to_genre',
 'movie_to_director_to_movie_to_genre',
 'movie_to_actor_to_movie_to_language',
 'movie_to_writer_to_movie_to_genre',
 'movie_to_writer_to_movie_to_director',
 'movie_to_director_to_movie_to_language',
 'movie_to_director_to_movie_to_genre',
 'movie_to_writer_to_movie_to_genre',
 'movie_to_actor_to_movie_to_year',
 'movie_to_writer_to_movie_to_actor',
 'movie_to_director_to_movie_to_genre',
 'movie_to_actor_to_movie_to_year',
 'movie_to_director_to_movie_to_actor',
 'movie_to_director_to_movie_to_actor',
 'movie_to_writer_to_movie_to_genre',
 'movie_to_director_to_movie_to_actor',
 'movie_to_actor_to_movie_to_genre',
 'movie_to_actor_to_movie_to_director',
 'movie_to_actor_to_movie_to_writer',
 'movie_to_director_to_movie_to_writer',
 'movie_to_director_to_movie_to_actor',
 'movie_to_director_to_movie_to_genre',
 'movie_to_dire

In [81]:
# Optional: persist the matched paths alongside the evaluation data.
# df = pd.read_csv("./data/eval_csv_pred.csv")
# df["predicted"] = transfer_output
# df.to_csv("./data/eval_csv_pred.csv", index=False, sep=",")

In [82]:
# evaluate the t5 seq2seq
# Exact-match accuracy: a prediction counts only if the matched target path is
# string-identical to the gold path at the same position.
assert len(transfer_output) == len(golden_path_target)
acc = sum(
    1
    for predicted, gold in zip(transfer_output, golden_path_target)
    if predicted == gold
)
print(f"Accuracy: {acc/len(transfer_output)}")

Accuracy: 0.98
