In [13]:
import pickle
import logging
import numpy as np
import random
from haystack.nodes import FARMReader
from haystack.document_stores.memory import InMemoryDocumentStore

In [22]:
# Configs

## KG Config : To create them you need to run KG notebook in notebook folder
KG_Graph_File = '../models/graph.pkl'
KG_Embeddings_File = '../models/embeddings.pkl'
#LM_model_path = "deepset/roberta-base-squad2"
LM_model_path = 'DKud7/finetuned-roberta-squad2'
reader = FARMReader(LM_model_path, use_gpu = True)

  return self.fget.__get__(instance, owner)()



In [23]:
def get_similar_nodes(node, topn):
    similar_nodes = embeddings.wv.most_similar(node, topn = topn)
    return similar_nodes

def get_content_from_embeddings(node, embeddings):
    similar_nodes = embeddings.wv.most_similar(node, topn = 10)
    content = str(node)
    for n in similar_nodes:
        content = content + ' ' + str(n[0])
    return content

def get_graph_and_embeddings(KG_Graph_File, KG_Embeddings_File):
    with open(KG_Graph_File, 'rb') as f:
        graph = pickle.load(f)
    with open(KG_Embeddings_File, 'rb') as f:
        embeddings = pickle.load(f)
    return graph, embeddings

def get_query_vector(query):
    nodes_to_consider = []
    for node in graph.nodes():
        if node in query.lower():
            nodes_to_consider.append(node)
    total_nodes = len(graph.nodes())
    query_vector = embeddings.wv[random.randint(0, total_nodes - 1)]
    if len(nodes_to_consider):
        query_vector = query_vector * 0
        for node in nodes_to_consider:
            query_vector = query_vector + embeddings.wv[node]
        query_vector = query_vector / len(nodes_to_consider)
    return query_vector

def retrieve_docs_from_query_vector(query_vector, custom_documents, topk):
    custom_documents.sort(key = lambda d: np.square(d['embedding'] - query_vector).mean())
    return custom_documents[:topk]

def get_answers_from_question(query, topk):
    query_vector = get_query_vector(query)
    returned_docs = retrieve_docs_from_query_vector(query_vector, custom_documents, topk)
    document_store = InMemoryDocumentStore()
    document_store.write_documents(returned_docs)
    result = reader.predict(query = query, documents = document_store, top_k = topk)
    return postprocess_answers(result, topk)

def postprocess_answers(answers, top_k):
    final_ans = []
    graph, _ = get_graph_and_embeddings(KG_Graph_File, KG_Embeddings_File)
    for ans in answers['answers']:
        ans_seq = ans.answer
        for node in graph.nodes():
            if node in ans_seq and node not in final_ans:
                final_ans.append(node)
    return final_ans[:top_k]

In [24]:
# Step 1: Creating custom docs from graph nodes and embeddings
graph, embeddings = get_graph_and_embeddings(KG_Graph_File, KG_Embeddings_File)
custom_documents = []
for node in graph.nodes():
    doc = {}
    doc['node'] = node
    doc['content'] = get_content_from_embeddings(node, embeddings)
    doc['embedding'] = embeddings.wv[node]
    custom_documents.append(doc)

In [25]:
query = "pk released in which year?"
topk = 10

In [26]:
answers = get_answers_from_question(query, topk)
print(answers)

Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.71s/ Batches]

['sanjay dutt', 'rajkumar hirani', '3 idiots', 'her', 'forgetting sarah marshall', 'gran torino', 'ti', 'bee vang', 'ahney her', 'cynthia nixon']





### Testing

In [27]:
## Read 20 Questions from test file
test_file = '../data/test.txt'
answers_file = '../data/answers.txt'
questions = []
true_answers = []
topk_predicted_answers = []

In [28]:
with open(test_file, 'r') as file:
    lines = file.readlines()
    for i, line in enumerate(lines):
        if i % 2 == 0 and i < 39:
            questions.append(line.strip())
        elif i > 41 and i < 62:
            true_answers.append(line.strip()[8:-1].lower())
        else:
            continue

# Begin inference
topk = 9

# Write answers in file
g = open(answers_file, 'w+')
N = len(questions)
acc = 0
for i in range(N):
    g.write('QUESTION: ' + questions[i] + '\n')
    g.write('TRUE ANSWER: ' + true_answers[i] + '\n')
    preds = get_answers_from_question(questions[i], topk)
    g.write('PREDICTIONS: ' + ','.join(preds) + '\n\n')
    if true_answers[i].lower() in preds:
        acc = acc + 1

print('ACCURACY: ' + str(acc * 100/N) + '%\n')
g.close()

Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.50s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.53s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.82s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.62s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.60s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.62s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.62s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.83s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.69s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.71s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.79s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.67s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.70s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00

ACCURACY: 35.0%




