In [5]:
import pickle
import logging
import numpy as np
from haystack.nodes import TfidfRetriever
from haystack.document_stores.memory import InMemoryDocumentStore
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline

In [11]:
## KG config: paths to the pickled graph and embeddings files.
## To create them, run the KG notebook in the notebook folder.
KG_Graph_File = '../models/graph.pkl'
KG_Embeddings_File = '../models/embeddings.pkl'

## LM config: path to the fine-tuned language model.
## To fine-tune it, use the LM Finetuning notebook from the notebook folder.
Finetuned_LM_Path = '../models/fine_tuned_roberta_squad2'

## The models and files above are available in the GitHub release section.

In [7]:
# Load the graph and the embeddings from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a source you trust.
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

graph = _read_pickle(KG_Graph_File)
embeddings = _read_pickle(KG_Embeddings_File)

  "class": algorithms.Blowfish,



In [8]:
# Helper functions that use graph embeddings
def get_similar_nodes(node, topn):
    """Return what `embeddings.wv.most_similar` reports for `node`.

    Reads the module-level `embeddings` object rather than taking it as an
    argument.

    Args:
        node: Key to look up in `embeddings.wv`.
        topn: Forwarded to `most_similar` as the `topn` keyword.

    Returns:
        The value returned by `embeddings.wv.most_similar`, unchanged.
    """
    # `topn` must stay a keyword argument, exactly as it is passed today.
    return embeddings.wv.most_similar(node, topn=topn)

def get_content_from_embeddings(node, embeddings, topn = 10):
    """Build a document string from a node and its most similar nodes.

    Args:
        node: Graph node to look up in `embeddings.wv`; converted with `str`.
        embeddings: Object exposing `wv.most_similar(node, topn=...)`.
        topn: How many similar nodes to append. Defaults to 10, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        `str(node)` followed by the string form of the first element of each
        entry returned by `most_similar`, separated by single spaces.
    """
    similar_nodes = embeddings.wv.most_similar(node, topn = topn)
    # Join once instead of growing the string piece by piece in a loop.
    return ' '.join([str(node), *(str(entry[0]) for entry in similar_nodes)])

In [9]:
# Build the retriever that fetches documents for a query.

# Step 1: create one custom document per graph node; its content is the
# node text produced by get_content_from_embeddings
custom_documents = [
    {'content': get_content_from_embeddings(graph_node, embeddings)}
    for graph_node in graph.nodes()
]

# Step 2: create an InMemoryDocumentStore and write the custom documents to it
document_store = InMemoryDocumentStore()
document_store.write_documents(custom_documents)

# Step 3: create a TF-IDF retriever on top of the document store
retriever = TfidfRetriever(document_store=document_store)

In [100]:
# This is only to retrieve documents, we need LM for answer
# question = "pk released in which year?"
# retrieved_documents = retriever.retrieve(query=question, top_k=5)
# retrieved_documents

In [12]:
# Use the fine-tuned LM (loaded from Finetuned_LM_Path) instead of a pretrained one.
# GPU use is requested via use_gpu = True.
reader = FARMReader(model_name_or_path = Finetuned_LM_Path, use_gpu = True)

  return self.fget.__get__(instance, owner)()



In [13]:
# Merge the KG retriever and the LM reader into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [16]:
# Helper functions for question answering to postprocess and return top k
def get_answer_from_question(graph, pipe, query, top_k):
    """Run `query` through `pipe` and post-process the predictions.

    Args:
        graph: Graph handed on to `postprocess_answers`.
        pipe: Pipeline object exposing `run(query=..., params=...)`.
        query: Question text passed to the pipeline.
        top_k: Used as the `top_k` of both the "Retriever" and the "Reader"
            node, and passed on to `postprocess_answers`.

    Returns:
        Whatever `postprocess_answers` returns for the pipeline predictions.
    """
    run_params = {
        "Retriever": {"top_k": top_k},
        "Reader": {"top_k": top_k},
    }
    predictions = pipe.run(query=query, params=run_params)
    return postprocess_answers(graph, predictions, top_k)

def postprocess_answers(graph, answers, top_k):
    """Map predicted answer texts back to graph nodes.

    For each prediction in `answers['answers']`, in order, every graph node
    whose string form occurs inside the prediction's `.answer` text is
    collected once, in first-seen order.

    Args:
        graph: Object whose `nodes()` yields the (hashable) graph nodes.
        answers: Mapping with an 'answers' entry holding objects that have an
            `.answer` attribute.
        top_k: Maximum number of nodes to return; applied as a slice bound.

    Returns:
        A list with at most `top_k` distinct graph nodes.
    """
    # Nodes are compared via str(), as get_content_from_embeddings does, so a
    # non-string node no longer raises TypeError on the `in` test.
    # NOTE(review): matching is substring-based, so a short node name can match
    # inside a longer word of the answer; confirm whether whole-word matching
    # is intended.
    labelled_nodes = [(node, str(node)) for node in graph.nodes()]

    # Stop early only for a non-negative limit; the final slice keeps the
    # original result for every other value of top_k.
    stop_at = top_k if (top_k is not None and top_k >= 0) else None

    final_ans = []
    seen = set()
    for ans in answers['answers']:
        if stop_at is not None and len(final_ans) >= stop_at:
            break
        ans_seq = ans.answer
        if not ans_seq:
            # None or empty answer text: nothing to match against.
            continue
        for node, label in labelled_nodes:
            if node not in seen and label in ans_seq:
                seen.add(node)
                final_ans.append(node)
    return final_ans[:top_k]

In [17]:
# Ask one example question and print the graph nodes matched in the answers.
question = "Which film stars Leonardo DiCaprio and was released in 2015?"
top_k = 10
# The helper defined in this notebook is get_answer_from_question; calling
# get_answer raises NameError on a fresh kernel.
print(get_answer_from_question(graph, pipe, question, top_k))

Inferencing Samples: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.58s/ Batches]

['body of lies', 'martin scorsese', 'ti', 'faster', 'jack nicholson', 'antoine fuqua', 'blood diamond', 'legend', 'the revenant', 'alejandro gonzález iñárritu']



