In [1]:
import pickle
import logging
import numpy as np
from haystack.nodes import TfidfRetriever
from haystack.document_stores.memory import InMemoryDocumentStore
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline

In [2]:
# Load the pickled graph and embeddings objects from the working directory.
# NOTE: pickle.load can execute arbitrary code -- only open trusted .pkl files.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

graph = _load_pickle('graph.pkl')
embeddings = _load_pickle('embeddings.pkl')

  "class": algorithms.Blowfish,


In [142]:
# Helper functions that use graph embeddings
def get_similar_nodes(node, topn):
    """Return the ``topn`` entries most similar to ``node``.

    The lookup is delegated to ``wv.most_similar`` on the module-level
    ``embeddings`` object, and its result is returned unchanged.
    """
    return embeddings.wv.most_similar(node, topn=topn)

def get_content_from_embeddings(node, embeddings, topn=10):
    """Build the text content of a retrieval document for ``node``.

    The content is ``str(node)`` followed by the first element of every
    entry returned by ``embeddings.wv.most_similar``, separated by single
    spaces.

    Args:
        node: Key to look up in the embeddings.
        embeddings: Object exposing ``wv.most_similar(node, topn=...)``
            (presumably a gensim-style model -- confirm against the pickle).
        topn: Number of similar entries to append. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        A single space-separated string.
    """
    neighbours = embeddings.wv.most_similar(node, topn=topn)
    # Join once instead of growing the string inside a loop.
    parts = [str(node)]
    parts.extend(str(entry[0]) for entry in neighbours)
    return ' '.join(parts)

In [143]:
# Design a retriever

# Step 1: one document per graph node; its text comes from
# get_content_from_embeddings (node name plus its most similar entries).
custom_documents = [
    {'content': get_content_from_embeddings(node, embeddings)}
    for node in graph.nodes()
]

# Step 2: index the custom documents in an in-memory document store.
document_store = InMemoryDocumentStore()
document_store.write_documents(custom_documents)

# Step 3: TF-IDF retriever backed by that store.
retriever = TfidfRetriever(document_store=document_store)

In [100]:
# Retrieval-only check: this returns candidate documents; a language-model reader is still needed to extract an answer.
# question = "pk released in which year?"
# retrieved_documents = retriever.retrieve(query=question, top_k=5)
# retrieved_documents

In [60]:
# Reader: load the fine-tuned "deepset/roberta-base-squad2" checkpoint
# rather than a plain pretrained language model; GPU use is requested.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [164]:
# Combine the knowledge-graph retriever and the reader into one
# extractive question-answering pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [165]:
# Helper functions for question answering to postprocess and return top k
def get_answer(graph, pipe, query, top_k):
    """Run ``query`` through ``pipe`` and return the post-processed answers.

    The same ``top_k`` is passed to the "Retriever" and "Reader" nodes, and
    again to ``postprocess_answers`` to cap the returned list.
    """
    run_params = {
        "Retriever": {"top_k": top_k},
        "Reader": {"top_k": top_k},
    }
    predictions = pipe.run(query=query, params=run_params)
    return postprocess_answers(graph, predictions, top_k)

def postprocess_answers(graph, answers, top_k):
    """Map extracted answer spans back to graph nodes.

    Each answer's text is scanned for every graph node; a node is kept when
    its string form occurs as a substring of the answer text. Nodes are
    returned in order of first match, without duplicates.

    Args:
        graph: Object whose ``nodes()`` yields the candidate nodes
            (nodes must be hashable).
        answers: Mapping with an ``'answers'`` entry holding objects that
            expose an ``answer`` attribute (the extracted text).
        top_k: Maximum number of nodes to return.

    Returns:
        List of at most ``top_k`` matching nodes.
    """
    # Materialise nodes and their string forms once; str() keeps matching
    # consistent with how document content is built and avoids a TypeError
    # for non-string nodes.
    labelled_nodes = [(node, str(node)) for node in graph.nodes()]
    final_ans = []
    seen = set()
    for ans in answers['answers']:
        # A missing answer is treated as empty text instead of crashing.
        ans_seq = ans.answer or ''
        for node, label in labelled_nodes:
            if node not in seen and label in ans_seq:
                seen.add(node)
                final_ans.append(node)
    return final_ans[:top_k]

In [160]:
# Example natural-language query. Note: the cell that runs the pipeline
# further down assigns `question` again, so this value is not the one queried there.
question = "Can you name a film from 2015 with an ensemble cast, including a well-known actor named Tom Hardy??"

In [168]:
# Ask one question and print up to `top_k` graph nodes found in the answers.
question = "Who is the actress in the captain america movie?"
top_k = 5
print(get_answer(graph=graph, pipe=pipe, query=question, top_k=top_k))


Inferencing Samples:   0%|                                                                                                                                                     | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.21 Batches/s][A

['chris evans', 'joe johnston', 'hayley atwell', 'anthony russo', 'wes ball']



