In [None]:
import nltk

# Sentence-tokenizer data used by nltk.sent_tokenize further down.
# Older NLTK releases load the pickled "punkt" models, newer ones look for the
# "punkt_tab" resource instead, so fetch both to keep Restart & Run All working
# regardless of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
import os

import torch
from haystack.document_stores import FAISSDocumentStore, InMemoryDocumentStore, OpenSearchDocumentStore
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader
from haystack.pipelines import DocumentSearchPipeline
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from torch.nn.functional import cosine_similarity

# In-memory store; embedding_dim has to match the vector size produced by the
# retriever's embedding model configured below.
document_store = InMemoryDocumentStore(embedding_dim=384)
doc_dir = "data/dataset"

# os.listdir also returns sub-directories (e.g. ".ipynb_checkpoints" created by
# Jupyter) and yields entries in arbitrary order. Keep regular files only and
# sort them so the index is built from the same inputs, in the same order, on
# every run.
files_to_index = sorted(
    path
    for path in (os.path.join(doc_dir, name) for name in os.listdir(doc_dir))
    if os.path.isfile(path)
)

# Convert / preprocess the text files and write them into the document store.
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

# Dense retriever; it is also used to compute the embeddings of the stored documents.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L12-v2",
    use_gpu=True,
)
document_store.update_embeddings(retriever)

# Retrieval pipeline, extended with an extractive reader fed by the retriever's output.
docsearchpipeline = DocumentSearchPipeline(retriever=retriever)

reader = FARMReader(model_name_or_path="deepset/roberta-large-squad2", use_gpu=True)
docsearchpipeline.add_node(component=reader, name="FARMReader", inputs=["Retriever"])

In [None]:
def extract_similar_answer_sentences(
    query_content: str,
    doc_content: str,
    retriever: EmbeddingRetriever,
    threshold: float = 0.6,
) -> str:
    """Return the sentences of ``doc_content`` that best match ``query_content``.

    Both texts are split into sentences with NLTK and embedded through
    ``retriever.embed_queries``. Every document sentence whose cosine
    similarity to at least one query sentence is strictly greater than
    ``threshold`` is selected. If no pair exceeds the threshold, the single
    most similar document sentence of each query sentence is selected instead.

    Args:
        query_content: Text of the query (may contain several sentences).
        doc_content: Text of the document to pick sentences from.
        retriever: Retriever used to embed the sentences of both texts.
        threshold: Cosine-similarity value a pair has to exceed to be selected.

    Returns:
        The selected document sentences, de-duplicated, ordered by their
        position in the document and joined with newlines. An empty string is
        returned when either text contains no sentences.
    """
    query_lines = nltk.sent_tokenize(query_content)
    answer_lines = nltk.sent_tokenize(doc_content)

    # Nothing to compare: embedding an empty batch or reducing over an empty
    # axis below may raise, so bail out early.
    if not query_lines or not answer_lines:
        return ""

    answer_embed = torch.tensor(retriever.embed_queries(answer_lines))
    query_embed = torch.tensor(retriever.embed_queries(query_lines))

    # Broadcast to compare every query sentence with every document sentence;
    # rows index query sentences, columns index document sentences
    # (assumes embed_queries returns one vector per input sentence).
    similarity_matrix = cosine_similarity(query_embed.unsqueeze(1), answer_embed.unsqueeze(0), dim=2)

    _, answer_indices = torch.nonzero(similarity_matrix > threshold, as_tuple=True)

    if answer_indices.numel() == 0:
        # No pair is similar enough: fall back to the best match per query sentence.
        _, answer_indices = torch.max(similarity_matrix, dim=1)

    # torch.unique returns sorted values, which keeps the document order;
    # tolist() yields plain ints for indexing the Python list.
    selected = torch.unique(answer_indices).tolist()
    return "\n".join(answer_lines[i] for i in selected)

In [None]:
def _source_document_for(answer, documents, position):
    """Return the retrieved document ``answer`` most likely came from, or None."""
    by_id = {doc.id: doc for doc in documents}

    # NOTE(review): depending on the installed Haystack 1.x release the answer
    # exposes its source as `document_ids` (list) or `document_id` (single id);
    # both are probed here - confirm against the pinned version.
    candidate_ids = list(getattr(answer, "document_ids", None) or [])
    legacy_id = getattr(answer, "document_id", None)
    if legacy_id is not None:
        candidate_ids.append(legacy_id)

    for doc_id in candidate_ids:
        if doc_id in by_id:
            return by_id[doc_id]

    # No usable id: pair by position as before, else use the first retrieved document.
    if position < len(documents):
        return documents[position]
    return documents[0] if documents else None


def semantic_search_and_question_answer(
    query: str,
    min_reader_score: float = 0.25,
    retriever_top_k: int = 1,
    reader_top_k: int = 1,
) -> dict:
    """Answer ``query`` with the retriever + reader pipeline.

    The query is run through ``docsearchpipeline``. For every answer whose
    score is at least ``min_reader_score`` the reader's answer text is used
    as is. For lower-scoring answers the document the answer came from is
    looked up and the sentences most similar to the query are returned
    instead (see ``extract_similar_answer_sentences``).

    Args:
        query: Question or free text to search for.
        min_reader_score: Minimum reader score for accepting the reader's answer.
        retriever_top_k: Number of documents requested from the retriever.
        reader_top_k: Number of answers requested from the reader.

    Returns:
        ``{"suggested_answer": [{"answer": <text>}, ...]}`` with one entry per
        answer returned by the pipeline.
    """
    results = docsearchpipeline.run(
        query=query,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "FARMReader": {"top_k": reader_top_k},
        },
        debug=False,
    )

    documents = results.get("documents") or []

    suggestions = []
    for position, result in enumerate(results["answers"]):
        answer_text = result.answer
        if result.score < min_reader_score:
            # Low-confidence answer: prefer whole sentences from its source
            # document. Answers and documents are separate lists, so the
            # source is resolved by id rather than assumed to share an index.
            source_doc = _source_document_for(result, documents, position)
            if source_doc is not None:
                answer_text = extract_similar_answer_sentences(query, source_doc.content, retriever)
        suggestions.append({"answer": answer_text})

    return {
        "suggested_answer": suggestions,
    }

In [None]:
# Demo input: a multi-sentence plot summary used as the search query.
# The text starts and ends with a newline, one sentence per line.
query = (
    "\n"
    "Arya runs away from King's Landing with Yoren and his group.\n"
    "She becomes friends with Lommy, Gendry, and Hot Pie.\n"
    "They are attacked by Amory Lorch, but Arya and the others escape through a tunnel.\n"
    "Arya frees three prisoners, including Jaqen H'ghar.\n"
)
# Alternative single-question input:
# query = " Why is Daenerys Targaryen determined to reclaim the Iron Throne? "

# Run retrieval + reading and display the suggested answers as the cell output.
answer = semantic_search_and_question_answer(query)
answer["suggested_answer"]