<a href="https://colab.research.google.com/github/S4HRKBOY/HaystackQAPipelineExample/blob/main/QA_Pipeline_Haystack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the OS release files so the output records which Linux distribution this runtime uses.
!cat /etc/*release

In [None]:
%%bash

# Install the `farm-haystack` release from PyPI (not the git main branch),
# together with the extras this notebook relies on.
pip install --upgrade pip
# Quote the requirements so the shell cannot glob-expand the [extras] brackets.
pip install 'farm-haystack[colab,ocr,preprocessing,file-conversion,pdf,elasticsearch,inference,faiss]'
pip install 'farm-haystack[beir]'

# pygraphviz builds against the Graphviz development headers.
# -y is required: this cell has no interactive stdin, so apt's confirmation
# prompt would abort the installation.
apt-get install -y libgraphviz-dev
pip install pygraphviz

pip install datasets
pip install wikipedia

In [None]:
import logging

# Root logger: DEBUG and above, rendered as "LEVEL - logger name -  message".
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)

# The "haystack" logger hierarchy only emits INFO and above.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
%%bash

# Elasticsearch 7.9.2: download the Linux tarball without progress output,
# unpack it into the working directory, then hand the extracted tree to the
# `daemon` user and group (the launch cell starts the server as that user).
wget --quiet https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz
tar --extract --gzip --file=elasticsearch-7.9.2-linux-x86_64.tar.gz
chown --recursive daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch the unpacked Elasticsearch as the `daemon` user. `--bg` runs this
# cell's script in the background so later cells can execute while the server
# keeps running.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
import http.client
import logging
import os
import time
import urllib.request


def wait_for_elasticsearch(url, timeout=120.0, poll_interval=2.0):
  """Poll ``url`` until it answers an HTTP request or ``timeout`` seconds pass.

  Args:
    url: HTTP endpoint of the Elasticsearch server.
    timeout: Maximum number of seconds to keep polling.
    poll_interval: Seconds between attempts (also the per-request timeout).

  Returns:
    bool: True once a request succeeds, False if the deadline expires first.
  """
  deadline = time.monotonic() + timeout
  while True:
    try:
      with urllib.request.urlopen(url, timeout=poll_interval):
        return True
    except (OSError, http.client.HTTPException):
      # Connection refused/reset, timeout or an error status (URLError and
      # HTTPError are OSError subclasses): treat the server as still starting.
      pass
    if time.monotonic() >= deadline:
      return False
    time.sleep(poll_interval)


# A fixed sleep is either too short on a slow runtime or wasted time on a fast
# one, so wait for the HTTP port instead. On timeout only warn: the notebook
# proceeds exactly as it did after the former unconditional sleep.
es_url = "http://{}:9200".format(os.environ.get("ELASTICSEARCH_HOST", "localhost"))
if not wait_for_elasticsearch(es_url):
  logging.getLogger(__name__).warning(
      "Elasticsearch did not respond at %s within the wait period", es_url)

In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Elasticsearch host: taken from ELASTICSEARCH_HOST when set, otherwise the
# server started locally earlier in this notebook.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Pass the resolved host (it was previously computed and then ignored in favour
# of a hard-coded "localhost").
# NOTE(review): embedding_dim=768 presumably matches the retriever's embedding
# model; re-check it if that model is changed.
document_store = ElasticsearchDocumentStore(host=host,
                                            port=9200,
                                            embedding_dim=768)

In [None]:
from haystack.nodes import EmbeddingRetriever

# How many documents the retriever returns per query.
top_k_retriever = 2

# Embedding-based retriever over `document_store`, using a
# sentence-transformers checkpoint to embed text.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers",
    top_k=top_k_retriever,
)

In [None]:
## EVALUATION
# NOTE: the evaluation setup below is disabled by wrapping it in a string
# literal. If enabled it would download the `nq_dev_subset_v2` archive into
# data/eval and load it into the document and label indices of
# `document_store`, splitting documents into 300-word chunks.
# Because the string is the last expression in this cell, the notebook echoes
# it as the cell's output.
'''
from haystack.nodes import PreProcessor
from haystack.utils import fetch_archive_from_http

doc_dir = "data/eval"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

label_preprocessor = PreProcessor(
    clean_empty_lines=False,
    clean_whitespace=False,
    split_by="word",
    split_length=300,
    split_overlap=0,
    split_respect_sentence_boundary=False,
)

document_store.add_eval_data(
    filename="data/eval/nq_dev_subset_v2.json",
    doc_index=document_store.index,
    label_index=document_store.label_index,
    preprocessor=label_preprocessor,
)
'''

In [None]:
'''
document_store.update_embeddings(retriever=retriever)
retriever_eval_results = retriever.eval(top_k=5, label_index=document_store.label_index, doc_index=document_store.index)
# Retriever Recall is the proportion of questions for which the correct document containing the answer is
# among the retrieved documents
print("Retriever Recall:", retriever_eval_results["recall"])
# Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
print("Retriever Mean Avg Precision:", retriever_eval_results["map"])
'''
# END OF EVALUATION

In [None]:
import wikipedia

wikipedia_entries = 15  # NOTE(review): not referenced by any visible cell; the searches below request 10 results.

def process_wikipedia_query(question):
  """Return the text content of the Wikipedia articles found for ``question``.

  Searches Wikipedia requesting 10 article titles and loads each page. Pages
  that cannot be loaded are skipped, so the result may contain fewer entries
  than the search returned.

  Args:
    question: Free-text query passed to ``wikipedia.search``.

  Returns:
    list: The ``content`` of every page that loaded, in search-result order.
  """
  article_names = wikipedia.search(question, results=10)
  wikipedia_documents = []
  for article_name in article_names:
    try:
      wikipedia_documents.append(wikipedia.page(article_name).content)
    except Exception as error:
      # Still best effort, but `Exception` (unlike a bare `except:`) lets
      # KeyboardInterrupt/SystemExit through, and the skip is now visible.
      logging.getLogger(__name__).warning(
          "Skipping Wikipedia article %r: %s", article_name, error)
  return wikipedia_documents

def retrive_article_titles(question):
  """Search Wikipedia for ``question`` (requesting 10 results) and return the article titles."""
  matching_titles = wikipedia.search(question, results=10)
  return matching_titles

In [None]:
# Folder that receives one .txt file per downloaded Wikipedia article.
doc_dir = "data/wikipedia"

# exist_ok=True replaces the separate existence check and keeps the cell safe to re-run.
os.makedirs(doc_dir, exist_ok=True)

In [None]:
import os
import glob

def clear_data_dir(directory=None):
  """Delete every ``*.txt`` file directly inside the article folder.

  Args:
    directory: Folder to clean. Defaults to the notebook-level ``doc_dir``.
  """
  if directory is None:
    # Resolved at call time, so the function follows `doc_dir` instead of
    # repeating its value as a second hard-coded path.
    directory = doc_dir
  for path in glob.glob(os.path.join(directory, '*.txt')):
    os.remove(path)

In [None]:
def create_files_for_retriever(articles, titles, output_dir=None):
  """Write each article to ``<output_dir>/<title>.txt`` as UTF-8 text.

  Args:
    articles: Article texts (``str``).
    titles: Titles paired with ``articles`` by position; ``/`` is removed from
      a title before it is used as a file name.
    output_dir: Target folder. Defaults to the notebook-level ``doc_dir``.

  Returns:
    list: The (already closed) file objects, one per written article.
  """
  if output_dir is None:
    output_dir = doc_dir
  files = []
  for article, title in zip(articles, titles):
    title = title.replace("/", "")
    # Write the text itself. The former `str(article.encode(...))` stored the
    # bytes repr ("b'...'" with escaped newlines and non-ASCII characters).
    # `with` also guarantees the file is closed if the write fails.
    with open(output_dir + "/" + title + ".txt", "w", encoding="utf-8", errors="replace") as f:
      f.write(article)
    files.append(f)
  return files

In [None]:
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser

'''
rag_prompt = PromptTemplate(
    prompt="""Synthesize a comprehensive answer from the following text for the given question.
                             Provide a clear and concise response that summarizes the key points and information presented in the text.
                             Your answer should be in your own words and be not longer than 50 words.
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

prompt_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=rag_prompt, output_variable="my_answer")
'''
'''
elaboration_prompt = PromptTemplate(
        prompt="""Elaborate on the answer to the following question given the related texts.
                                 Provide additional details to the answer in your own words.
                                 The final response should be between 100-200 words.
                                 \n\n Related text: {join(documents)} \n\n Question:
              {questions} \n\n Previous answer: {my_answer} \n\n New answer:""",
        output_parser=AnswerParser(),
    )
elaboration_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=elaboration_prompt)
'''

# Prompt template for the generator. The {join(documents)} and {query}
# placeholders are left for the prompt node to fill in
# (presumably with the retrieved documents and the user question).
rag_prompt = PromptTemplate(
    prompt="""Elaborate on the answer to the following question given the related texts.
                              \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:"""
)

# Generator node built on flan-t5-large with the template above as its default
# prompt; its result is stored under "my_answer".
rag_node = PromptNode(
    model_name_or_path="google/flan-t5-large",
    default_prompt_template=rag_prompt,
    output_variable="my_answer",
    max_length=75,
    use_gpu=True,
)

'''
elaboration_prompt = PromptTemplate(
    prompt="""Elaborate on the answer to the following question given the related texts.
                Provide additional details to the answer in your own words.
                The final response should be between 100-200 words.
                \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Previous answer: {my_answer} \n\n New answer: """
)
elaboration_node = PromptNode(model_name_or_path="vblagoje/bart_lfqa", default_prompt_template=elaboration_prompt)
'''

'''
  Models tried: vblagoje/bart_lfqa,
                pszemraj/t5-base-askscience-lfqa
                google/flan-t5-large
                google/flan-ul2
  Prompts tried:
    Synthesize a comprehensive answer from the following text for the given question.
                                Provide a clear and concise response that summarizes the key points and information presented in the text.
                                Your answer should be in your own words and be no longer than 50 words.
                                \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:
  PromptNode can be adjusted with the use of different models and different Prompts!
'''

In [None]:
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor

# Indexing pipeline: File -> TextConverter -> DocumentStore.
text_converter = TextConverter()
indexing_pipeline = Pipeline()

indexing_pipeline.add_node(name="TextConverter", component=text_converter, inputs=["File"])
indexing_pipeline.add_node(name="DocumentStore", component=document_store, inputs=["TextConverter"])

In [None]:
def get_files_to_index(directory=None):
  """Return the paths of the ``.txt`` files stored in the article folder.

  Args:
    directory: Folder to scan. Defaults to the notebook-level ``doc_dir``.

  Returns:
    list: ``"<directory>/<name>"`` for every regular ``.txt`` file, sorted by name.
  """
  if directory is None:
    directory = doc_dir
  files_to_index = []
  for entry in sorted(os.listdir(directory)):
    path = directory + "/" + entry
    # os.listdir yields plain names (str), so the former
    # isinstance(..., TextIOWrapper) test could never match -- and
    # TextIOWrapper was never imported. Select by extension instead; this also
    # leaves out folders such as .ipynb_checkpoints.
    if entry.endswith(".txt") and os.path.isfile(path):
      files_to_index.append(path)
  return files_to_index

In [None]:
# Cleaning and splitting settings for the downloaded articles: split by word
# into chunks of 100 with an overlap of 10, respecting sentence boundaries.
preprocessor = PreProcessor(
    split_by="word",
    split_length=100,
    split_overlap=10,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
)

def process_documents():
  """Replace the stored raw documents with preprocessed, embedded ones.

  Reads every document from ``document_store``, runs it through
  ``preprocessor``, attaches the embedding computed by ``retriever`` to each
  resulting document and writes the results back after removing the originals.
  Does nothing when the store is empty.
  """
  unprocessed_documents = document_store.get_all_documents()
  if not unprocessed_documents:
    # Nothing stored, so there is nothing to split or embed.
    return

  processed_documents = preprocessor.process(unprocessed_documents)
  embeddings = retriever.embed_documents(processed_documents)
  for doc, embedding in zip(processed_documents, embeddings):
    doc.embedding = embedding

  # Remove the raw documents only once their replacements are ready, so a
  # failure while preprocessing or embedding does not leave the store empty.
  document_store.delete_documents()
  document_store.write_documents(processed_documents)


In [None]:
# Query pipeline: Query -> retriever -> rag_node.
pipe = Pipeline()
pipe.add_node(name="retriever", component=retriever, inputs=["Query"])
pipe.add_node(name="rag_node", component=rag_node, inputs=["retriever"])

In [None]:
from pathlib import Path

# Pipeline.draw() saves to "pipeline.png" by default, which the query
# pipeline's draw() call would then overwrite; give this diagram its own file.
indexing_pipeline.draw(path=Path("indexing_pipeline.png"))

In [None]:
from pathlib import Path

# Pipeline.draw() saves to "pipeline.png" by default -- the same file the
# indexing pipeline's draw() call writes; give this diagram its own file.
pipe.draw(path=Path("query_pipeline.png"))

In [None]:
from haystack.utils import fetch_archive_from_http, convert_files_to_docs, clean_wiki_text

def _fetch_wikipedia_articles(question):
  """Search Wikipedia once and return ``(titles, contents)`` for the pages that loaded."""
  titles, contents = [], []
  for title in retrive_article_titles(question):
    try:
      content = wikipedia.page(title).content
    except Exception as error:
      # Best effort: skip pages that cannot be loaded. The title is skipped
      # together with its content so the two lists stay aligned.
      logging.getLogger(__name__).warning(
          "Skipping Wikipedia article %r: %s", title, error)
      continue
    titles.append(title)
    contents.append(content)
  return titles, contents


def process_question_and_create_answer(question):
  """Answer ``question`` from freshly downloaded Wikipedia articles.

  Clears the article folder and the document store, downloads the articles
  found for the question, writes them to ``doc_dir``, indexes and preprocesses
  them, then runs the query pipeline.

  Args:
    question: The question to search for and to answer.

  Returns:
    The output of ``pipe.run(query=question)``.

  Raises:
    RuntimeError: If no article file is available for indexing.
  """
  clear_data_dir()
  document_store.delete_documents()

  # One search, with titles kept only for pages that loaded. Previously titles
  # and contents came from two separate searches and went out of step as soon
  # as one page failed to load, so files were saved under the wrong title.
  wikipedia_titles, wikipedia_documents = _fetch_wikipedia_articles(question)
  create_files_for_retriever(wikipedia_documents, wikipedia_titles)

  # Index exactly the article files. This replaces `pop(0)`, which dropped an
  # arbitrary entry (os.listdir order is unspecified) and raised IndexError on
  # an empty folder, and it also excludes .ipynb_checkpoints.
  files_to_index = sorted(
      doc_dir + "/" + f
      for f in os.listdir(doc_dir)
      if f.endswith(".txt") and os.path.isfile(doc_dir + "/" + f)
  )
  if not files_to_index:
    raise RuntimeError("No Wikipedia articles could be retrieved for: " + repr(question))

  indexing_pipeline.run(file_paths=files_to_index)
  process_documents()
  return pipe.run(query=question)

In [None]:
# End-to-end run: download and index the Wikipedia articles found for the
# question, answer it with the query pipeline and print the raw pipeline output.
question = "Why is the sky blue?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "Who was elected president of the USA in 2020?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "2 times 2 plus 3 equals what number?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "What is binary search?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "How does binary search work?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "What is the answer to life?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "Who was Michael Jackson?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# Grammar fixed in the query ("stands for" -> "stand for"): the text is passed
# unchanged to the Wikipedia search and to the query pipeline.
question = "What does NaCl stand for?"
result = process_question_and_create_answer(question)
print(result)

In [None]:
# End-to-end run for this question; prints the raw pipeline output.
question = "What is Star Wars?"
result = process_question_and_create_answer(question)
print(result)