In [None]:
# Set up logging
import logging

# Root logger: WARNING threshold, "<level> - <logger name> -  <message>" layout
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# The "haystack" logger alone is raised to INFO
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)


In [None]:
# Imports for the DPR pipeline
from haystack.nodes import TransformersReader, DensePassageRetriever, PreProcessor, PDFToTextConverter
from haystack.document_stores import FAISSDocumentStore
from haystack.pipelines import ExtractiveQAPipeline

# Library for path handling
import pathlib as pl

# Library for data handling
import pandas as pd

In [None]:
# Source PDF, given relative to the notebook's working directory
report_path = pl.Path("../data/raw/sustainability-report-2020.pdf")
# Fail early with a clear message instead of an error raised inside the converter
if not report_path.is_file():
    raise FileNotFoundError(f"PDF report not found: {report_path.resolve()}")

# Converter for text extraction from PDF, configured with remove_numeric_tables=True
converter = PDFToTextConverter(remove_numeric_tables=True)
# Extract the text and keep the first element of the converter's result.
# `meta` takes an optional metadata dict, so "no metadata" is passed as None
# rather than the boolean False.
extracted = converter.convert(file_path=report_path, meta=None, encoding="UTF-8")[0]

In [None]:
# Preprocessing settings: the three cleaning switches are on, and the text is
# split by sentence with a split length of 4 and no overlap.
preprocessing_options = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "sentence",
    "split_length": 4,
    "split_respect_sentence_boundary": False,
    "split_overlap": 0,
}
preprocessor = PreProcessor(**preprocessing_options)

# The single extracted document is wrapped in a list before processing
documents_to_clean = [extracted]
cleaned = preprocessor.process(documents_to_clean)

In [None]:
# FAISS document store (local SQL database) with a "Flat" index factory string
# and dot-product similarity
document_store = FAISSDocumentStore(
    similarity="dot_product",
    faiss_index_factory_str="Flat",
)
# Write the preprocessed documents into the store
document_store.write_documents(cleaned)

In [None]:
# Dense passage retriever settings: separate query and passage encoder
# checkpoints, sequence-length limits for each, batch size and device/tokenizer flags
dpr_settings = {
    "query_embedding_model": "facebook/dpr-question_encoder-single-nq-base",
    "passage_embedding_model": "facebook/dpr-ctx_encoder-single-nq-base",
    "max_seq_len_query": 64,
    "max_seq_len_passage": 256,
    "batch_size": 16,
    "use_gpu": True,
    "embed_title": True,
    "use_fast_tokenizers": True,
}
retriever = DensePassageRetriever(document_store=document_store, **dpr_settings)

# Update the embeddings in the document store using the retriever
document_store.update_embeddings(retriever)

In [None]:
# Reader model (same as for the QA generation pipeline).
# NOTE(review): whether use_gpu=1 acts as a boolean switch or as a GPU ordinal
# depends on the installed Haystack version; confirm before changing it.
reader = TransformersReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=1,
)

In [None]:
# Extractive QA pipeline built from the reader and retriever defined above
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Test prediction only: a single query run through the pipeline
test_query = "When were the anticorruption policies and procedures revised?"
# Per-node parameters: top_k of 10 for the retriever, top_k of 5 for the reader
node_params = {
    "Retriever": {"top_k": 10},
    "Reader": {"top_k": 5},
}
prediction = pipe.run(query=test_query, params=node_params)

In [None]:
# Display the raw result of the test query as this cell's output
prediction