In [7]:
# Set up logging
import logging

# force=True (Python 3.8+) removes any handlers that are already attached to
# the root logger before installing the new one. Without it basicConfig() does
# nothing when logging was configured earlier in the kernel session, and the
# format below is silently ignored (records then show up in the default
# "LEVEL:name:message" layout instead).
logging.basicConfig(
    format="%(levelname)s - %(name)s -  %(message)s",
    level=logging.WARNING,
    force=True,
)
# Root logger stays at WARNING; only the "haystack" logger is lowered to INFO.
logging.getLogger("haystack").setLevel(logging.INFO)


In [16]:
# Imports for the DPR pipeline
from haystack.nodes import TransformersReader, DensePassageRetriever, PreProcessor, PDFToTextConverter
from haystack.document_stores import FAISSDocumentStore
from haystack.pipelines import ExtractiveQAPipeline

# Library for path handling
import pathlib as pl

# Library for data handling
import pandas as pd

In [9]:
# Model for text extraction from pdf (numeric-table removal enabled,
# valid languages restricted to English)
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
# Report to analyse; the path is relative to the notebook's working directory
report_path = pl.Path("../data/raw/sustainability-report-2020.pdf")
# Extracting text from pdf: the result of convert() is indexed and only its
# first element is kept. `meta` takes an optional metadata dict, so "no
# metadata" is expressed as None rather than the boolean False.
extracted = converter.convert(file_path=report_path, meta=None, encoding="UTF-8")[0]

In [12]:
# Cleaning and splitting options for the extracted report text:
# passages are built from 4 sentences each, without overlap.
preprocessing_options = {
    # cleaning
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    # splitting
    "split_by": "sentence",
    "split_length": 4,
    "split_overlap": 0,
    "split_respect_sentence_boundary": False,
}
preprocessor = PreProcessor(**preprocessing_options)

# The preprocessor is given a one-element list holding the converted document
cleaned = preprocessor.process([extracted])

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

In [14]:
# Create the FAISS document store and write the preprocessed passages into it
# NOTE(review): no sql_url is given, so the store falls back to its default SQL
# backend - presumably an on-disk SQLite file that survives kernel restarts;
# confirm this does not clash with leftovers from a previous run.
document_store = FAISSDocumentStore(
    similarity="dot_product",
    faiss_index_factory_str="Flat",
)
document_store.write_documents(documents=cleaned)

Writing Documents:   0%|          | 0/256 [00:00<?, ?it/s]

In [15]:
# Dense passage retriever: separate DPR encoders for questions and passages,
# attached to the document store created above
retriever = DensePassageRetriever(
    document_store=document_store,
    # encoder checkpoints
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    # sequence length limits for passages and queries
    max_seq_len_passage=256,
    max_seq_len_query=64,
    # runtime / tokenization settings
    use_gpu=True,
    batch_size=16,
    use_fast_tokenizers=True,
    embed_title=True,
)

# Updating the embeddings in the document store using the retriever model
document_store.update_embeddings(retriever=retriever)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
  return self.fget.__get__(instance, owner)()
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.document_stores.faiss:Updating embeddings for 256 docs...


Updating Embedding:   0%|          | 0/256 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/256 [00:00<?, ? Docs/s]

In [17]:
# Defining the reader model (same as for the QA generation pipeline)
# use_gpu is passed as a boolean, matching how the retriever is configured
# (use_gpu=True), instead of the integer 1.
reader = TransformersReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [19]:
# Extractive QA pipeline for context extraction, built from the reader and
# the retriever defined above
pipe = ExtractiveQAPipeline(
    reader=reader,
    retriever=retriever,
)

In [21]:
# Test prediction only: a single question is run through the pipeline,
# with top_k of 10 for the "Retriever" node and 5 for the "Reader" node
prediction = pipe.run(
    query="When were the anticorruption policies and procedures revised?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

In [22]:
# Display the raw result of the test run: a dict holding the 'query' and the
# list of extracted 'answers' (each with score, context and offsets)
prediction

{'query': 'When were the anticorruption policies and procedures revised?',
 'answers': [<Answer {'answer': 'second\nhalf of 2020', 'type': 'extractive', 'score': 0.8555408716201782, 'context': 'o corruption\nThe anticorruption policy and\nprocedures were revised in second\nhalf of 2020. There were no confirmed\ncases of corruption in 2020.\nNon-discriminat', 'offsets_in_document': [{'start': 123, 'end': 142}], 'offsets_in_context': [{'start': 70, 'end': 89}], 'document_ids': ['baeb9fdd6531b33d53a369cd89476249'], 'meta': {'_split_id': 2, 'vector_id': '170'}}>,
  <Answer {'answer': 'second half of 2020', 'type': 'extractive', 'score': 0.795420229434967, 'context': 'embers: /\nThe anticorruption policy and procedures were\nrevised in in second half of 2020. Implementation\nin NLB d.d. ', 'offsets_in_document': [{'start': 178, 'end': 197}], 'offsets_in_context': [{'start': 70, 'end': 89}], 'document_ids': ['e0b7aedcd1b234e2d71e232868af0dc9'], 'meta': {'_split_id': 182, 'vector_id': '218'}}