In [None]:
# Imports for the DPR pipeline
from haystack.nodes import TransformersReader, DensePassageRetriever, PreProcessor, PDFToTextConverter
from haystack.document_stores import FAISSDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
import shutil

# Library for path handling
import pathlib as pl

# Library for data handling
import pandas as pd

# Year of the sustainability report to process; it is interpolated into the
# report PDF path and the DPR model directory in later cells
year = 2020

# Target directory that the document store's SQLite file is moved into
# at the end of the notebook
ds_path = "../app/"

In [None]:
# Build the PDF-to-text converter, configured to remove numeric tables
converter = PDFToTextConverter(remove_numeric_tables=True)

# Location of the report for the configured year
report_pdf = pl.Path(f"../data/raw/sustainability-report-{year}.pdf")

# Extract the text and keep the first element of the converter's result.
# `meta` is an optional metadata dict, so "no metadata" is passed as None
# rather than the boolean False.
extracted = converter.convert(file_path=report_pdf, meta=None, encoding="UTF-8")[0]

In [None]:
# Clean the extracted text and cut it into chunks of four sentences each,
# with no overlap between consecutive chunks
preprocessor_options = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "sentence",
    "split_length": 4,
    "split_respect_sentence_boundary": False,
    "split_overlap": 0,
}
preprocessor = PreProcessor(**preprocessor_options)

# The preprocessor takes a list of documents, so the single extracted
# document is wrapped in a list
cleaned = preprocessor.process([extracted])

In [None]:
# Create a FAISS document store with a flat index and dot-product similarity.
# No SQL location is given, so the store's default is used; the last cell
# expects the resulting file at ./faiss_document_store.db.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    similarity="dot_product",
)

# Write the preprocessed passages into the store
document_store.write_documents(cleaned)

In [None]:
# Load the dense passage retriever saved for the configured year and
# attach it to the document store
dpr_model_dir = f"../models/DPR/{year}"
retriever = DensePassageRetriever.load(
    load_dir=dpr_model_dir,
    document_store=document_store,
    use_gpu=True,
)

# Recompute the embeddings of the stored documents with this retriever,
# then save the store under the given file name in the working directory
document_store.update_embeddings(retriever)
document_store.save("document_store.faiss")

In [None]:
# Move the document store's SQLite file from the working directory into the
# target directory so the cell can be re-run safely.
db_source = pl.Path("./faiss_document_store.db")
db_target_dir = pl.Path(ds_path)
db_target = db_target_dir / db_source.name

# Check the source first so that a stale target is never deleted when there
# is nothing to replace it with
if not db_source.is_file():
    raise FileNotFoundError(f"Document store database not found: {db_source}")

# Without an existing directory, the move would not treat the target as a folder
db_target_dir.mkdir(parents=True, exist_ok=True)

# shutil.move refuses to move into a directory that already holds a file of
# the same name, so drop the copy left by a previous run before moving
if db_target.exists():
    db_target.unlink()

shutil.move(str(db_source), str(db_target))