In [7]:
from haystack.nodes import PDFToTextConverter, PreProcessor
import pathlib as pl

from haystack.pipelines import QuestionAnswerGenerationPipeline
from haystack.nodes import QuestionGenerator
from haystack.nodes import TransformersReader

import pandas as pd


In [8]:
# --- PDF ingestion ----------------------------------------------------------
# PDF-to-text converter, configured to remove numeric tables and with English
# as the only listed valid language.
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["en"],
)

# Cleaning + splitting step: splits by sentence with a split length of 4 and
# no overlap between consecutive splits.
preprocessor = PreProcessor(
    split_by="sentence",
    split_length=4,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
)

# --- Question / answer generation --------------------------------------------
# Question generator with library defaults.
qg = QuestionGenerator()

# Extractive reader that answers the generated questions.
# NOTE(review): use_gpu=1 is kept verbatim; whether it is read as a device
# ordinal or as a truthy flag depends on the installed Haystack version — confirm.
reader = TransformersReader("deepset/roberta-base-squad2", use_gpu=1)

# Pipeline combining the question generator and the reader.
qag_pipeline = QuestionAnswerGenerationPipeline(qg, reader)

In [49]:
# Convert the sustainability report PDF to text and keep the first element of
# the converter's result.
# `meta` is an optional metadata mapping, so pass None ("no metadata") rather
# than the boolean False, which only worked because it is falsy.
extracted = converter.convert(
    file_path=pl.Path("../data/raw/sustainability-report-2020.pdf"),
    meta=None,
    encoding="UTF-8",
)[0]

In [50]:
# Run the converted document through the preprocessor; `cleaned` is the
# collection of documents that the question-generation loop iterates over.
cleaned = preprocessor.process(documents=[extracted])

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

In [39]:
# Empty result table defining the output schema: one column each for the
# generated question, its answer text, and the answer's context.
data_frame = pd.DataFrame(
    columns=[
        "question",
        "answer",
        "context",
    ],
)


In [None]:
# Generate question/answer pairs for every preprocessed passage.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. Calling pd.concat on every iteration copies the whole table each time
# (quadratic cost) and appends duplicate rows whenever this cell is re-run.
qa_records = []
for passage in cleaned:
    result = qag_pipeline.run(documents=[passage])
    queries = result["queries"]
    answer_lists = result["answers"]

    if len(queries) != len(answer_lists):
        # Fail loudly rather than pair questions with the wrong answers.
        raise ValueError(
            f"Got {len(queries)} queries but {len(answer_lists)} answer lists"
        )

    for question, candidates in zip(queries, answer_lists):
        if not candidates:
            # No candidate answers for this question: skip it instead of
            # raising IndexError on an empty list.
            continue
        # Keep the highest-scoring candidate; on ties the first one wins,
        # which matches a stable descending sort followed by [0].
        best = max(candidates, key=lambda candidate: candidate.score)
        qa_records.append(
            {"question": question, "answer": best.answer, "context": best.context}
        )

# Rebuild the table from scratch using the column layout of the existing
# `data_frame`, so the schema is defined in one place and re-running this
# cell yields the same result.
data_frame = pd.DataFrame(qa_records, columns=data_frame.columns)


In [44]:
# Write the question/answer/context table to disk as UTF-8 CSV, without the
# DataFrame index column.
data_frame.to_csv(
    "../data/processed/sustainability-report-2020.csv",
    encoding="utf-8",
    index=False,
)