In [29]:
# Haystack model for question answering generation
from haystack.nodes import PDFToTextConverter, PreProcessor
from haystack.pipelines import QuestionAnswerGenerationPipeline
from haystack.nodes import QuestionGenerator
from haystack.nodes import TransformersReader

# Library for path handling
import pathlib as pl

# Library for data handling
import pandas as pd


In [None]:
# PDF -> text converter (configured with remove_numeric_tables enabled)
converter = PDFToTextConverter(remove_numeric_tables=True)

# Cleaning/splitting options for the preprocessor: the text is cleaned and
# then split into chunks of 4 sentences with no overlap between chunks.
preprocessor_options = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="sentence",
    split_length=4,
    split_respect_sentence_boundary=False,
    split_overlap=0,
)
preprocessor = PreProcessor(**preprocessor_options)

# Extractive reader that answers the generated questions.
# NOTE(review): depending on the installed haystack version `use_gpu=1` is
# either a truthy flag or a GPU ordinal -- confirm which one is intended.
reader_model_name = "deepset/roberta-base-squad2"
reader = TransformersReader(reader_model_name, use_gpu=1)

# Question generator, used with its default settings
qg = QuestionGenerator()

# Pipeline that combines the question generator with the reader
qag_pipeline = QuestionAnswerGenerationPipeline(qg, reader)

In [None]:
# Extract the text of the sustainability report from its PDF file.
# The conversion result is indexed with [0], so only the first returned
# document is kept.
report_pdf = pl.Path("../data/raw/sustainability-report-2022.pdf")
# No extra metadata is attached to the document: pass None (the "no metadata"
# value of an optional dict) instead of the boolean False.
extracted = converter.convert(file_path=report_pdf, meta=None, encoding="UTF-8")[0]

In [None]:
# Run the extracted document through the preprocessor; it is handed over as a
# single-element list and the result is what the generation loop iterates on.
documents_to_clean = [extracted]
cleaned = preprocessor.process(documents_to_clean)

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

In [None]:
# Start from an empty results table with one column per collected field
result_columns = ["question", "answer", "context"]
data_frame = pd.DataFrame(columns=result_columns)


In [None]:
# Run question/answer generation on every preprocessed document and collect
# one row per generated question: the question, its best-scoring answer and
# the context attached to that answer.
#
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: this avoids the quadratic cost of concatenating inside the loop and
# makes the cell idempotent (re-running it no longer duplicates rows).
qa_rows = []
for document in cleaned:
    # Generate questions and candidate answers for this document
    result = qag_pipeline.run(documents=[document])
    for question, candidates in zip(result["queries"], result["answers"]):
        # A question without any candidate answer has nothing to record
        if not candidates:
            continue
        # Keep the highest-scoring candidate (first one wins on ties)
        best = max(candidates, key=lambda candidate: candidate.score)
        qa_rows.append(
            {"question": question, "answer": best.answer, "context": best.context}
        )

# Explicit columns keep the schema stable even when no rows were produced
data_frame = pd.DataFrame(qa_rows, columns=["question", "answer", "context"])


NameError: name 'qag_pipeline' is not defined

In [None]:
# Remove every character that is not in the whitelist below (ASCII letters,
# digits, space and a few punctuation marks, plus lowercase č š ž ć đ).
# The pattern is a raw string so that `\-` reaches the regex engine as written
# instead of being an invalid string escape sequence.
# NOTE(review): uppercase Č Š Ž Ć Đ are not whitelisted and are therefore
# stripped as well -- confirm this is intended.
data_frame = data_frame.replace(r'[^a-zA-Z0-9 /.?!,čšžćđ\-%]', '', regex=True)

In [None]:
# Persist the question/answer table as a UTF-8 CSV file without the index column
output_csv = pl.Path("../data/processed/sustainability-report-2022.csv")
data_frame.to_csv(output_csv, index=False, encoding="utf-8")