In [2]:
import json
import fitz
import re
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, joined
        with no separator. An empty document yields ``""``.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )
    finally:
        # Release the document handle even if a page fails to extract.
        doc.close()

def clean_text(text):
    """Lower-case text, drop non-ASCII characters and normalize spacing.

    Args:
        text: Raw text to normalize.

    Returns:
        The lower-cased text with every whitespace run reduced to a single
        space, all non-ASCII characters removed, and no leading or trailing
        whitespace.
    """
    text = text.lower()
    # Collapse whitespace first: ``\s`` is Unicode-aware, so non-ASCII spaces
    # (e.g. NBSP) become a plain space instead of being deleted below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Deleting a non-ASCII run that sat between two spaces ("a — b") leaves
    # them adjacent, so collapse once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

def segment_text(text):
    """Split text into sentence-like pieces.

    A split happens at every run of spaces that directly follows ``.``,
    ``!`` or ``?``; the punctuation stays attached to the piece before it.

    Returns:
        The non-empty pieces, in their original order.
    """
    pieces = re.split(r'(?<=[.!?]) +', text)
    # filter(None, ...) discards the empty strings the split can produce.
    return list(filter(None, pieces))

def generate_question(segment, model, tokenizer):
    """Generate one question for a text segment with a seq2seq model.

    Args:
        segment: Text the question should be about.
        model: Model exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``encode(...)`` and ``decode(...)``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # NOTE(review): some questions in the recorded run echo this prefix
    # ("What are the questions that are generated from the following text?"),
    # so the prompt may not match the format the checkpoint was trained on —
    # confirm against the model card before changing it.
    input_text = f"generate questions from the following text: {segment}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # The caller may have moved the model to CUDA; the inputs must live on the
    # same device or generate() fails with a device-mismatch error.
    input_ids = input_ids.to(model.device)
    outputs = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Holds the question-answering pipeline once it has been built, so the model
# is loaded a single time instead of once per extract_answer() call.
_qa_pipeline_cache = {}


def _get_qa_pipeline():
    """Return the shared question-answering pipeline, building it on first use."""
    if "qa" not in _qa_pipeline_cache:
        _qa_pipeline_cache["qa"] = pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            tokenizer="deepset/roberta-base-squad2",
        )
    return _qa_pipeline_cache["qa"]


def extract_answer(question, context):
    """Extract the answer to a question from a context passage.

    Args:
        question: The question text.
        context: The passage the answer is extracted from.

    Returns:
        The ``'answer'`` field of the pipeline's result.
    """
    result = _get_qa_pipeline()(question=question, context=context)
    return result['answer']

def main(pdf_path, output_json_path):
    """Build question/answer pairs from a PDF and save them as JSON.

    Loads the T5 question-generation model and tokenizer, extracts and cleans
    the PDF text, splits it into segments, generates one question and one
    extracted answer per segment, and writes the resulting list of
    ``{'question': ..., 'answer': ...}`` dicts to ``output_json_path``.

    Args:
        pdf_path: Path of the PDF to read.
        output_json_path: Path of the JSON file to write.
    """
    checkpoint = "valhalla/t5-small-qg-hl"
    # Use the GPU when one is available, otherwise stay on the CPU.
    if torch.cuda.is_available():
        target = torch.device('cuda')
    else:
        target = torch.device('cpu')
    question_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    question_model = question_model.to(target)
    question_tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    # extract -> clean -> split into segments
    sentences = segment_text(clean_text(extract_text_from_pdf(pdf_path)))

    records = []
    for sentence in sentences:
        generated = generate_question(sentence, question_model, question_tokenizer)
        # The answer is looked up in the same segment the question came from.
        extracted = extract_answer(generated, sentence)
        records.append({'question': generated, 'answer': extracted})

    with open(output_json_path, 'w') as sink:
        json.dump(records, sink)

# Input report and output file for this run; edit the PDF path to match the
# local setup.
pdf_path = "Sustainability_Highlights_2024.pdf"
output_json_path = "qa_pairs.json"
main(pdf_path=pdf_path, output_json_path=output_json_path)

Fetching 0 files: 0it [00:00, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 20360.70it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Device set to use cpu
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10837.99it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Device set to use cpu
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 4917.12it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Device set to use cpu
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12264.05it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Device set to use cpu
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10205.12it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Device set to use cpu
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 8256.50it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Device set to use cpu
Fetching 0 files: 0it [0

In [4]:
import json

# Read back the pairs saved at output_json_path (set in the earlier cell)
# and print each question with its answer.
with open(output_json_path, 'r') as saved_file:
    qa_pairs = json.load(saved_file)

for entry in qa_pairs:
    print(f"Question: {entry['question']}")
    print(f"Answer: {entry['answer']} \n")

Question: What does the planets pathways passion innovation team play sustainability highlights 2024?
Answer: climate biodiversity 

Question: What does sustainability guide our approach to business management while also representing a shared responsibility to future generations?
Answer: our commitment to sustainable development 

Question: What are the questions that are generated from the following text?
Answer: greener and more prosperous future for all. 

Question: What is the text that generates questions from?
Answer: sustainability and innovation are closely linked. 

Question: What type of text does innovation generate questions from?
Answer: digitalization and sustainability. 

Question: What is the text that will inspire our growth?
Answer: strategic plan 

Question: What is the goal of the text?
Answer: to achieve 55% of our revenues from sustainable solutions by 2028 

Question: What is the new goal for biodiversity?
Answer: to achieve a net gain approach across all of our 