In [4]:
from openai import OpenAI
import os
from PyPDF2 import PdfReader
import glob
from tqdm import tqdm
import json
import random
from dotenv import load_dotenv, find_dotenv
# Load settings from the local .env file located by find_dotenv(); the return
# value is deliberately discarded.
_ = load_dotenv(find_dotenv())  # read local .env file

# API client used for every chat-completion request in this cell.
# NOTE(review): no credentials are passed explicitly, so they presumably come
# from the environment prepared by load_dotenv above — confirm.
client = OpenAI()

# System message: states the model's role, the JSON shape to emit (question
# text as key, answer as value) and the rules every question/answer pair must
# follow. The text is sent to the model verbatim, so editing it — including
# spelling or whitespace — changes the prompt.
system_prompt = \
"""## Roal
You are a helpful financial report assistant designed to read annual financial reports and output question-answer pairs in JSON.
You should mimic a person who wants to ask questions based on the report content.

## Output JSON format
{
  "question_1": "Answer1",
  "question_2": "Answer2",
  "question_3": "Answer3", 
  ...
}

# Rule
1. the question should specify the company name and years.
2. questions and answers need to be related to the specific content of the company's annual financial report.
3. the answer is better to be a number or phrase that can be easily verified.
4. the question could not mention the Annual Report.
"""
# User-message prefix requesting 3 question/answer pairs; the processing loop
# below appends the extracted report text directly after it.
user_prompt = """
Based on the annual finatial report, generate 3 questions with specific answers that you pretty sure are correct. The answer is better to be a number or phrase that can be easily verified.
The content of the annual finatial report is in below:
"""

# Build the single-report QA dataset.
#
# For every PDF under ./docs: extract the cover page plus a randomly placed
# window of PAGE_WINDOW consecutive pages, ask the model for question/answer
# pairs about that text, and checkpoint everything collected so far to
# QA_dataset_v2.json (so a failure on a later file does not lose earlier work).
#
# Each stored record is the model's JSON object (question -> answer) with two
# extra keys: "filename" (source PDF path) and "contexts" (the text sent).
PAGE_WINDOW = 10  # number of consecutive pages sampled from each report

responses = []
# glob returns matches in arbitrary order; sorting makes the record order
# reproducible between runs.
for pdf in tqdm(sorted(glob.glob("./docs/*.pdf"))):
    pdf_reader = PdfReader(pdf)
    text = ""
    # randint is inclusive on both ends, so the window may start anywhere from
    # page 0 up to the last position that still fits PAGE_WINDOW pages.
    # NOTE: the start page is random and unseeded, so contexts differ per run.
    start_page = random.randint(0, max(1, len(pdf_reader.pages) - PAGE_WINDOW))
    for i, page in enumerate(pdf_reader.pages):
        if i >= start_page + PAGE_WINDOW:
            break  # past the sampled window; nothing further is needed
        # Page 0 (cover) is always included, in addition to the window itself.
        if i == 0 or i >= start_page:
            # Guard against pages that yield no text at all.
            text += page.extract_text() or ""

    if not text.strip():
        # Image-only / scanned PDFs give the model nothing to read; asking
        # anyway would store invented answers as ground truth.
        tqdm.write(f"No extractable text in {pdf}; skipping.")
        continue

    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + text},
        ],
    )
    resp = json.loads(response.choices[0].message.content)
    resp["filename"] = pdf
    resp["contexts"] = text
    responses.append(resp)

    # Writing JSON data (rewritten after every report as a checkpoint)
    with open('QA_dataset_v2.json', 'w') as file:
        json.dump(responses, file, indent=4)

100%|██████████| 15/15 [02:31<00:00, 10.12s/it]


In [3]:
# generate more advanced QA pairs

from openai import OpenAI
import os
from PyPDF2 import PdfReader
import glob
from tqdm import tqdm
import json
import random
from dotenv import load_dotenv, find_dotenv
# Load settings from the local .env file located by find_dotenv(); the return
# value is deliberately discarded.
_ = load_dotenv(find_dotenv())  # read local .env file

# API client used for every chat-completion request in this cell.
# NOTE(review): no credentials are passed explicitly, so they presumably come
# from the environment prepared by load_dotenv above — confirm.
client = OpenAI()

# System message for the multi-report variant: same role, JSON shape and rules
# as a single-report prompt, plus the requirement (stated in the role section
# and again as rule 5) that each question needs information from two reports.
# The text is sent to the model verbatim, so editing it — including spelling
# or whitespace — changes the prompt.
system_prompt = \
"""## Roal
You are a helpful financial report assistant designed to read several annual financial reports and output question-answer pairs in JSON.
The question should only be anwerred using two annual financial reports' information.
You should mimic a person who wants to ask questions based on the report content.

## Output JSON format
{
  "question_1": "Answer1",
  "question_2": "Answer2",
  "question_3": "Answer3", 
  ...
}

# Rule
1. the question should specify the company name and years.
2. questions and answers need to be related to the specific content of the company's annual financial report.
3. the answer is better to be a number or phrase that can be easily verified.
4. the question could not mention the Annual Report.
5. The question and anwer should use two annual financial reports' information.
"""
# User-message prefix requesting 10 question/answer pairs; the processing loop
# below appends the concatenated, per-report sections directly after it.
user_prompt = """
Based on the annual finatial report, generate 10 questions with specific answers that you pretty sure are correct. The answer is better to be a number or phrase that can be easily verified.

"""

# Build the multi-report QA dataset.
#
# For every company prefix: collect the first PAGE_WINDOW pages of each
# matching PDF under ./docs, concatenate them into one numbered context, ask
# the model for question/answer pairs spanning those reports, and checkpoint
# everything collected so far to QA_dataset_v4.json.
#
# Each stored record is the model's JSON object (question -> answer) plus:
#   "filename"  - path of the last report used (kept for backward compatibility)
#   "filenames" - paths of every report that contributed to "contexts"
#   "contexts"  - the full text sent to the model
PAGE_WINDOW = 10  # pages taken from the start of each report

company_names = ["HW", "McDonald", "OCBC", "shell"]
responses = []
for cn in company_names:
    # glob returns matches in arbitrary order; sorting keeps the report
    # numbering (k) in the prompt stable between runs.
    pdf_paths = sorted(glob.glob(f"./docs/{cn}*.pdf"))
    if not pdf_paths:
        # Without this guard `pdf` below would be undefined (or left over from
        # a previous company) and the model would be queried with no content.
        tqdm.write(f"No PDF found for {cn!r} under ./docs; skipping.")
        continue

    content = ""
    # tqdm wraps the list itself (not enumerate) so the bar knows its total.
    for k, pdf in enumerate(tqdm(pdf_paths, desc=cn)):
        pdf_reader = PdfReader(pdf)
        text = ""
        start_page = 0  # fixed window: always the first PAGE_WINDOW pages
        for i, page in enumerate(pdf_reader.pages):
            if i >= start_page + PAGE_WINDOW:
                break  # past the window; no need to visit remaining pages
            if i == 0 or i >= start_page:
                # Guard against pages that yield no text at all.
                text += page.extract_text() or ""
        # The indentation inside this literal is part of the prompt text.
        content += f"""## The content of the annual finatial report {k} is in below:
        {text}
        
        """
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + content},
        ],
    )
    resp = json.loads(response.choices[0].message.content)
    resp["filename"] = pdf  # last report only; see "filenames" for all sources
    resp["filenames"] = pdf_paths
    resp["contexts"] = content
    responses.append(resp)

    # Writing JSON data (rewritten after every company as a checkpoint)
    with open('QA_dataset_v4.json', 'w') as file:
        json.dump(responses, file, indent=4)

4it [00:00,  9.88it/s]
