In [19]:
import cohere
import os
import PyPDF2
import json
import re
from datetime import datetime

# Cohere client used for the chat request later in the notebook. The API key
# is read from the COHERE_API_KEY environment variable; os.getenv returns
# None when the variable is unset, and that None is passed through as-is.
co = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))

In [20]:
# Extract the text of the PDF into a single string, page after page, with a
# blank line appended after each page's text.
# The file is opened in a `with` block so the handle is closed even when
# PyPDF2 raises while parsing the document or extracting a page.
with open("Draft-OASIS-E All Items_03122020.pdf", "rb") as pdf_obj:
    pdf_reader = PyPDF2.PdfReader(pdf_obj)
    num_pages = len(pdf_reader.pages)
    detected_text = ""

    # NOTE(review): iteration starts at index 1, so the first page (index 0)
    # is never read — presumably a cover page; confirm this is intentional.
    for page_num in range(1, num_pages):
        page_obj = pdf_reader.pages[page_num]
        detected_text += page_obj.extract_text() + "\n\n"

In [21]:
def extract_questions_options(text):
    """Group the lines of ``text`` into question/option records.

    The text is split on newlines and each line is stripped, then classified:

    * a line that *begins* with one of the known footer phrases is skipped;
    * a line that begins with one or two alphanumeric characters followed by
      a period is an option and is attached to the most recent question
      (options seen before any question are discarded);
    * any other line that begins with an uppercase ASCII letter or ends with
      ``?`` starts a new question;
    * everything else is ignored.

    Args:
        text: Raw text to scan, lines separated by ``"\\n"``.

    Returns:
        A list of dicts, in order of appearance, each shaped like
        ``{"question": <line>, "options": [<line>, ...]}``.
    """
    # Question heuristic: starts with a capital letter, or ends with "?".
    question_pattern = re.compile(r"^[A-Z].*|.*\?$")
    # Option heuristic: short alphanumeric label followed by a period.
    option_pattern = re.compile(r"^([A-Za-z0-9]{1,2}|[A-Z]{2})\.")
    # Footer/boilerplate phrases; tested with .match, i.e. at line start only.
    footer_pattern = re.compile(
        r"(Page \d+ of \d+|OASIS-E All Items|OASIS- E All Items|Effective \d{2}/\d{2}/\d{4})|Centers for Medicare & Medicaid Services|Enter Code"
    )

    records = []
    pending = None  # the question currently collecting options, if any

    for raw_line in text.split("\n"):
        stripped = raw_line.strip()

        if footer_pattern.match(stripped):
            continue

        # Options take priority: a line like "B. ..." also satisfies the
        # question heuristic, but must be treated as an option.
        if option_pattern.match(stripped):
            if pending is not None:
                pending["options"].append(stripped)
            continue

        if question_pattern.match(stripped):
            # Flush the previous question before opening a new one.
            if pending is not None:
                records.append(pending)
            pending = {"question": stripped, "options": []}

    # Flush the final question, which has no successor to trigger the flush.
    if pending is not None:
        records.append(pending)

    return records

In [22]:
# Parse the extracted PDF text into a list of question/option records.
questions_options = extract_questions_options(detected_text)

In [23]:
# Serialize the parsed records to a JSON string indented by 4 spaces.
questions_options_json = json.dumps(questions_options, indent=4)

In [24]:
# Write the JSON string to "<YYYY-MM-DD>__questions.json" in the working
# directory, where the date prefix is today's date at the time the cell runs.
# Mode "w" overwrites any file of the same name.
with open(f"{datetime.now().strftime('%Y-%m-%d')}__questions.json", "w") as json_file:
    json_file.write(questions_options_json)

In [25]:
# Re-load the extracted questions from the JSON file this notebook writes.
# The file name is rebuilt with the same "<YYYY-MM-DD>__questions.json"
# pattern used when the file is saved, instead of a hard-coded "2024-05-29"
# date, so a fresh Restart & Run All on any other day reads the file it just
# produced rather than failing or picking up a stale snapshot.
with open(f"{datetime.now().strftime('%Y-%m-%d')}__questions.json", "r") as f:
    file1_data = json.load(f)

In [26]:
# Read the whole of answers.txt into a single string; it is interpolated
# verbatim into the prompt as "File 2".
with open("answers.txt", "r") as f:
    file2_data = f.read()

In [27]:
# Build the matching prompt. It embeds the questions JSON (file1_data,
# re-serialized with 4-space indentation) and the raw answers text
# (file2_data), then asks for a list of question/answer pairs.
# Doubled braces ({{ }}) in the example are f-string escapes that render as
# literal { } in the final prompt.
prompt = f"""
You have been provided two sets of data. One contains questions and options as keys of a JSON object. The second contains text of questions and answers in random order. Your task is to return structured output in the form of question and corresponding answer.

Here are the details:
** File 1 (JSON) **
{json.dumps(file1_data, indent=4)}

** File 2 (text) ** 
{file2_data}

** Instructions ** 
1. Match each question from file 1 with the corresponding answer from file 2.
2. Return the structured output as follows:
   - question: The question text.
   - answer: The correct answer.

** Example output **
[
    {{
        "question": "Are you of Hispanic, Latino/a, or Spanish origin?",
        "answer": "Yes, Cuban" 
    }}
]

Please provide me with the questions and their corresponding answers from both files. Ensure that the questions are paired correctly with their respective answers.
"""

In [28]:
# Role-tagged message list wrapping the prompt (system + user turns).
# NOTE(review): the co.chat call in the visible cells passes `prompt`
# directly via `message=` and does not reference this list — confirm whether
# it is still needed.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

In [29]:
# Send the prompt as a single chat message to the "command-r" model.
# temperature=0.5 is passed explicitly, so the result is presumably not
# guaranteed to be identical across runs — TODO confirm against the API docs.
response = co.chat(
    model="command-r",
    message=prompt,
    temperature=0.5,
)

In [30]:
# Show the model's reply as plain text (print renders embedded newlines).
print(response.text)

Here is the structured output based on your instructions:
```json
[
    {
        "question": "From which of the following Inpatient Facilities was the patient discharged within the past 14 days?",
        "answer": "Short-stay acute hospital (IPPS)"
    },
    {
        "question": "SOC/ROC assessment indicates the patient is not at risk of",
        "answer": "Pressure ulcer treatment based on"
    },
    {
        "question": "Reported or observed within the last 14 days.",
        "answer": "During the day and evening, but not constantly"
    },
    {
        "question": "How often do you feel lonely or isolated from those around you?",
        "answer": "Rarely"
    },
    {
        "question": "Current Payment Sources for Home Care",
        "answer": "Medicaid (traditional fee-for-service)"
    },
    {
        "question": "Has lack of transportation kept you from medical appointments, meetings, work, or from getting things needed for daily living?",
        "answer": "Yes, it h