In [None]:
import openai
from dotenv import load_dotenv

import os
# Work from the parent of the notebook's directory so the relative paths used
# below resolve. The starting directory is remembered on first execution:
# a bare os.chdir("..") would climb one more level every time this cell is
# re-run in the same kernel.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, ".."))
load_dotenv()


In [None]:
from open_extract.llm import QUESTIONS
from open_extract.data_model import QA
from pathlib import Path

In [None]:
# Chat model queried for every question in this notebook.
model = "o3-mini"
# Shared API client; presumably picks up its credentials from the environment
# populated by load_dotenv() above -- confirm against your .env setup.
client = openai.OpenAI()

In [None]:
def extract(md_path: Path, question: str, client: openai.OpenAI, model: str) -> QA:
    """Answer one question about a paper, using only the paper's own text.

    Args:
        md_path: Markdown file containing the paper's text.
        question: Question to be answered from that text.
        client: OpenAI client used to send the request.
        model: Name of the chat model to query.

    Returns:
        The model's reply parsed into a ``QA`` instance.

    Raises:
        ValueError: If the reply could not be parsed into ``QA``; the model's
            refusal text is appended to the message when one is available.
    """
    # Decode explicitly as UTF-8 so the prompt does not depend on the
    # platform's default encoding (e.g. cp1252 on Windows).
    paper_text = md_path.read_text(encoding="utf-8")

    system_message = {
        "role": "system",
        "content": "You are a research assistant specializing in agriculture, your role is to extract data from academic papers and provide accurate answers based on their findings.",
    }
    user_message = {
        "role": "user",
        "content": f"{paper_text} \n\n Answer this question based on the above information only: {question}",
    }

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[system_message, user_message],
        response_format=QA,
    )

    message = completion.choices[0].message
    if message.parsed is None:
        # Surface the refusal reason (if any) instead of discarding it.
        refusal = getattr(message, "refusal", None)
        detail = f" Model refusal: {refusal}" if refusal else ""
        raise ValueError(f"Failed to extract paper structure.{detail}")
    return message.parsed

In [None]:
def pipeline(
    md_file: Path,
    questions: dict,
    client: openai.OpenAI,
    model: str,
    run_path: Path = Path("runs/openai_qa_250211"),
) -> None:
    """Ask every question about one paper and store the answers as JSON Lines.

    Answers are written to ``<run_path>/<md_file stem>.jsonl``, one JSON
    object per line. If that file already exists the paper is treated as
    already processed and nothing is done.

    Answers are first written to a ``.partial`` sibling file which is renamed
    to the final name only after all questions have been attempted, so an
    interrupted run is retried from scratch instead of being skipped.
    Questions that fail are reported and skipped (best effort); if no
    question could be answered, no output file is left behind.

    Args:
        md_file: Markdown file containing the paper's text.
        questions: Mapping whose values are the questions to ask; the keys
            are only used to label error messages.
        client: OpenAI client passed through to ``extract``.
        model: Name of the chat model passed through to ``extract``.
        run_path: Directory receiving the output file; created if missing.
    """
    run_path.mkdir(exist_ok=True, parents=True)
    output_file = run_path / f"{md_file.stem}.jsonl"

    # An existing final file marks this paper as done.
    if output_file.exists():
        return

    partial_file = output_file.with_name(output_file.name + ".partial")
    answered = 0
    with open(partial_file, "w", encoding="utf-8") as f:
        for label, question in questions.items():
            try:
                answer = extract(md_file, question, client, model)
                # No indent: JSON Lines requires each record on a single line.
                f.write(answer.model_dump_json() + "\n")
            except Exception as e:
                # Keep going with the remaining questions, but say what failed.
                print(f"{md_file.name} [{label}]: {e}")
            else:
                answered += 1

    if answered:
        # Publish the finished file under its final name in one step.
        partial_file.replace(output_file)
    else:
        # Nothing was answered; leave no file so a later run retries the paper.
        partial_file.unlink()


In [None]:
# Process the papers in sorted order: Path.glob yields files in arbitrary
# order, which would make progress and error output differ between runs.
md_dir = Path("data/prototype_250124/mds")
for md_file in sorted(md_dir.glob("*.md")):
    pipeline(md_file, QUESTIONS, client, model)


The results seem better, especially in extracting:
- `study_is_answering_question`
- `confidence`

The qualitative answers seem to be similar.