In [None]:
import os
from tqdm import tqdm

from pathlib import Path

# Switch the working directory to the parent folder; the relative paths used
# further down ("data/...", "runs/...") are resolved from there
# (presumably the repository root — TODO confirm).
# NOTE: not idempotent — re-running this cell moves up one more level each time,
# so run it exactly once per kernel session.
os.chdir("..")

In [None]:
from open_extract.llm import Extractor, QUESTIONS
from open_extract.data_model import QA, Screening

In [None]:
# from open_extract.llm import keep_alive
# keep_alive("deepseek-r1-70b-15k-ctx", host="olvi-1:11434")

In [None]:
# Get all docs that passed the screening criteria

# results = sorted(Path("runs/screening_250205/").glob("*.json"))
# passed = []

# for result in results:
#     screened_doc = Screening.model_validate_json(result.read_text())
#     if screened_doc.study_within_us & screened_doc.is_soybean_study & screened_doc.has_yield_data:
#         passed.append(result)

# len(passed), len(results)

In [None]:
from pydantic import BaseModel

class ExtractedDoc(BaseModel):
    """Container for everything extracted from a single source document.

    Bundles the screening result and the question/answer extractions together
    with simple size statistics about the source text, so that one document's
    results can be serialised to a single JSON file.
    """

    # Identifier of the source document; full_extract sets it to the markdown
    # file's stem and the driver loop uses it to name the output JSON file.
    file_name: str
    # Screening result produced for the document.
    screening: Screening
    # Question/answer extractions that were collected for the document.
    qas: list[QA]
    # Number of words in the source text; optional, defaults to None.
    word_count: int | None = None
    # Whether the source text was judged longer than the word limit applied by
    # the producer; optional, defaults to None.
    over_context_length: bool | None = None


def full_extract(
    md_path: Path,
    model: str = "deepseek-r1-70b-15k-ctx",
    ollama_host: str = "olvi-1:11434",
    max_words: int = 15000,
) -> ExtractedDoc:
    """Run screening and question/answer extraction on one markdown document.

    The document text is passed once to a screening extractor and then once per
    entry in ``QUESTIONS`` to a QA extractor. A question whose extraction raises
    is reported on stdout and skipped, so the returned ``qas`` list may contain
    fewer items than ``QUESTIONS``.

    Note. Very inefficient (one sequential extractor call per question), but it
    is just a prototype. Speed up or parallelize if needed.

    Args:
        md_path: Path to the markdown file to process.
        model: Model name handed to both extractors.
        ollama_host: Ollama host (``host:port``) handed to both extractors.
        max_words: Word count above which the document is flagged as
            ``over_context_length``.

    Returns:
        An ``ExtractedDoc`` holding the file stem, the screening result, the
        successfully extracted QAs, the whitespace-delimited word count and the
        over-context-length flag.

    Raises:
        Any exception raised while reading the file or running the screening
        extractor propagates; only per-question failures are caught.
    """
    screening_extractor = Extractor(model_name=model, ollama_host=ollama_host, target_model=Screening)
    qa_extractor = Extractor(model_name=model, ollama_host=ollama_host, target_model=QA)

    text = md_path.read_text()
    screening = screening_extractor.run(content=text)

    qas = []
    for question in tqdm(QUESTIONS.values()):
        prompt = f"Answer this question {question}. \n\nOnly based on this study information: {text}"
        try:
            qas.append(qa_extractor.run(prompt))
        except Exception as e:
            # Deliberate best-effort: one failed question must not abort the
            # whole document. Report using the document's path stem (the
            # screening result is not a path and has no file name to show).
            print(f"Failed to extract {question} from {md_path.stem}, {e}")

    # Count whitespace-delimited words once so that the stored count and the
    # context-length flag are derived from the same number. split() with no
    # argument also splits on newlines/tabs and ignores repeated whitespace.
    word_count = len(text.split())

    return ExtractedDoc(
        file_name=md_path.stem,
        screening=screening,
        qas=qas,
        word_count=word_count,
        over_context_length=word_count > max_words,
    )


In [None]:
# Materialise and sort the inputs: gives tqdm a known total, makes the
# processing order deterministic, and keeps md_files reusable after the loop
# (a bare glob() generator would be exhausted by the first pass).
md_files = sorted(Path("data/prototype_250124/mds").glob("*.md"))
run_path = Path("runs/qa_250211")

# Create the output directory once, up front, instead of on every iteration.
run_path.mkdir(exist_ok=True, parents=True)

for md_file in tqdm(md_files):
    extracted = full_extract(md_file)
    # One JSON file per document, named after the source file's stem.
    (run_path / f"{extracted.file_name}.json").write_text(extracted.model_dump_json())