In [None]:
import os
from tqdm import tqdm

from pathlib import Path
from pydantic import BaseModel

# Work from the parent of the directory the kernel started in, so the relative
# paths used further down (e.g. "data/...", "runs/...") resolve from there.
# The starting directory is remembered on first execution: a bare os.chdir("..")
# would climb one more level every time this cell is re-run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)

In [None]:
from open_extract.llm import OLLAMAExtractor, QUESTIONS
from open_extract.data_model import QA, Screening

In [None]:
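# Optional (disabled): uncomment to call keep_alive for the model on the Ollama host before extracting.
# from open_extract.llm import keep_alive
# keep_alive("deepseek-r1-70b-15k-ctx", host="olvi-1:11434")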

In [None]:

class ExtractedDoc(BaseModel):
    """Everything extracted from one markdown document, as produced by ``full_extract``."""

    # Stem (file name without extension) of the source markdown file.
    file_name: str
    # Screening result returned by the screening extractor for the full text.
    screening: Screening
    # One QA per question that was extracted successfully; failed questions are omitted.
    qas: list[QA]
    # Number of words counted in the document text; optional.
    word_count: int | None = None
    # Whether the document's word count exceeds the context-length threshold
    # applied by ``full_extract``; optional.
    over_context_length: bool | None = None


def full_extract(
    md_path: Path,
    model: str = "deepseek-r1-70b-15k-ctx",
    ollama_host: str = "olvi-1:11434",
    context_length: int = 15000,
) -> ExtractedDoc:
    """Extract QAs and screening result from one markdown file.

    Note. Very inefficient, but it is just a prototype. Speed up or parallelize if needed.

    Args:
        md_path: Markdown file whose full text is sent to the model.
        model: Model name passed to both extractors.
        ollama_host: Ollama host passed to both extractors.
        context_length: Word-count threshold above which the document is
            flagged as ``over_context_length``.
            NOTE(review): this counts whitespace-separated words, not tokens;
            presumably it mirrors the model's 15k context window - confirm.

    Returns:
        An ``ExtractedDoc`` holding the file stem, the screening result, the
        QAs that were extracted successfully, the word count, and the
        over-context flag.

    Raises:
        Any exception raised while reading the file or running the screening
        extraction propagates. Failures on individual questions are printed
        and skipped so one bad answer does not discard the whole document.
    """
    screening_extractor = OLLAMAExtractor(model_name=model, ollama_host=ollama_host, target_model=Screening)
    qa_extractor = OLLAMAExtractor(model_name=model, ollama_host=ollama_host, target_model=QA)

    text = md_path.read_text()
    screening = screening_extractor.run(content=text)

    qas = []
    for question in tqdm(QUESTIONS.values()):
        prompt = f"{text} \n\n Answer this question based on the above information only: {question}"
        try:
            qas.append(qa_extractor.run(prompt))
        except Exception as e:
            # Report the source file: `screening` is the extraction result, not
            # a path, so it is the wrong object to take a file stem from.
            print(f"Failed to extract {question} from {md_path.stem}, {e}")

    # Count once with whitespace splitting so the stored word count and the
    # over-context flag agree; split(" ") would count empty strings between
    # repeated spaces and would not split on newlines.
    word_count = len(text.split())

    return ExtractedDoc(
        file_name=md_path.stem,
        screening=screening,
        qas=qas,
        word_count=word_count,
        over_context_length=word_count > context_length,
    )


In [None]:
# Materialize and sort the inputs: a bare glob() generator gives tqdm no total
# (so no progress percentage or ETA) and yields files in an unspecified order.
md_files = sorted(Path("data/prototype_250124/mds").glob("*.md"))
run_path = Path("runs/qa_250211")

# The output directory does not depend on the file, so create it once up front.
run_path.mkdir(exist_ok=True, parents=True)

for md_file in tqdm(md_files):
    extracted = full_extract(md_file)
    # One JSON file per document, named after the source file's stem.
    (run_path / f"{extracted.file_name}.json").write_text(extracted.model_dump_json(indent=4))