In [None]:
import os
from tqdm import tqdm

from pathlib import Path

# Work from the parent of the directory the kernel was launched in, so the
# relative paths used later in this notebook ("runs/...", "data/...") resolve
# against it (presumably the project root -- confirm).
# The launch directory is recorded only on the first execution: re-running
# this cell therefore does NOT climb one more directory level each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path.cwd()
os.chdir(NOTEBOOK_DIR.parent)

In [None]:
from open_extract.llm import Extractor, QUESTIONS
from open_extract.data_model import QA, Screening

In [None]:
# Get all docs that passed the screening criteria:
# the study is within the US, it is a soybean study, and it has yield data.

results = sorted(Path("runs/screening_250205/").glob("*.json"))
passed = []

for result in results:
    screened_doc = Screening.model_validate_json(result.read_text())
    # Logical `and` (not bitwise `&`): it short-circuits and treats a missing
    # (None) flag as "not passed" instead of raising a TypeError.
    if (
        screened_doc.study_within_us
        and screened_doc.is_soybean_study
        and screened_doc.has_yield_data
    ):
        passed.append(result)

# Displayed as the cell output: (number passed, number screened).
len(passed), len(results)

In [None]:
# Preview the first three screening-result paths that passed the filter.
passed[:3]

In [None]:
from pydantic import BaseModel, Field

class ExtractedDoc(BaseModel):
    """Screening metadata for one document together with its extracted QA answers.

    Instances are built by `extract`: the bibliographic and screening fields are
    copied from the document's `Screening` record, and the remaining fields are
    derived from the document's markdown text.
    """

    # Stem (file name without extension) of the screening-result JSON file.
    file_name: str

    # Bibliographic metadata, copied unchanged from the Screening record.
    title: str
    authors: list[str]
    publication_date: str
    publication_year: int
    publication_name: str
    publication_doi: str

    # Screening outcome, copied unchanged from the Screening record.
    study_within_us: bool
    study_location: str
    is_soybean_study: bool
    has_yield_data: bool

    # One QA per question that was answered successfully; questions whose
    # extraction failed are omitted by `extract`.
    qas: list[QA]

    # Number of whitespace-separated tokens in the document's markdown text.
    word_count: int
    # True when word_count exceeds the threshold used by `extract` (60000).
    over_context_length: bool


# Shared extractor used by `extract`; each run is expected to yield a QA object
# (target_model=QA) from the model served at the given Ollama host.
extractor = Extractor(
    model_name="long-context-deepseek",
    ollama_host="olvi-1:11434",
    target_model=QA,
)


def extract(
    screening_result_path: Path,
    md_dir: Path = Path("data/screening_250205/mds/"),
    max_words: int = 60000,
) -> ExtractedDoc:
    """Extract QAs from a screening result. (e.g., article.json)

    The screening JSON is loaded for its metadata, the markdown file with the
    same stem is read from `md_dir`, and every question in `QUESTIONS` is put
    to the module-level `extractor` against that text.

    Args:
        screening_result_path: Path to a screening-result JSON file.
        md_dir: Directory holding the markdown version of each document,
            named `<stem>.md`.
        max_words: Word-count threshold above which the document is flagged
            as `over_context_length`.

    Returns:
        An ExtractedDoc with the screening metadata, the QAs that were
        extracted successfully, and the document's word count.

    Raises:
        OSError: if the screening JSON or the markdown file cannot be read.
        Validation errors from `Screening.model_validate_json` propagate.

    Note:
        Extraction is best-effort per question: a failure is printed and that
        question is skipped, so `qas` may be shorter than `QUESTIONS`.
    """
    screening = Screening.model_validate_json(screening_result_path.read_text())

    doc_name = screening_result_path.stem
    md_file = Path(md_dir) / f"{doc_name}.md"
    doc_text = md_file.read_text()
    # Whitespace-separated token count, computed once and reused below.
    word_count = len(doc_text.split())

    qas = []
    for question in tqdm(QUESTIONS.values()):
        prompt = f"Answer this question {question}. \n\nOnly based on this study information: {doc_text}"
        try:
            qas.append(extractor.run(prompt))
        except Exception as e:
            # Report using the path's stem: `screening` is the Screening model,
            # not a Path, so `screening.stem` would itself raise inside this
            # handler and abort the remaining questions.
            print(f"Failed to extract {question} from {doc_name}, {e}")

    return ExtractedDoc(
        file_name=doc_name,
        title=screening.title,
        authors=screening.authors,
        publication_date=screening.publication_date,
        publication_year=screening.publication_year,
        publication_name=screening.publication_name,
        publication_doi=screening.publication_doi,
        study_within_us=screening.study_within_us,
        study_location=screening.study_location,
        is_soybean_study=screening.is_soybean_study,
        has_yield_data=screening.has_yield_data,
        qas=qas,
        word_count=word_count,
        over_context_length=word_count > max_words,
    )



In [None]:
# Run the extraction on the first document that passed screening.
# Raises IndexError if `passed` is empty.
extracted_qa = extract(passed[0])

In [None]:
# Serialize the extracted document as 4-space-indented JSON and write it to
# "full.json" in the current working directory.
serialized_doc = extracted_qa.model_dump_json(indent=4)
Path("full.json").write_text(serialized_doc)