# Model Output Evaluation Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.


## Step 0) Imports, Path Discovery & Sanity Checks

Load all dependencies, discover the dataset CSV automatically, and validate critical directories.


In [None]:
# Basic Setup
import sys, os, glob, json, subprocess, pickle, datetime, hashlib, warnings, random, requests
from pathlib import Path
import pandas as pd
from typing import List, Dict, Any
from ollama import chat
from docling.document_converter import DocumentConverter, ConversionError
from pypdfium2._helpers.misc import PdfiumError
from google.protobuf.json_format import ParseDict, ParseError
from phenopackets import Phenopacket as ProtoPhenopacket
from json.decoder import JSONDecodeError

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# The project root is taken to be the parent of the current working directory.
project_root        = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder          = os.path.join(project_root, "src")
utils_folder        = os.path.join(project_root, "notebooks", "utils")

print("Project Start:       %s" % project_root)
print("Source Folder:       %s" % src_folder)
print("Utilities Folder:    %s" % utils_folder)

# Patch sys.path BEFORE importing the project modules: the imports below can
# only resolve through these folders when the packages are not installed.
for path in (src_folder, utils_folder):
    if not os.path.isdir(path):
        raise FileNotFoundError(f"Expected folder on PYTHONPATH : {path}")
    if path not in sys.path:
        sys.path.insert(0, path)

print("PYTHONPATH patched with:", src_folder, utils_folder)

# NOTE(review): sys.path changes are not inherited by the `python -m P5.scripts...`
# subprocesses launched later in this cell — confirm they can locate the P5 package.
try:
    from phenopacket import Phenopacket, InvalidPhenopacketError
    from report import Report
    from evaluation import PhenotypeEvaluator
except ImportError as e:
    # Chain the original error so the failing module stays visible in the traceback.
    raise ImportError(f"Could not import project utils: {e}") from e

# define all key paths
pdf_input_directory                 = os.path.join(src_folder, "P5", "scripts", "data", "tmp", "phenopacket_store", "pmid_pdfs")            # scripts/data/tmp/phenopacket_store/pmid_pdfs/
ground_truth_notebooks_directory    = os.path.join(src_folder, "P5", "scripts", "data","tmp", "phenopacket_store","notebooks")              # scripts/data/tmp/phenopacket_store/notebooks/
dataset_csv_path                    = os.path.join(src_folder, "P5", "scripts", "data", "tmp", "PMID_PDF_Phenopacket_list_in_phenopacket_store.csv")

# All experimental outputs go under here
experimental_data_root              = os.path.join(project_root, "experimental-data")
llm_output_directory                = os.path.join(experimental_data_root, "llm_output_dir")                                                # intermediate .txt + raw JSON from LLM
validated_jsons_directory           = os.path.join(experimental_data_root, "validated_jsons")                                               # validated_jsons, the final validated LLM phenopackets
evaluation_report_output_path       = os.path.join(project_root, "reports", "first_report.json")                                            # the evaluation metrics report

# Create any missing output folders
os.makedirs(pdf_input_directory, exist_ok=True)
os.makedirs(ground_truth_notebooks_directory, exist_ok=True)
os.makedirs(os.path.dirname(dataset_csv_path), exist_ok=True)
os.makedirs(llm_output_directory, exist_ok=True)
os.makedirs(validated_jsons_directory, exist_ok=True)
os.makedirs(os.path.dirname(evaluation_report_output_path), exist_ok=True)

# Create the PMIDs pickle file path
pmid_pkl_path = os.path.join(src_folder, "P5", "scripts", "data", "tmp", "pmids.pkl")

# TODO: Figure out why deleting the `ground_truth_notebooks_directory` after creating it works. Maybe because git doesn't let me just overwrite a directory with a clone request
# Before the git pull operation
import shutil

# Clean up existing directory if it exists
target_dir = os.path.join(src_folder, "P5", "scripts", "data", "tmp", "phenopacket_store", "notebooks")
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

# 1. Now run the git pull to clone the "phenopacket-store" GitHub repo into scripts/data/tmp/phenopacket_store
subprocess.run([
    sys.executable, "-m", "P5.scripts.pull_git_files",
    os.path.join(src_folder, "P5", "scripts", "data", "tmp", "phenopacket_store"),
    "https://github.com/monarch-initiative/phenopacket-store.git",
    "notebooks"
], check=True)

print("Stage 1 Complete, Produced %s" % ground_truth_notebooks_directory)

# 2. Scan the just-pulled notebooks for PMID_##### files
subprocess.run([
    sys.executable, "-m", "P5.scripts.create_pmid_pkl",
    os.path.join(src_folder, "P5", "scripts", "data", "tmp", "phenopacket_store", "notebooks"),
    os.path.join(src_folder, "P5", "scripts", "data", "tmp", "pmids.pkl"),
    "--recursive_dir_search",
], check=True)

print("Stage 2 Complete")

# 3. Download PDFs for the PMIDs listed in the pickle file.
# NOTE(review): the trailing "10" is presumably the download cap (the original
# note said "0 = unlimited"), so this does NOT fetch all PDFs — confirm against
# P5.scripts.pmid_downloader and pass "0" if a full download is intended.
subprocess.run([
    sys.executable, "-m", "P5.scripts.pmid_downloader", pmid_pkl_path, pdf_input_directory, "10"
], check=True)

print("Stage 3 Complete")

# 4. Finally, build THE CSV mapping PDFs to the ground-truth JSONs
if not os.path.isfile(dataset_csv_path):
    subprocess.run([
        sys.executable, "-m", "P5.scripts.create_phenopacket_dataset",
        pdf_input_directory,
        ground_truth_notebooks_directory,
        dataset_csv_path,
        "--recursive_ground_truth_dir", "True"
    ], check=True)
    print(f"Created dataset CSV at {dataset_csv_path}")

    print("Stage 4 Complete")

    if not os.path.isdir(pdf_input_directory):
        raise FileNotFoundError("PDF input directory not found: %s" % pdf_input_directory)
    if not os.path.isdir(ground_truth_notebooks_directory):
        raise FileNotFoundError("Ground truth notebooks directory not found: %s" % ground_truth_notebooks_directory)

print("PDF inputs folder:               %s" % pdf_input_directory)
print("Ground truth folder:             %s" % ground_truth_notebooks_directory)
print("Dataset CSV path:                %s" % dataset_csv_path)
print("Experimentally generated files:  %s" % experimental_data_root)
print("LLM outputs folder:              %s" % llm_output_directory)
print("Validated JSONs folder:          %s" % validated_jsons_directory)
print("Evaluation report path:          %s" % evaluation_report_output_path)

print("hello0")  # print hello 0 as a sanity check

## Step 1) Load Dataset

Read the CSV of PMIDs, input paths, and truth paths


In [None]:
# Load the dataset CSV (one row per PMID / input PDF / ground-truth JSON)
dataframe_cases = pd.read_csv(dataset_csv_path)
print(f"Loaded {len(dataframe_cases)} rows from dataset CSV")

# Verify required columns first: everything below indexes into them, so a
# malformed CSV should fail here with a clear message instead of a pandas KeyError.
required_columns = {"pmid", "input", "truth"}
missing_columns = required_columns - set(dataframe_cases.columns)
if missing_columns:
    raise KeyError("Missing required columns: %s" % missing_columns)

# Record the row count before deduplication so the number removed can be reported
orig_count = len(dataframe_cases)
print(f"Before deduplication: {orig_count} total cases")

# Debug: verify that every `input` path actually exists
print("Checking existence of input PDFs:")
for pdf_path in dataframe_cases["input"]:
    # Empty CSV cells load as NaN (a float), which os.path.isfile rejects with TypeError.
    is_present = isinstance(pdf_path, str) and os.path.isfile(pdf_path)
    status = "FOUND" if is_present else "MISSING"
    print(f"  o {pdf_path}: {status}")

# Drop duplicate PMIDs, keeping the first row for each.
# TODO: this may be too aggressive — check whether one row per PMID is the right approach.
dataframe_cases = dataframe_cases.drop_duplicates(subset="pmid", keep="first").reset_index(drop=True)
removed = orig_count - len(dataframe_cases)
print(f"{removed} duplicates removed (now {len(dataframe_cases)} unique PMIDs)")

# Preview first few rows (a bare expression in the middle of a cell is not displayed)
print(dataframe_cases.head())


print("PDF inputs folder:               %s" % pdf_input_directory)
print("Ground truth folder:             %s" % ground_truth_notebooks_directory)
print("Dataset CSV path:                %s" % dataset_csv_path)
print("Experimentally generated files:  %s" % experimental_data_root)
print("LLM outputs folder:              %s" % llm_output_directory)
print("Validated JSONs folder:          %s" % validated_jsons_directory)
print("Evaluation report path:          %s" % evaluation_report_output_path)
print("hello1")  # print hello 1 as a sanity check

## Step 2) Discover Phenopacket-Store Files

Locate all ground-truth Phenopacket JSON files under the `phenopacket_store/notebooks/` directory.

In [None]:
# Finding all ground-truth phenopacket JSON files
search_pattern = os.path.join(ground_truth_notebooks_directory, "*", "phenopackets", "*.json")
truth_json_filepaths = glob.glob(str(search_pattern), recursive=True)
if not truth_json_filepaths:
    raise FileNotFoundError(f"No ground-truth JSONs found at {search_pattern}")

print("Discovered %d ground-truth JSON files" %len(truth_json_filepaths))


print("Search Pattern:                  %s" % search_pattern)
print("The phenopacket-store path:      %s" % truth_json_filepaths)
print("PDF inputs folder:               %s" % pdf_input_directory)
print("Ground truth folder:             %s" % ground_truth_notebooks_directory)
print("Dataset CSV path:                %s" % dataset_csv_path)
print("Experimentally generated files:  %s" % experimental_data_root)
print("LLM outputs folder:              %s" % llm_output_directory)
print("Validated JSONs folder:          %s" % validated_jsons_directory)
print("Evaluation report path:          %s" % evaluation_report_output_path)
print("hello2")  # print hello 2 as a sanity check

## Step 3) Prepare PDF-to-Text Converter

- Randomly pick N unique PMIDs from the CSV (from Step 1) to keep runs fast and reproducible.
- Instantiate DocumentConverter and define a helper function to load or convert the clinical PDFs for LLM input.
- Setup Persistent PDF-to-Text Cache

In [None]:
# Cache Integrity & Versioning
CACHE_DIR = Path(experimental_data_root) / "text_cache"
INDEX_FILE = CACHE_DIR / "cache_index.json"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Reload the deduplicated CSV from Step 1
full_df = pd.read_csv(dataset_csv_path).drop_duplicates(subset="pmid").reset_index(drop=True)

# Choose how many cases to sample
N = 10
# Don’t ask for more than exist
N = min(N, len(full_df))
subset_df = full_df.sample(n=N, random_state=42).reset_index(drop=True)

print(f"Sampling {N} PMIDs:", subset_df["pmid"].tolist())


# Setup conversion for input material to LLM-compatible txt now

# Initialize converter once
pdf_to_text_converter = DocumentConverter()

# Path to persistent cache of PDF text
text_cache_path = os.path.join(experimental_data_root, "text_cache.pkl")
# Load or initialize cache
if os.path.exists(text_cache_path):
    with open(text_cache_path, "rb") as f:
        text_cache = pickle.load(f)
    print(f"Loaded text cache with {len(text_cache)} entries")
else:
    text_cache = {}
    print("Initialized empty text cache")

def load_clinical_pdf(pdf_path: str) -> str:
    """
    Return the plain text of the .txt or .pdf file at "pdf_path".

    The in-memory `text_cache` is consulted first. Newly loaded text is stored
    in that cache; writing the cache to disk is left to the caller.

    Args:
        pdf_path: Path to a .txt file (read as UTF-8) or to a document handed
            to `pdf_to_text_converter`.

    Returns:
        The document text. For .txt inputs, only the part after the last
        "[text]" marker is returned.

    Raises:
        FileNotFoundError: If "pdf_path" is not an existing file.
        ConversionError: If the converter fails on a non-.txt input.
    """
    # Return cache if it exists in memory
    if pdf_path in text_cache:
        return text_cache[pdf_path]

    # Ensure the file exists before we continue
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError("Input file not found: %s" % pdf_path)

    if pdf_path.lower().endswith(".txt"):
        # Already plain text: read it (closing the handle) and drop any leading header
        with open(pdf_path, encoding="utf-8") as text_file:
            content = text_file.read().split("[text]")[-1]
    else:
        try:
            # Convert PDF to text and handle conversion failures
            doc = pdf_to_text_converter.convert(pdf_path)
            content = doc.document.export_to_text()
        except ConversionError as e:
            # Re-raise with the file name, keeping the original error chained
            raise ConversionError(f"Could not convert {os.path.basename(pdf_path)}: {e}") from e

    # Cache both branches so repeated lookups never touch the disk again
    text_cache[pdf_path] = content
    return content

# Convert all PDFs in our sampled subset.
# A missing or unconvertible file is reported and skipped, and the cache is
# written in `finally`, so completed conversions survive a later failure.
failed_conversions = []
try:
    for pdf_path in subset_df["input"]:
        try:
            pdf_text = load_clinical_pdf(pdf_path)
        except (FileNotFoundError, ConversionError, PdfiumError) as e:
            failed_conversions.append({"pdf": pdf_path, "reason": str(e)})
            print(f"  o skipped {pdf_path}: {e}")
finally:
    # Persist updated cache to disk
    with open(text_cache_path, "wb") as f:
        pickle.dump(text_cache, f)

print(f"Saved text cache now with {len(text_cache)} entries")


print("PDF inputs folder:               %s" % pdf_input_directory)
print("Ground truth folder:             %s" % ground_truth_notebooks_directory)
print("Dataset CSV path:                %s" % dataset_csv_path)
print("Experimentally generated files:  %s" % experimental_data_root)
print("LLM outputs folder:              %s" % llm_output_directory)
print("Validated JSONs folder:          %s" % validated_jsons_directory)
print("Evaluation report path:          %s" % evaluation_report_output_path)
print("hello3")  # print hello 3 as a sanity check

## Step 4) Load Clinical PDFs and Ground-Truth Phenopackets

Iterate over each case, load the clinical PDF text and the corresponding ground-truth Phenopacket object.

- `list_input_texts`: plain text extracted from each clinical PDF
- `list_truth_packets`: parsed Phenopacket objects from JSON files
- `list_patient_ids`: PMID patient identifiers



In [None]:
# Build three parallel lists (text, ground-truth packet, patient id), one entry
# per successfully loaded case. Rows that fail at any stage are recorded in
# `skipped_pdfs` and left out of all three lists.
# NOTE(review): this iterates every row of `dataframe_cases`, not the sampled
# `subset_df` from Step 3 — confirm that converting all PDFs is intended.
list_input_texts    = []
list_truth_packets  = []
list_patient_ids    = []
loaded_count = 0
skipped_pdfs = []


for case in dataframe_cases.itertuples(index=False):
    pmid_value = case.pmid
    pdf_path   = case.input
    truth_path = case.truth

    # Convert PDF to text (served from the cache when available)
    try:
        clinical_text = load_clinical_pdf(pdf_path)
    except FileNotFoundError as e:
        # Step 1 already reports MISSING inputs; skip them here instead of aborting the loop
        skipped_pdfs.append({"pmid": pmid_value, "pdf": pdf_path, "reason": f"missing input: {e}"})
        continue
    except (ConversionError, PdfiumError) as e:
        skipped_pdfs.append({"pmid": pmid_value, "pdf": pdf_path, "reason": f"conversion error: {e}"})
        continue

    # Load raw JSON and validate with ignore_unknown_fields
    try:
        with open(truth_path, "r", encoding="utf-8") as truth_file:
            raw_true_packet = json.load(truth_file)
        proto = ProtoPhenopacket()
        ParseDict(raw_true_packet, proto, ignore_unknown_fields=True)
    except (ParseError, json.JSONDecodeError, FileNotFoundError) as e:
        skipped_pdfs.append({"pmid": pmid_value, "truth": truth_path, "reason": f"schema parse error: {e}"})
        continue

    # Wrap in util Phenopacket to ensure phenotypicFeatures exists
    try:
        truth_packet = Phenopacket(raw_true_packet)
    except InvalidPhenopacketError as e:
        skipped_pdfs.append({"pmid": pmid_value, "truth": truth_path, "reason": f"phenopacket invalid: {e}"})
        continue

    list_input_texts.append(clinical_text)
    list_truth_packets.append(truth_packet)
    list_patient_ids.append(truth_packet.to_json()["subject"]["id"])

    loaded_count += 1
    print(f"Loaded {loaded_count} new cases, skipped {len(skipped_pdfs)} so far")

if not list_input_texts:
    raise RuntimeError("No clinical texts were loaded, please check that the dataset CSV `input` paths match files in `pdf_input_directory`")

# The three lists are appended to together, so their lengths must agree
assert len(list_input_texts) == len(list_truth_packets) == len(list_patient_ids)
print("Loaded %d clinical texts and %d ground-truth packets for %d unique patients" % (len(list_input_texts), len(list_truth_packets), len(list_patient_ids)))

print("hello4")  # print hello 4 as a sanity check


## Step 4.5) Define LLM Prompts

Create prompt for just HPO terms and another one for the full phenopacket extraction, as well as some additional helper functions for later/potential use


In [178]:
import re, json, datetime
from json.decoder import JSONDecodeError
from ollama import chat

# 1) My initial prompt for just HPO labels.
# Adjacent string literals are concatenated with nothing in between, so every
# fragment ends in an explicit "\n"; otherwise the instructions run together.
# The example object uses double quotes because the reply is parsed as JSON.
hpo_prompt = (
    "You are a clinical NLP engine specialized in biomedical ontologies. Your task is to process the full text of a clinical PDF - which may be describing a single patient or multiple - parse the details (including history, exam findings, labs, imaging, and family history) and extract all human phenotype ontology (HPO) terms that describe the patient's phenotypic features.\n"
    "Instructions:\n"
    "1. Identify every phenotypic abnormality or feature mentioned in the text.\n"
    "2. For each feature, map it to the correct HPO identifier (e.g. 'HP:0001250'), label (e.g. 'Seizure'), and descriptor value (e.g. 'A seizure is an intermittent abnormality of nervous system physiology characterized by a transient occurrence of signs and/or symptoms due to abnormal excessive or synchronous neuronal activity in the brain.').\n"
    "3. Capture relevant qualifiers when present:\n"
    "    - Onset: map to HPO onset terms (e.g. 'HP:0011463' for 'Childhood onset').\n"
    "    - Severity: map to HPO severity terms (e.g. 'HP:0012829' for 'Profound').\n"
    "    - Temporal pattern: include if specified (e.g. 'HP:0031796' for 'Recurrent', map to HPO frequency terms if available).\n"
    "4. For each term, include the exact text excerpt where it appears.\n"
    "5. Output exclusively a JSON array. Each element must be an object with the following fields:\n"
    "```json\n"
    "{\n"
    '    "hpo_id": "HP:000____",\n'
    '    "hpo_label": "Term label",\n'
    '    "excerpt": "Exact text from the PDF",\n'
    '    "onset_id": "HP:0XXXXX or null",\n'
    '    "severity_id": "HP:0XXXXX or null",\n'
    '    "frequency_id": "HP:0XXXXX or null"\n'
    "}\n"
    "```\n"
    "Do not include any explanatory text, only the JSON array.\n"
    "\n"
    "Your output **MUST** be exactly a JSON array/object and nothing else.\n"
    "\n"
    'If you cannot comply, output exactly: {"error": "cannot extract JSON"}'
    )


# 2) My initial prompt for full phenopackets.
# Every fragment ends in an explicit "\n" because adjacent string literals are
# concatenated with nothing in between.
# NOTE(review): the {{...}} placeholders are sent to the model verbatim; nothing
# in this notebook substitutes them.
full_pp_prompt = (
    "You are a biomedical data curation assistant. Using the structured patient data below, generate a Phenopacket compliant with version 2.0 of the GA4GH Phenopacket schema. Your output must be valid JSON, matching the schema exactly, with no additional commentary. Here are the minimum expected output criteria:\n"
    "\n"
    "Inputs:\n"
    "patient_id: '{{patient_id}}'\n"
    "sex: '{{sex}}'              // 'male' or 'female'\n"
    "age_years: {{age_in_years}} // integer\n"
    "vital_status: '{{vital_status}}' // 'alive' or 'deceased'\n"
    "phenotypic_features: {{phenotypic_features_json}} // JSON array from the HPO extraction prompt\n"
    "diseases: {{diseases_json}}         // optional, array of disease objects with MONDO or OMIM IDs\n"
    "measurements: {{measurements_json}} // optional, array of quantitative trait measurements\n"
    "metadata: {\n"
    "    'created_by': '{{your_name_or_tool}}',\n"
    "    'created_on': '{{YYYY-MM-DD}}'\n"
    "}\n"
    "\n"
    "Requirements:\n"
    "Top-level fields:\n"
    "'id': patient_id\n"
    "'subject': object with:\n"
    "    'id': patient_id\n"
    "    'sex': { 'id': 'PATO:0000383' or 'PATO:0000384', 'label': sex }\n"
    "    'ageAtLastEncounter': { 'age': { 'years': age_years } }\n"
    "    'vitalStatus': { 'value': vital_status }\n"
    "    'phenotypicFeatures': use the phenotypic_features input; for each feature, map:\n"
    "```json\n"
    "{\n"
    "    'type': { 'id': hpo_id, 'label': hpo_label },\n"
    "    'negated': false,\n"
    "    'onset': { 'term': { 'id': onset_id, 'label': (look up label) } },\n"
    "    'severity': { 'term': { 'id': severity_id, 'label': (look up label) } },\n"
    "    'frequency': { 'term': { 'id': frequency_id, 'label': (look up label) } }\n"
    "}\n"
    "```\n"
    "Include 'diseases' and 'measurements' only if provided, following the GA4GH schema.\n"
    "'metadata' must include:\n"
    "```json\n"
    "{\n"
    "    'phenopacketSchemaVersion': '2.0.0',\n"
    "    'created': '{{YYYY-MM-DD}}',\n"
    "    'createdBy': '{{your_name_or_tool}}'\n"
    "}\n"
    "```\n"
    "Do not add any extra fields. Output must be purely the JSON object.\n"
    "\n"
    "Do not include any explanatory text, only the JSON object.\n"
    "\n"
    "Your output **MUST** be exactly a JSON array/object and nothing else.\n"
    "\n"
    "If you cannot comply, output exactly: {'error': 'cannot extract JSON'}"
)

# Simplified prompts


# ---------------- Prompts ----------------
hpo_prompt2 = (
    "You are a clinical NLP engine specialized in biomedical ontologies.\n"
    "Extract ONLY Human Phenotype Ontology (HPO) terms for the patient(s) in the text.\n\n"
    "Output = a single JSON array. Each element MUST have exactly:\n"
    "{\n"
    "  \"hpo_id\": \"HP:0001250\",\n"
    "  \"hpo_label\": \"Seizure\",\n"
    "  \"excerpt\": \"exact text from PDF\",\n"
    "  \"onset_id\": null,\n"
    "  \"severity_id\": null,\n"
    "  \"frequency_id\": null\n"
    "}\n\n"
    "No prose, no markdown, no extra keys. If none exist, return []."
)

HPO_JSON_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "hpo_id":       {"type": ["string", "null"]},
            "hpo_label":    {"type": ["string", "null"]},
            "excerpt":      {"type": ["string", "null"]},
            "onset_id":     {"type": ["string", "null"]},
            "severity_id":  {"type": ["string", "null"]},
            "frequency_id": {"type": ["string", "null"]}
        },
        "required": ["hpo_id", "hpo_label", "excerpt"]
    }
}

full_pp_prompt2 = (
    "You are a biomedical data curation assistant. Using the structured patient data provided, "
    "generate a GA4GH Phenopacket v2.0 JSON. The JSON MUST match the schema and contain no extra commentary.\n\n"
    "Top-level required keys:\n"
    "- 'id' (patient_id)\n"
    "- 'subject' { 'id', 'sex', 'ageAtLastEncounter', 'vitalStatus' (optional) }\n"
    "- 'phenotypicFeatures' (array built from the extracted HPO list)\n"
    "- 'metaData' { 'created', 'createdBy', 'phenopacketSchemaVersion' }\n\n"
    "For each feature:\n"
    "{\n"
    "  'type': { 'id': hpo_id, 'label': hpo_label },\n"
    "  'negated': false,\n"
    "  'onset':    { 'term': { 'id': onset_id } }        (omit if null)\n"
    "  'severity': { 'term': { 'id': severity_id } }     (omit if null)\n"
    "  'frequency':{ 'term': { 'id': frequency_id } }    (omit if null)\n"
    "}\n\n"
    "Return ONLY the JSON object."
)

# ---------------- Helpers ----------------
def slice_json_array(raw_text: str) -> str:
    """
    Return the first balanced "[...]" block found in raw_text.

    Brackets inside double-quoted JSON strings (including strings that contain
    escaped quotes) are ignored, so an excerpt such as "see ref ] 3" does not
    end the array early.

    Raises:
        RuntimeError: If raw_text contains no "[" or the first array is never closed.
    """
    start = raw_text.find("[")
    if start == -1:
        raise RuntimeError(f"No JSON array found in model output:\n{raw_text[:800]}")

    depth = 0
    in_string = False   # currently inside a double-quoted string literal
    escaped = False     # previous character was a backslash inside a string
    for i in range(start, len(raw_text)):
        ch = raw_text[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                return raw_text[start:i + 1]
    raise RuntimeError("Unbalanced brackets in model output.")

def _first_list_in(obj):
    if isinstance(obj, list):
        return obj
    if isinstance(obj, dict):
        for v in obj.values():
            found = _first_list_in(v)
            if found is not None:
                return found
    return None

def extract_hpo_terms(
    text: str,
    prompt: str = hpo_prompt2,
    model: str = "llama3.2:latest",
    max_retries: int = 2,
    debug: bool = False
) -> list[dict]:
    """
    Query the LLM for a JSON array of HPO term dicts and return that list.

    Strategy, in order:
      1. One request constrained by HPO_JSON_SCHEMA.
      2. Up to max_retries + 1 requests in format="json"; each reply is tried
         as-is, then with code fences stripped, then via its first "[...]" block.
      3. Regex fallback: every "HP:" + 7 digit id found in the last reply or in
         the input text itself, returned as skeleton dicts.

    Returns [] when none of the stages produce anything. With debug=True,
    intermediate failures and raw reply heads are printed.
    """

    def _query_model(output_format):
        # One chat round-trip; output_format is either a JSON schema or "json".
        reply = chat(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": text},
            ],
            stream=False,
            format=output_format,
            options={"temperature": 0, "seed": 42, "--hidethinking": True},
        )
        return reply["message"]["content"].strip()

    def _terms_from_json(candidate):
        # Parse candidate; return its first nested list, a lone term dict
        # wrapped in a list, or None when neither applies.
        try:
            parsed = json.loads(candidate)
        except JSONDecodeError:
            return None
        nested = _first_list_in(parsed)
        if nested is not None:
            return nested
        if isinstance(parsed, dict) and "hpo_id" in parsed:
            return [parsed]
        return None

    latest_reply = ""

    # Stage 1: schema-constrained request (any failure falls through to stage 2)
    try:
        latest_reply = _query_model(HPO_JSON_SCHEMA)
        schema_terms = _first_list_in(json.loads(latest_reply))
        if isinstance(schema_terms, list):
            return schema_terms
    except Exception as exc:
        if debug:
            print("Schema mode failed:", repr(exc))

    # Stage 2: plain JSON mode, retried
    for attempt in range(max_retries + 1):
        latest_reply = _query_model("json")
        if debug:
            print(f"[attempt {attempt}] raw[:200]: {latest_reply[:200]!r}")

        # a) the reply as received
        terms = _terms_from_json(latest_reply)
        if terms is not None:
            return terms

        # b) the reply with markdown code fences removed
        unfenced = re.sub(r"^```(?:json)?|```$", "", latest_reply, flags=re.MULTILINE).strip()
        if unfenced != latest_reply:
            terms = _terms_from_json(unfenced)
            if terms is not None:
                return terms

        # c) the first bracketed block inside the reply
        try:
            sliced = json.loads(slice_json_array(latest_reply))
        except Exception:
            continue
        if isinstance(sliced, list):
            return sliced

    # Stage 3: scrape bare HPO ids from the last reply and from the input text
    found_ids = set(re.findall(r"HP:\d{7}", latest_reply))
    found_ids.update(re.findall(r"HP:\d{7}", text))
    if found_ids:
        return [
            {"hpo_id": hp, "hpo_label": None, "excerpt": None,
             "onset_id": None, "severity_id": None, "frequency_id": None}
            for hp in sorted(found_ids)
        ]

    if debug:
        print("No usable JSON or HP IDs found. Returning []. Raw head:", latest_reply[:400])
    return []


def build_phenopacket_from_hpo_list(
    patient_id: str,
    hpo_list: list[dict],
    sex_id: str | None = None,
    age_years: int | None = None,
    vital_status: str | None = None
) -> dict:
    def _mk_term(term_id, label):
        return None if term_id is None else {"id": term_id, "label": label}

    phenotypic_features = []
    for term in hpo_list:
        feat = {"type": _mk_term(term.get("hpo_id"), term.get("hpo_label")), "negated": False}
        if term.get("onset_id"):
            feat["onset"] = {"term": {"id": term["onset_id"]}}
        if term.get("severity_id"):
            feat["severity"] = {"term": {"id": term["severity_id"]}}
        if term.get("frequency_id"):
            feat["frequency"] = {"term": {"id": term["frequency_id"]}}
        phenotypic_features.append(feat)

    subject = {"id": patient_id}
    if sex_id:
        subject["sex"] = {"id": sex_id}
    if age_years is not None:
        subject["ageAtLastEncounter"] = {"age": {"years": int(age_years)}}
    if vital_status:
        subject["vitalStatus"] = {"value": vital_status}

    return {
        "id": patient_id,
        "subject": subject,
        "phenotypicFeatures": phenotypic_features,
        "metaData": {
            "created": datetime.date.today().isoformat(),
            "createdBy": "Varenya",
            "phenopacketSchemaVersion": "2.0.2"
        }
    }

print("hello my little utils")
print("hello4.5")  # sanity check

# ---------- Step 5: sanity check ----------
pmid_0         = dataframe_cases.loc[0, "pmid"]
clinical_text0 = list_input_texts[0]

try:
    hpo_terms_0 = extract_hpo_terms(clinical_text0, debug=True)
    print(f"[{pmid_0}] extracted {len(hpo_terms_0)} HPO terms")
    print(json.dumps(hpo_terms_0[:5], indent=2))
except Exception as e:
    raise RuntimeError(f"HPO extraction failed for {pmid_0}: {e}")

print("hello almost 5")  # sanity check



hello my little utils
hello4.5
[PMID_11381124] extracted 0 HPO terms
[]
hello almost 5


## Step 5. Sanity-check one inference

Run one LLM call on the first case to verify prompting and parsing work correctly.


In [None]:
# pick out the first patient/example
patient_id      = list_patient_ids[0]
clinical_text   = list_input_texts[0]
# truth_packet    = list_truth_packets[0]

# 1) Inference: ask for *only* the JSON array of HPO term objects for the first clinical PDF
# build a strict system+user conversation
messages = [{"role": "system", "content": ( hpo_prompt + "\n\nYour only output must be a **valid** JSON array of  HPO term objects" + "with fields 'hpo_id','hpo_label','excerpt'," + "'onset_id','severity_id','frequency_id', and nothing else.")}, {"role": "user", "content": clinical_text}]

hpo_response = chat(model="llama3.2:latest", messages=messages, options={"--hidethinking": True})

# 2) Grab the raw string
raw_hpo_output = hpo_response["message"]["content"]
print("Raw LLM output (truncated to the first ~300 chars or so):")
print(raw_hpo_output[:300], "...\n")

# 3) Slice out the JSON array
start = raw_hpo_output.find("[")
end = raw_hpo_output.rfind("]")
if start < 0 or end < 0:
    # print the raw output to debug what the model actually sent
    print("===== RAW HPO OUTPUT =====\n", raw_hpo_output)
    raise RuntimeError(f"Could not locate a JSON array in HPO output for patient {patient_id}:\n{raw_hpo_output}")
# grab *only* the array text
hpo_json_array = raw_hpo_output[start : end+1]

# Parse and validate
try:
    # hpo_terms = Phenopacket(json.loads(hpo_json_array))
    hpo_terms = json.loads(hpo_json_array)  # Try not using Phenopacket(...)
    print(f"Parsed {len(hpo_terms)} HPO term(s) for patient {patient_id}")
except JSONDecodeError as error:
    raise ValueError(
        f"Failed to parse HPO JSON array for patient {patient_id}: {error}\n\n"
        f"Extracted JSON was:\n{hpo_json_array}\n\n"
        f"Full raw output was:\n{raw_hpo_output}"
    )

print("hello5")  # print hello 5 as a sanity check

## Step 6. Batch Inference and Save Validated Phenopackets

Loop over all cases, run LLM inference, validate each JSON as a Phenopacket, and save to disk under validated_jsons_directory.


In [None]:
# One predicted Phenopacket per loaded clinical text, in the same order as list_input_texts
predicted_packets: List[Phenopacket] = []

for idx, clinical_text in enumerate(list_input_texts):
    # NOTE(review): idx counts loaded cases only. Step 4 drops skipped rows from
    # list_input_texts, so dataframe_cases.loc[idx] can point at a different
    # row than this text once anything has been skipped — verify the PMID used here.
    pmid_value = dataframe_cases.loc[idx, "pmid"]
    patient_id = list_patient_ids[idx]
    # Prompt the LLM to extract only that patient's HPO terms
    content = (hpo_prompt + f"\n\n*Extract only the HPO terms for patient* `{patient_id}` *in this clinical PDF.*\n\n" + clinical_text + "\n\n[EOS]")
    response = chat(model="llama3.2:latest", messages=[{"role": "user", "content": content}], options={"--hidethinking": True})
    raw_llm_output = response["message"]["content"]

    try:
        # Take the reply as-is when it is pure JSON; otherwise cut the first
        # [...] block out of any surrounding prose or code fences.
        try:
            parsed_output = json.loads(raw_llm_output)
        except JSONDecodeError:
            parsed_output = json.loads(slice_json_array(raw_llm_output))

        if isinstance(parsed_output, dict):
            # The model returned an object: treat it as a phenopacket, as before
            candidate_packet = parsed_output
        else:
            # hpo_prompt asks for an array of HPO terms, which is not itself a
            # phenopacket, so wrap the terms in a phenopacket structure first
            candidate_packet = build_phenopacket_from_hpo_list(patient_id, parsed_output)

        phenopacket_pred = Phenopacket(candidate_packet)
    except Exception as error:
        raise RuntimeError("[Case %d, PMID %s] Invalid Phenopacket JSON: %s" % (idx, pmid_value, error)) from error

    predicted_packets.append(phenopacket_pred)

    # Write the predicted JSON to disk
    output_filename = f"{pmid_value}_{patient_id}.json"
    output_filepath = os.path.join(validated_jsons_directory, output_filename)
    with open(output_filepath, "w", encoding="utf-8") as out_f:
        json.dump(phenopacket_pred.to_json(), out_f, indent=2)
    print("Saved predicted phenopacket for PMID/Patient %s/%s to %s"
          % (pmid_value, patient_id, output_filepath))

# Every input must have produced exactly one prediction for Step 7's pairing to be valid
if len(predicted_packets) != len(list_input_texts):
    raise RuntimeError("Number of predictions does not match number of inputs.")

print(f"Generated {len(predicted_packets)} predicted phenopackets.")

print("hello6")  # print hello 6 as a sanity check

## Step 7. Evaluate Predicted Phenopackets Against Ground Truth

Compare each predicted phenopacket to its ground truth using PhenotypeEvaluator, then generate a Report object with overall metrics.


In [None]:
# Monkey-patch a convenience method onto PhenotypeEvaluator
def _evaluate_batch(
    self,
    list_truth_packets,
    list_predicted_packets,
    creator,
    experiment,
    model,
    zero_division=0.0
):
    """
    Run check_phenotypes over all truth/pred pairs, then return
    a plain-dict report containing confusion_matrix, metrics,
    classification_report, and metadata.
    """
    # Accumulate counts
    for truth_pkt, pred_pkt in zip(list_truth_packets, list_predicted_packets):
        self.check_phenotypes(
            experimentally_extracted_phenotypes=pred_pkt.list_phenotypes(),
            ground_truth_phenotypes=truth_pkt
        )
    # Build a Report object
    rpt = self.report(
        creator=creator,
        experiment=experiment,
        model=model,
        zero_division=zero_division
    )
    # Return a dict for easy indexing
    return {
        "confusion_matrix": rpt.confusion_matrix,
        "metrics": rpt.metrics,
        "classification_report": rpt.classification_report,
        "metadata": rpt.metadata,
    }

# Expose the helper as a regular method on the evaluator class
setattr(PhenotypeEvaluator, "evaluate_batch", _evaluate_batch)

# Descriptive fields passed through to the evaluator for this run
report_metadata = {
    "creator": "Varenya",
    "experiment": "Phenopacket LLM Extraction",
    "model": "llama3.2:latest",
}

# Score every predicted packet against its ground truth in one call
evaluator = PhenotypeEvaluator()
batch_report = evaluator.evaluate_batch(
    list_truth_packets,
    predicted_packets,
    **report_metadata,
)

# Fail loudly if the report does not carry the metrics section
if not ("metrics" in batch_report):
    raise KeyError("Evaluator report missing 'metrics' field.")

# Dump the whole report in a readable layout
import pprint
pprint.pprint(batch_report)

print("hello7")  # sanity-check marker for this cell

## (Old) Save First Report

Write the JSON report to disk for later analysis.


In [None]:
# Ensure output directory exists.
# os.path.dirname() returns "" when the report path has no directory part
# (i.e. the file goes into the current working directory), and
# os.makedirs("") raises FileNotFoundError -- so only create it when present.
out_dir = os.path.dirname(evaluation_report_output_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Serialise the report dict as pretty-printed UTF-8 JSON.
with open(evaluation_report_output_path, "w", encoding="utf-8") as f:
    json.dump(batch_report, f, indent=2)

print(f"Saved evaluation report to {evaluation_report_output_path}")

print("hello7")  # print hello 7 as a sanity check

# Old Inference Implementation

In [None]:
# Instruction prepended to every clinical text in the old inference flow.
prompt = (
    "Please create a valid Phenopacket from the following text. "
    "The phenopackets needs to be in a valid json format.  "
    "Only return the phenopacket without any additional text:"
)

# Model identifier handed to `chat` in the old inference flow.
model = "hf.co/MaziyarPanahi/gemma-3-12b-it-GGUF:Q4_K_M"

In [None]:
# First pass: ask the model for a phenopacket for the FIRST input text only
# (the loop exits after one iteration).
for text in input_data:
    response = chat(
        model=model,
        messages=[{"role": "user", "content": f"{prompt} {text} [EOS]"}],
        options={"--hidethinking": True}
    )
    break
else:
    # Without this, an empty input would surface as a NameError on `response`.
    raise ValueError("input_data is empty; there is no text to run inference on.")

# Drop any reasoning preamble and markdown fences from the first reply.
# Computed outside the f-string because reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
draft_json = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")

# Second pass: ask the model to validate (and if necessary repair) its own JSON.
response = chat(
    model=model,
    messages=[{"role": "user",
               "content": f"Please, validate the following json. If not, fix it. Only return the json without any additional information. Should the json be wrong, you will get shut down. Json: {draft_json} [EOS]"}],
    options={"--hidethinking": True}
)


In [None]:
from IPython.display import JSON

# Keep only what follows the last "</think>" marker, remove markdown code
# fences, and render the remainder with the notebook's JSON viewer.
reply_text = response["message"]["content"]
reply_text = reply_text.split("</think>")[-1]
reply_text = reply_text.replace("```json", "").replace("```", "")
JSON(reply_text)

In [None]:
# Render the current reply again: text after the last "</think>" marker,
# with markdown code fences removed.
fence_free_reply = response["message"]["content"].split("</think>")[-1]
fence_free_reply = fence_free_reply.replace("```json", "").replace("```", "")
JSON(fence_free_reply)

In [None]:
# Show the cleaned reply as a plain string: text after the last "</think>"
# marker, with markdown code fences removed.
raw_reply = response["message"]["content"].split("</think>")[-1]
raw_reply = raw_reply.replace("```json", "").replace("```", "")
raw_reply

## 4.5.5 - Revised Strict Prompt

- Replaces the previous multi-prompt setup with a single function: `extract_hpo_terms_with_ollama()` returns **only** the array of HPO terms, parsed from the model's JSON output into a Python list of dicts.

In [None]:
# JSON schema passed as `format` to the chat call: an array of objects whose
# fields are strings or null; only the first three fields are required.
HPO_JSON_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "hpo_id":       {"type": ["string","null"]},
            "hpo_label":    {"type": ["string","null"]},
            "excerpt":      {"type": ["string","null"]},
            "onset_id":     {"type": ["string","null"]},
            "severity_id":  {"type": ["string","null"]},
            "frequency_id": {"type": ["string","null"]}
        },
        "required": ["hpo_id", "hpo_label", "excerpt"]
    }
}

# System prompt describing the exact element shape the model must emit.
HPO_PROMPT = (
    "You are a clinical NLP engine specialized in biomedical ontologies. "
    "Extract ONLY Human Phenotype Ontology (HPO) terms for the patient(s) in the text.\n\n"
    "Output = a single JSON array. Each element MUST have exactly:\n"
    "{\n"
    "  \"hpo_id\": \"HP:0001250\",\n"
    "  \"hpo_label\": \"Seizure\",\n"
    "  \"excerpt\": \"exact text from PDF\",\n"
    "  \"onset_id\": null,\n"
    "  \"severity_id\": null,\n"
    "  \"frequency_id\": null\n"
    "}\n\n"
    "No prose, no markdown, no extra keys. If none exist, return []."
)

def extract_hpo_terms_with_ollama(text: str, prompt: str = HPO_PROMPT) -> list[dict]:
    """
    Ask the local LLM (ollama) for ONLY an array of HPO term dicts and return it.

    Strategy:
    1. Try structured outputs with a JSON schema (guarantees array shape when obeyed).
    2. Fallback to `format='json'` if schema fails.
    3. Final fallback: regex scrape HP:IDs from `text` (so you don't silently get 0).

    Args:
        text: Clinical text sent to the model as the user message; also the
            string scanned for ``HP:`` ids in the final fallback.
        prompt: System prompt; defaults to ``HPO_PROMPT``.

    Returns:
        The list parsed from the model's reply when it is a JSON array.
        Otherwise one dict per distinct ``HP:NNNNNNN`` id found in ``text``
        (sorted by id), with every field other than ``hpo_id`` set to None.
        This may be an empty list.
    """
    # Local imports keep the function runnable regardless of which notebook
    # cells have been executed before it.
    import json
    import re
    import warnings

    def _ask(schema_or_mode):
        # One non-streaming chat call; returns the raw reply text.
        return chat(
            model="llama3.2:latest",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user",   "content": text}
            ],
            stream=False,
            format=schema_or_mode,
            options={"temperature": 0, "seed": 42, "--hidethinking": True}
        )["message"]["content"]

    # 1) Schema mode, then 2) plain JSON mode. Both are best-effort: any
    # failure (request error, invalid JSON, ...) is reported as a warning
    # rather than swallowed silently, and the next strategy is tried.
    for mode_label, mode in (("schema", HPO_JSON_SCHEMA), ("json", "json")):
        try:
            out = json.loads(_ask(mode))
        except Exception as err:
            warnings.warn(
                f"HPO extraction in {mode_label} mode failed: {err!r}",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        if isinstance(out, list):
            return out

    # 3) Fallback: scrape HP IDs.
    # Inside a raw string the digit class is a single-backslash `\d`; a
    # doubled backslash would match a literal backslash and never an HPO id.
    hp_ids = sorted(set(re.findall(r"HP:\d{7}", text)))
    return [
        {
            "hpo_id": hp,
            "hpo_label": None,
            "excerpt": None,
            "onset_id": None,
            "severity_id": None,
            "frequency_id": None
        }
        for hp in hp_ids
    ]

print("hello4.5.5")  # sanity-check marker: the extractor cell has run

# Smoke test: run the extractor on the first case only
patient_id, clinical_text = list_patient_ids[0], list_input_texts[0]
hpo_terms = extract_hpo_terms_with_ollama(clinical_text)

# Report how many terms came back and preview at most the first five
term_count = len(hpo_terms)
print(f"Got {term_count} HPO terms for patient {patient_id}")
preview = hpo_terms[:5]
print(json.dumps(preview, indent=2))
print("hello mini 5")
