# Model Output Evaluation Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.


## 0. Imports, Path Discovery & Sanity Checks

Load all dependencies, discover the dataset CSV automatically, and validate critical directories.


In [44]:
# Basic Setup
import sys, os, glob, json, subprocess, pickle
import pandas as pd
from typing import List, Dict
from ollama import chat
from docling.document_converter import DocumentConverter, ConversionError

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# Make sure our utils folder is on PYTHONPATH.
# NOTE: project_root is derived from the kernel's working directory (".."), so
# this cell assumes the notebook is started from the `notebooks/` folder.
project_root = os.path.abspath("..")
utils_folder   = os.path.join(project_root, "notebooks", "utils")
if not os.path.isdir(utils_folder):
    raise FileNotFoundError("Expected utils under %s" % utils_folder)
# Guard the insert so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from notebooks.utils.phenopacket import Phenopacket
    from notebooks.utils.report import Report
    from notebooks.utils.evaluation import PhenotypeEvaluator
except ImportError as e:
    # Chain explicitly so the original traceback stays attached.
    raise ImportError(f"Could not import project utils: {e}") from e


# define all key paths
pdf_input_directory = os.path.join(project_root, "scripts", "data", "tmp", "phenopacket_store", "pmid_pdfs")            # scripts/data/tmp/phenopacket_store/pmid_pdfs/
ground_truth_notebooks_directory = os.path.join(project_root, "scripts","data","tmp", "phenopacket_store","notebooks")  # scripts/data/tmp/phenopacket_store/notebooks/

# CSV location, which is generated from "python -m scripts.create_phenopacket_dataset "scripts/data/tmp/phenopacket_store/pmid_pdfs" "scripts/data/tmp/phenopacket_store/notebooks" "scripts/data/tmp/PMID_PDF_Phenopacket_list_in_phenopacket_store.csv" --recursive_ground_truth_dir True"
dataset_csv_path = os.path.join(project_root, "scripts", "data", "tmp", "PMID_PDF_Phenopacket_list_in_phenopacket_store.csv")

# All experimental outputs go under here
experimental_data_root = os.path.join(project_root, "experimental-data")
llm_output_directory = os.path.join(experimental_data_root, "llm_output_dir")                                           # intermediate .txt + raw JSON from LLM
validated_jsons_directory = os.path.join(experimental_data_root, "validated_jsons")                                     # validated_jsons, the final validated LLM phenopackets
evaluation_report_output_path = os.path.join(project_root, "reports", "first_report.json")                              # the evaluation metrics report

# Create any missing output folders
os.makedirs(os.path.dirname(dataset_csv_path), exist_ok=True)
os.makedirs(llm_output_directory, exist_ok=True)
os.makedirs(validated_jsons_directory, exist_ok=True)
os.makedirs(os.path.dirname(evaluation_report_output_path), exist_ok=True)


# If dataset CSV does not exist, run the CLI to generate it
if not os.path.isfile(dataset_csv_path):
    if not os.path.isdir(pdf_input_directory):
        raise FileNotFoundError(
            "PDF input directory not found: %s" % pdf_input_directory
        )
    if not os.path.isdir(ground_truth_notebooks_directory):
        raise FileNotFoundError(
            "Ground truth notebooks directory not found: %s" % ground_truth_notebooks_directory)

    # `python -m` resolves the module against the child's working directory,
    # and the `scripts` package lives under project_root, so run the child
    # from there instead of inheriting the kernel's cwd.
    subprocess.run([
        sys.executable, "-m", "scripts.create_phenopacket_dataset",
        pdf_input_directory,
        ground_truth_notebooks_directory,
        dataset_csv_path,
        "--recursive_ground_truth_dir", "True"
    ], check=True, cwd=project_root)
    print(f"Created dataset CSV at {dataset_csv_path}")

print("PDF inputs folder:       %s" % pdf_input_directory)
print("Ground truth folder:     %s" % ground_truth_notebooks_directory)
print("Dataset CSV path:        %s" % dataset_csv_path)
print("LLM outputs folder:      %s" % llm_output_directory)
print("Validated JSONs folder:  %s" % validated_jsons_directory)
print("Evaluation report path:  %s" % evaluation_report_output_path)

print("hello0")  # print hello 0 as a sanity check

PDF inputs folder:       /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/pmid_pdfs
Ground truth folder:     /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/notebooks
Dataset CSV path:        /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/PMID_PDF_Phenopacket_list_in_phenopacket_store.csv
LLM outputs folder:      /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/llm_output_dir
Validated JSONs folder:  /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/validated_jsons
Evaluation report path:  /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/r

## 1. Load Dataset

Read the CSV of PMIDs, input paths, and truth paths, then drop duplicate PMIDs.


In [45]:
# Load the dataset CSV produced in the setup cell
dataframe_cases = pd.read_csv(dataset_csv_path)
print("Loaded %d rows from dataset CSV" % len(dataframe_cases))

# Verify required columns first: the deduplication below keys on "pmid", so a
# missing column should fail here with a message that names what is missing.
required_columns = {"pmid", "input", "truth"}
missing_columns = required_columns - set(dataframe_cases.columns)
if missing_columns:
    raise KeyError("Missing required columns: %s" % missing_columns)

# Deduplicate PMIDs, reporting start/end counts
orig_count = len(dataframe_cases)
print(f"Before deduplication: {orig_count} total cases")

# Drop duplicate PMIDs (keeps only the first row seen for each PMID)
dataframe_cases = dataframe_cases.drop_duplicates(subset="pmid", keep="first").reset_index(drop=True) # This may be too aggressive and I need to check if this is a good approach
removed = orig_count - len(dataframe_cases)
print(f"{removed} duplicates removed (now {len(dataframe_cases)} unique PMIDs)")

print("hello1")  # print hello 1 as a sanity check

# Preview first few rows; this must be the cell's last expression to render.
dataframe_cases.head()

Loaded 5135 rows from dataset CSV
Before deduplication: 5135 total cases
4501 duplicates removed (now 634 unique PMIDs)
hello1


## 2. Discover Phenopacket-Store Files

Locate all ground-truth Phenopacket JSON files under the `phenopacket_store/notebooks/` directory.

In [46]:
# Prepare phenopacket JSON cache, skip logic, and cap how many PDFs to process
import hashlib

# Set number of cases to process in this run
MAX_LOAD = 100

# Persist cache of already-parsed phenopacket cases
phenopacket_cache_path = os.path.join(experimental_data_root, "phenopacket_cache.json")

# Load the existing cache if present.
# The value type is annotated as plain `dict`: module-level annotations are
# evaluated at runtime, and `typing.Any` is not imported in this notebook, so
# `Dict[str, Any]` raised NameError as soon as a cache file actually existed.
try:
    with open(phenopacket_cache_path, "r", encoding="utf-8") as f:
        phenopacket_cache: Dict[str, dict] = json.load(f)
    print(f"Loaded phenopacket cache with {len(phenopacket_cache)} entries")
except FileNotFoundError:
    phenopacket_cache = {}
    print("Initialized empty phenopacket cache")

# Look up already-parsed PDFs by their MD5 hash
# TODO: check about SHA-256 or SHA-512
existing_hashes = {info["hash"] for info in phenopacket_cache.values()}

# Prepare to record skipped cases due to failures, as well as the reasons why
skipped_pdfs: List[Dict[str, str]] = []
loaded_count = 0

# Finding all ground-truth phenopacket JSON files
# (the pattern contains no "**", so `recursive=True` has no effect here)
search_pattern = os.path.join(ground_truth_notebooks_directory, "*", "phenopackets", "*.json")
truth_json_filepaths = glob.glob(search_pattern, recursive=True)
if not truth_json_filepaths:
    raise FileNotFoundError("No ground-truth phenopacket JSON files found with pattern: %s" % search_pattern)

print("Discovered %d ground-truth JSON files" %len(truth_json_filepaths))

print("hello2")  # print hello 2 as a sanity check

Initialized empty phenopacket cache
Discovered 7969 ground-truth JSON files
hello2


## 3. Prepare PDF-to-Text Converter and Helper Function

Instantiate DocumentConverter and define a helper function to load or convert the clinical PDFs for LLM input.


In [47]:
# Prepare the PDF -> plain-text conversion used to build LLM input

# A single converter instance is shared by every conversion in this notebook
pdf_to_text_converter = DocumentConverter()

# Location of the pickled {input path: extracted text} cache
text_cache_path = os.path.join(experimental_data_root, "text_cache.pkl")

# Start from the persisted cache when one exists, otherwise from an empty dict.
# NOTE: pickle.load can execute arbitrary code; only load cache files that this
# notebook wrote itself.
if not os.path.exists(text_cache_path):
    _text_cache: Dict[str, str] = {}
    print("Initialized empty text cache")
else:
    with open(text_cache_path, "rb") as f:
        _text_cache: Dict[str, str] = pickle.load(f)
    print(f"Loaded text cache with {len(_text_cache)} entries")

def load_clinical_pdf(input_path: str) -> str:
    """
    Return the plain text for the .txt or .pdf file at "input_path".

    The in-memory cache `_text_cache` is consulted first. Newly extracted text
    (from either a .txt or a PDF input) is stored in that cache; the cache is
    only written to disk when it is explicitly saved later in the pipeline.

    For .txt inputs, everything up to and including the last "[text]" marker
    is dropped. Any other extension is handed to the PDF converter.

    Raises:
        FileNotFoundError: if "input_path" is not an existing file.
        ConversionError: if the converter fails on the input.
    """
    # Return cached text if we already have it in memory
    if input_path in _text_cache:
        return _text_cache[input_path]

    # Ensure the file exists before we continue
    if not os.path.isfile(input_path):
        raise FileNotFoundError("Input file not found: %s" % input_path)

    if input_path.lower().endswith(".txt"):
        # Already plain text: read it (closing the handle) and strip any header
        with open(input_path, encoding="utf-8") as text_file:
            raw_text = text_file.read()
        content = raw_text.split("[text]")[-1]
    else:
        try:
            # Convert PDF to text and handle conversion failures
            doc = pdf_to_text_converter.convert(input_path)
            content = doc.document.export_to_text()
        except ConversionError as e:
            # Re-raise with the file name, keeping the original error as cause
            raise ConversionError(f"Could not convert {os.path.basename(input_path)}: {e}") from e

    # Save new text in memory; the cache is written to disk later
    _text_cache[input_path] = content
    return content

print("hello3")  # print hello 3 as a sanity check

Initialized empty text cache
hello3


## 4. Load Clinical PDFs and Ground-Truth Phenopackets

Iterate over each case, load the clinical PDF text and the corresponding ground-truth Phenopacket object.

- `list_input_texts`: plain text extracted from each clinical PDF
- `list_truth_packets`: parsed Phenopacket objects from JSON files
- `list_patient_ids`: subject IDs read from each ground-truth Phenopacket


In [None]:
# Load Clinical PDFs and Ground-Truth Phenopackets (with full skip-tracking & deduplication)
# Watch for any "bad" PDFs like "scripts/data/tmp/phenopacket_store/pmid_pdfs/PMID_32325141.pdf", which made `load_clinical_pdf` raise a `ConversionError`, causing the whole cell to fail
# NOTE: `existing_hashes`, `skipped_pdfs` and `loaded_count` come from an earlier
# cell and are mutated here, so re-running this cell alone will treat every
# previously loaded PDF as "already in cache".

import warnings
from google.protobuf.json_format import ParseDict, ParseError
from phenopackets import Phenopacket as ProtoPhenopacket

import notebooks.utils.phenopacket as _phenopacket_utils

# `InvalidPhenopacketError` is not imported anywhere in this notebook, so the
# `except` clause below would itself fail with NameError on the first invalid
# packet. Use the class from the utils module when it is defined there;
# otherwise fall back to `Exception` so the failure is still recorded as a skip.
InvalidPhenopacketError = getattr(_phenopacket_utils, "InvalidPhenopacketError", Exception)

# Now iterate over rows, should lookup only once
list_input_texts    = []
list_truth_packets  = []
list_patient_ids    = []

for case in dataframe_cases.itertuples(index=False):
    pmid_value  = case.pmid
    pdf_path    = case.input
    truth_path  = case.truth

    # Compute an MD5 of the PDF to detect reruns
    try:
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        pdf_hash = hashlib.md5(pdf_bytes).hexdigest()
    except Exception as e:
        skipped_pdfs.append({"pmid": pmid_value, "pdf": pdf_path, "reason": f"I/O error computing hash: {e}"})
        continue
    # Skip if we already processed this PDF
    if pdf_hash in existing_hashes:
        skipped_pdfs.append({"pmid": pmid_value, "pdf": pdf_path, "reason": "already in cache"})
        continue

    # Enforce MAX_LOAD (remaining rows are still hashed and recorded as skipped)
    if loaded_count >= MAX_LOAD:
        skipped_pdfs.append({"pmid": pmid_value, "pdf": pdf_path, "reason": "max load reached"})
        continue

    # Convert PDF to text
    try:
        clinical_text = load_clinical_pdf(pdf_path)
    except ConversionError as e:
        skipped_pdfs.append({"pmid": pmid_value, "pdf": pdf_path, "reason": f"conversion error: {e}"})
        continue

    # Load raw JSON and validate with ignore_unknown_fields
    try:
        with open(truth_path, "r", encoding="utf-8") as truth_file:
            raw_true_packet = json.load(truth_file)
        proto = ProtoPhenopacket()
        ParseDict(raw_true_packet, proto, ignore_unknown_fields=True)
    except (ParseError, json.JSONDecodeError, FileNotFoundError) as e:
        skipped_pdfs.append({"pmid": pmid_value, "truth": truth_path, "reason": f"schema parse error: {e}" })
        continue

    # Wrap in util Phenopacket class to catch missing phenotypicFeatures
    try:
        truth_packet = Phenopacket(raw_true_packet)
    except InvalidPhenopacketError as e:
        skipped_pdfs.append({"pmid": pmid_value, "truth": truth_path, "reason": f"phenopacket invalid: {e}"})
        continue

    # Record in memory caches and lists (keyed by this row's PMID)
    phenopacket_cache[str(pmid_value)] = {"hash": pdf_hash, "json": raw_true_packet}
    existing_hashes.add(pdf_hash)

    list_input_texts.append(clinical_text)
    list_truth_packets.append(truth_packet)
    list_patient_ids.append(truth_packet.to_json()["subject"]["id"])

    loaded_count += 1

    print(f"Loaded {loaded_count} new cases, skipped {len(skipped_pdfs)} so far")

assert len(list_input_texts) == len(list_truth_packets) == len(list_patient_ids)
print("Loaded %d clinical texts and %d ground-truth packets for %d unique patients" % (len(list_input_texts), len(list_truth_packets), len(list_patient_ids)))

print("hello4")  # print hello 4 as a sanity check

## 4.5. Define LLM Prompts

Save the PDF text cache to disk, then create one prompt for extracting just the HPO terms and another for the full phenopacket extraction.


In [None]:
# Persist the in-memory PDF text cache so later runs can skip re-conversion
with open(text_cache_path, mode="wb") as f:
    pickle.dump(_text_cache, file=f)
print(
    f"Saved text cache with {len(_text_cache)} entries to {text_cache_path}"
)

# 1) Prompt for just HPO labels
# The prompt is assembled line by line and joined with newlines. Adjacent
# string literals concatenate with no separator, which previously glued the
# numbered instructions and the ```json fence into one unbroken line.
hpo_prompt = "\n".join([
    "You are a clinical NLP engine specialized in biomedical ontologies. Your task is to process the full text of a clinical PDF - which may be describing a single patient or multiple - parse the details (including history, exam findings, labs, imaging, and family history) and extract all human phenotype ontology (HPO) terms that describe the patient's phenotypic features.",
    "",
    "Instructions:",
    "1. Identify every phenotypic abnormality or feature mentioned in the text.",
    "2. For each feature, map it to the correct HPO identifier (e.g. 'HP:0001250'), label (e.g. 'Seizure'), and descriptor value (e.g. 'A seizure is an intermittent abnormality of nervous system physiology characterized by a transient occurrence of signs and/or symptoms due to abnormal excessive or synchronous neuronal activity in the brain.').",
    "3. Capture relevant qualifiers when present:",
    "    - Onset: map to HPO onset terms (e.g. 'HP:0011463' for 'Childhood onset').",
    "    - Severity: map to HPO severity terms (e.g. 'HP:0012829' for 'Profound').",
    "    - Temporal pattern: include if specified (e.g. 'HP:0031796' for 'Recurrent', map to HPO frequency terms if available).",
    "4. For each term, include the exact text excerpt where it appears.",
    "5. Output exclusively a JSON array. Each element must be an object with the following fields:",
    "```json",
    "{",
    "    'hpo_id': 'HP:000____',",
    "    'hpo_label': 'Term label',",
    "    'excerpt': 'Exact text from the PDF',",
    "    'onset_id': 'HP:0XXXXX or null',",
    "    'severity_id': 'HP:0XXXXX or null',",
    "    'frequency_id': 'HP:0XXXXX or null'",
    "}",
    "```",
    "Do not include any explanatory text, only the JSON array.",
])

# 2) Prompt for full phenopacket
# Assembled line by line and joined with newlines so the sections, field list
# and ```json fences stay separated in the text the model receives.
# The {{...}} tokens are literal placeholders: this is a plain string (not an
# f-string and never passed through str.format), so the double braces are kept
# verbatim in the prompt.
# NOTE(review): the field layout below mirrors the original draft (e.g.
# 'phenotypicFeatures' listed under 'subject') - confirm against the GA4GH
# Phenopacket v2 schema before relying on this prompt.
full_pp_prompt = "\n".join([
    "You are a biomedical data curation assistant. Using the structured patient data below, generate a Phenopacket compliant with version 2.0 of the GA4GH Phenopacket schema. Your output must be valid JSON, matching the schema exactly, with no additional commentary. Here are the minimum expected output criteria:",
    "",
    "Inputs:",
    "patient_id: '{{patient_id}}'",
    "sex: '{{sex}}'              // 'male' or 'female'",
    "age_years: {{age_in_years}} // integer",
    "vital_status: '{{vital_status}}' // 'alive' or 'deceased'",
    "phenotypic_features: {{phenotypic_features_json}} // JSON array from the HPO extraction prompt",
    "diseases: {{diseases_json}}         // optional, array of disease objects with MONDO or OMIM IDs",
    "measurements: {{measurements_json}} // optional, array of quantitative trait measurements",
    "metadata: {",
    "    'created_by': '{{your_name_or_tool}}',",
    "    'created_on': '{{YYYY-MM-DD}}'",
    "}",
    "",
    "Requirements:",
    "Top-level fields:",
    "'id': patient_id",
    "'subject': object with:",
    "    'id': patient_id",
    "    'sex': { 'id': 'PATO:0000383' or 'PATO:0000384', 'label': sex }",
    "    'ageAtLastEncounter': { 'age': { 'years': age_years } }",
    "    'vitalStatus': { 'value': vital_status }",
    "    'phenotypicFeatures': use the phenotypic_features input; for each feature, map:",
    "```json",
    "{",
    "    'type': { 'id': hpo_id, 'label': hpo_label },",
    "    'negated': false,",
    "    'onset': { 'term': { 'id': onset_id, 'label': (look up label) } },",
    "    'severity': { 'term': { 'id': severity_id, 'label': (look up label) } },",
    "    'frequency': { 'term': { 'id': frequency_id, 'label': (look up label) } }",
    "}",
    "```",
    "Include 'diseases' and 'measurements' only if provided, following the GA4GH schema.",
    "'metadata' must include:",
    "```json",
    "{",
    "    'phenopacketSchemaVersion': '2.0.0',",
    "    'created': '{{YYYY-MM-DD}}',",
    "    'createdBy': '{{your_name_or_tool}}'",
    "}",
    "```",
    "'Do not add any extra fields. Output must be purely the JSON object.'",
])

## 5. Sanity-check one inference

Run one LLM call on the first case to verify prompting and parsing work correctly.


In [None]:
# Perform inference on the first clinical PDF
# Fail with a clear message instead of an IndexError when nothing was loaded.
if not list_input_texts:
    raise RuntimeError("No clinical texts were loaded; cannot run the sanity-check inference.")

first_response = chat(
    model="llama3.2:latest",
    messages=[{"role": "user", "content": hpo_prompt + "\n\n" + list_input_texts[0] + "\n\n[EOS]"}]
)
raw_prediction = first_response["message"]["content"]
print("Raw LLM output (first ~300 chars or so):")
print(raw_prediction[:300] + "...")

# Drop any reasoning preamble and Markdown code fences before parsing: the
# prompt itself shows a ```json fence, and json.loads rejects fenced text.
cleaned_prediction = raw_prediction.split("</think>")[-1].replace("```json", "").replace("```", "").strip()

# Parse and validate the first Phenopacket
# NOTE(review): hpo_prompt asks for a JSON *array* of HPO terms; confirm that
# Phenopacket.from_dict accepts that shape.
try:
    predicted_first_packet = Phenopacket.from_dict(json.loads(cleaned_prediction))
    print("Parsed first prediction successfully:", predicted_first_packet)
except Exception as error:
    raise ValueError("Failed to parse first LLM output: %s" % error) from error

print("hello5")  # print hello 5 as a sanity check

## 6. Batch Inference and Save Validated Phenopackets

Loop over all cases, run LLM inference, validate each JSON as a Phenopacket, and save to disk under validated_jsons_directory.


In [None]:
# Run the HPO-extraction prompt for every loaded case, parse each reply into a
# Phenopacket, and write the parsed result under validated_jsons_directory.
predicted_packets: List[Phenopacket] = []

# Which patient are we targeting?
for idx, clinical_text in enumerate(list_input_texts):
    # NOTE(review): idx is a position in list_input_texts, not a row label of
    # dataframe_cases. The two only coincide while no row was skipped when the
    # texts were loaded, so this PMID can be misaligned - TODO carry the PMID
    # alongside each loaded text.
    pmid_value = dataframe_cases.loc[idx, "pmid"]
    patient_id = list_patient_ids[idx]
    # Prompt the LLM to extract only that patient's HPO terms
    content = (hpo_prompt + f"\n\n*Extract only the HPO terms for patient* `{patient_id}` *in this clinical PDF.*\n\n" + clinical_text + "\n\n[EOS]")
    # NOTE(review): "--hidethinking" looks like a CLI flag rather than a model
    # option - confirm that it has any effect when passed through `options`.
    response = chat(model="llama3.2:latest", messages=[{"role": "user", "content": content}], options={"--hidethinking": True})
    # Keep the reply as one string (json.loads cannot take the list that
    # .splitlines() returns) and strip reasoning preamble / code fences.
    llm_content = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "").strip()
    # Parse the JSON into a Phenopacket
    try:
        phenopacket_pred = Phenopacket.from_dict(json.loads(llm_content))
    except Exception as error:
        raise RuntimeError("[Case %d, PMID %s] Invalid Phenopacket JSON: %s" % (idx, pmid_value, error)) from error

    predicted_packets.append(phenopacket_pred)

    # Write the predicted JSON to disk
    output_filename = f"{pmid_value}_{patient_id}.json"
    output_filepath = os.path.join(validated_jsons_directory, output_filename)
    with open(output_filepath, "w", encoding="utf-8") as out_f:
        json.dump(phenopacket_pred.to_json(), out_f, indent=2)
    print("Saved predicted phenopacket for PMID/Patient %s/%s to %s"
          % (pmid_value, patient_id, output_filepath))

if len(predicted_packets) != len(list_input_texts):
    raise RuntimeError("Number of predictions does not match number of inputs.")
# Maybe change to this: 'assert len(predicted_packets) == len(list_input_texts), "Mismatch predictions vs inputs"'

print(f"Generated {len(predicted_packets)} predicted phenopackets.")

print("hello6")  # print hello 6 as a sanity check

## 7. Evaluate Predicted Phenopackets Against Ground Truth

Compare each predicted phenopacket to its ground truth using PhenotypeEvaluator, then generate a Report object with overall metrics.


In [None]:
# Monkey-patch a convenience method onto PhenotypeEvaluator
def _evaluate_batch(
    self,
    list_truth_packets,
    list_predicted_packets,
    creator,
    experiment,
    model,
    zero_division=0.0
):
    """
    Run check_phenotypes over all truth/pred pairs, then return
    a plain-dict report containing confusion_matrix, metrics,
    classification_report, and metadata.

    Args:
        list_truth_packets: ground-truth items, one per case.
        list_predicted_packets: predicted packets, one per case, each
            exposing ``list_phenotypes()``; must pair up positionally with
            ``list_truth_packets``.
        creator, experiment, model, zero_division: forwarded unchanged to
            ``self.report``.

    Returns:
        dict with the keys "confusion_matrix", "metrics",
        "classification_report" and "metadata", read from the report object.

    Raises:
        ValueError: if the two inputs do not have the same number of items.
    """
    truth_items = list(list_truth_packets)
    predicted_items = list(list_predicted_packets)
    # zip() would silently drop the unmatched tail and score a truncated set
    # of pairs, so refuse mismatched inputs up front.
    if len(truth_items) != len(predicted_items):
        raise ValueError(
            "Truth/prediction count mismatch: %d truth vs %d predicted"
            % (len(truth_items), len(predicted_items))
        )

    # Accumulate counts
    # NOTE(review): predictions are passed as list_phenotypes() while the truth
    # side is passed as the packet itself - confirm this asymmetry is what
    # check_phenotypes expects.
    for truth_pkt, pred_pkt in zip(truth_items, predicted_items):
        self.check_phenotypes(
            experimentally_extracted_phenotypes=pred_pkt.list_phenotypes(),
            ground_truth_phenotypes=truth_pkt
        )
    # Build a Report object
    rpt = self.report(
        creator=creator,
        experiment=experiment,
        model=model,
        zero_division=zero_division
    )
    # Return a dict for easy indexing
    return {
        "confusion_matrix": rpt.confusion_matrix,
        "metrics": rpt.metrics,
        "classification_report": rpt.classification_report,
        "metadata": rpt.metadata,
    }

# Expose the helper as a method so it can be called on evaluator instances
setattr(PhenotypeEvaluator, "evaluate_batch", _evaluate_batch)

# Score every truth/prediction pair with a fresh evaluator
evaluator = PhenotypeEvaluator()
batch_report = evaluator.evaluate_batch(
    list_truth_packets,
    predicted_packets,
    model="llama3.2:latest",
    experiment="Phenopacket LLM Extraction",
    creator="Varenya",
)

# The returned dict must carry a "metrics" entry
if not ("metrics" in batch_report):
    raise KeyError("Evaluator report missing 'metrics' field.")

# Dump the whole report dict in a readable layout
import pprint
pprint.pprint(batch_report)

print("hello7")  # print hello 7 as a sanity check#

## Old Save first report

Write the JSON report to disk for later analysis.


In [None]:
# Write the evaluation report dict to disk.
# The original cell referenced `REPORT_OUT` and `report`, neither of which is
# defined anywhere in this notebook; use the report path from the setup cell
# and the dict produced by the batch evaluation instead.

# Ensure output directory exists
out_dir = os.path.dirname(evaluation_report_output_path)
os.makedirs(out_dir, exist_ok=True)


def _json_default(value):
    """Fallback for values json cannot encode: use .tolist() if present, else str()."""
    # Only called by json.dump for objects it cannot serialize natively.
    # NOTE(review): presumably the report may hold array-like values - verify.
    to_list = getattr(value, "tolist", None)
    return to_list() if callable(to_list) else str(value)


with open(evaluation_report_output_path, "w", encoding="utf-8") as f:
    json.dump(batch_report, f, indent=2, default=_json_default)

print(f"Saved evaluation report to {evaluation_report_output_path}")

print("hello7")  # print hello 7 as a sanity check

# Old Inference Implementation

In [None]:
# Instruction text and model tag used by the legacy single-shot inference flow
prompt = (
    "Please create a valid Phenopacket from the following text. "
    "The phenopackets needs to be in a valid json format.  "
    "Only return the phenopacket without any additional text:"
)
model = "hf.co/MaziyarPanahi/gemma-3-12b-it-GGUF:Q4_K_M"

In [None]:
# Legacy flow: generate a phenopacket for the first input text only, then ask
# the model to validate/fix its own JSON in a second call.
# `input_data` was never defined in this notebook; the loaded clinical texts
# live in `list_input_texts`.
if not list_input_texts:
    raise RuntimeError("No clinical texts were loaded; cannot run the legacy inference.")

for text in list_input_texts:
    response = chat(
        model=model,
        messages=[{"role": "user", "content": f"{prompt} {text} [EOS]"}],
        options={"--hidethinking": True}
    )
    break  # only the first text is processed

# Strip any reasoning preamble and Markdown fences from the first reply.
# Done outside the f-string: reusing double quotes inside a double-quoted
# f-string replacement field is a SyntaxError before Python 3.12.
draft_json = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")

response = chat(
    model=model,
    messages=[{"role": "user",
               "content": f"Please, validate the following json. If not, fix it. Only return the json without any additional information. Should the json be wrong, you will get shut down. Json: {draft_json} [EOS]"}],
    options={"--hidethinking": True}
)


In [None]:
from IPython.display import JSON

# IPython's JSON display expects a dict or list rather than a serialized
# string, so parse the cleaned reply first (fences / reasoning stripped).
JSON(json.loads(response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")))

In [None]:
# IPython's JSON display expects a dict or list rather than a serialized
# string, so parse the cleaned reply first (fences / reasoning stripped).
JSON(json.loads(response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")))

In [None]:
# Show the cleaned reply text: everything after the last "</think>" marker,
# with Markdown code fences removed.
(
    response["message"]["content"]
    .split("</think>")[-1]
    .replace("```json", "")
    .replace("```", "")
)