# Model Output Evaluation Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.


In [33]:
# 0. Imports and constants
import os, json, subprocess
import pandas as pd
import sys, os
from ollama import chat
from docling.document_converter import DocumentConverter

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# Ensure repo root is on sys.path so our utils can be imported
repo_root = os.path.abspath("..")
if not os.path.isdir(os.path.join(repo_root, "notebooks")):
    raise FileNotFoundError(f"Expected notebook utils under {repo_root}/notebooks")
sys.path.insert(0, repo_root)

try:
    from notebooks.utils.phenopacket import Phenopacket
    from notebooks.utils.evaluation import PhenotypeEvaluator
except ImportError as e:
    raise ImportError(f"Could not import project utils: {e}")

# Paths for dataset and report output
DATASET_CSV = os.path.join(repo_root, "scripts", "data", "tmp", "phenopacket_dataset.csv")
REPORT_OUT = os.path.join(repo_root, "reports", "first_report.json")

# Load dataset
df = pd.read_csv(DATASET_CSV)

# Discover phenopacket JSON files
import glob

store_root = os.path.join(repo_root, "scripts", "data", "tmp", "phenopacket_store", "notebooks")
if not os.path.isdir(store_root):
    raise FileNotFoundError(f"Phenopacket store directory not found: {store_root}")

pattern = os.path.join(store_root, "*", "phenopackets", "*.json")
packet_paths = glob.glob(pattern)
if not packet_paths:
    raise FileNotFoundError(f"No phenopacket JSON files found with pattern: {pattern}")

# Load and parse phenopackets
phenopacket_objs = []
for path in packet_paths:
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Expected file but not found: {path}")
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in file {path}: {e}")
    try:
        phenopacket_objs.append(Phenopacket.from_dict(data))
    except Exception as e:
        raise RuntimeError(f"Failed to parse Phenopacket from {path}: {e}")

print(f"Loaded dataset with {len(df)} entries and {len(phenopacket_objs)} phenopacket objects.")

print("hello0")  # print hello 0 as a sanity check

FileNotFoundError: [Errno 2] No such file or directory: '/Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_dataset.csv'

## 1. Load dataset

Read in the CSV that lists PMIDs, input file paths, and truth file paths.


In [None]:
import glob

# 1A) Try to locate phenopacket_dataset.csv
csv_cand = glob.glob("../**/phenopacket_dataset.csv", recursive=True)
if csv_cand:
    # Use the single CSV if found
    DATASET_CSV = csv_cand[0]
    print(f"Found dataset CSV at {DATASET_CSV}, loading…")
    df = pd.read_csv(DATASET_CSV)
else:
    # 1A-FALLBACK) No CSV—auto‐build DataFrame by scanning input & truth files
    print("No phenopacket_dataset.csv found; auto-discovering inputs and truths…")

    # adjust these patterns to your actual folder structure
    input_paths = glob.glob("../scripts/data/tmp/**/*.*(txt|pdf)", recursive=True)
    truth_paths = glob.glob("../scripts/data/tmp/**/*.json", recursive=True)


    # map PMID → file
    def pmid_key(path):
        return os.path.splitext(os.path.basename(path))[0]


    inputs_by_pmid = {pmid_key(p): p for p in input_paths}
    truths_by_pmid = {pmid_key(p): p for p in truth_paths}

    pmids = set(inputs_by_pmid) & set(truths_by_pmid)
    if not pmids:
        raise RuntimeError("No matching PMIDs found between inputs and truth files.")
    missing_inputs = set(truths_by_pmid) - set(inputs_by_pmid)
    missing_truths = set(inputs_by_pmid) - set(truths_by_pmid)
    if missing_inputs or missing_truths:
        raise RuntimeError(
            f"Unpaired files detected. "
            f"Missing inputs for {missing_inputs}, missing truths for {missing_truths}"
        )
    # build the DataFrame
    df = pd.DataFrame([
        {"pmid": pmid, "input": inputs_by_pmid[pmid], "truth": truths_by_pmid[pmid]}
        for pmid in sorted(pmids)
    ])

print(f"Assembled {len(df)} cases.")
# 1B) Deduplicate just in case
df = df.drop_duplicates("pmid").reset_index(drop=True)

# 1C) Sanity‐check columns
required_cols = {"pmid", "input", "truth"}
missing = required_cols - set(df.columns)
if missing:
    raise KeyError(f"Missing expected columns: {missing}")

df.head()

print("hello1")  # print hello 1 as a sanity check

## 2. Prepare document-to-text helper

Convert either `.txt` or PDF into the raw case summary text for the LLM.


In [35]:
# Initialize converter once
converter = DocumentConverter()


def load_text(path: str) -> str:
    """
    Convert .txt or PDF -> plain text for LLM input.
    Splits off any '[text]' marker if present.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Input file not found: {path}")
    if path.lower().endswith(".txt"):
        with open(path, encoding="utf-8") as f:
            return f.read().split("[text]")[-1]
    else:
        doc = converter.convert(path)
        return doc.document.export_to_text()


print("hello2")  # print hello 2 as a sanity check

hello2


## 3. Load inputs & ground truth

- `inputs`: raw clinical summaries
- `truth_packets`: parsed Phenopacket objects from JSON files


In [36]:
input_texts = []
truth_packets = []

for idx, row in df.iterrows():
    inp, truth_fp = row["input"], row["truth"]

    # 3A) Load and check input text
    try:
        txt = load_text(inp)
    except Exception as e:
        raise RuntimeError(f"[Row {idx}] Error loading input '{inp}': {e}")
    input_texts.append(txt)

    # 3B) Load and check truth phenopacket
    if not os.path.isfile(truth_fp):
        raise FileNotFoundError(f"[Row {idx}] Truth file not found: {truth_fp}")
    try:
        with open(truth_fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        truth_packets.append(Phenopacket.from_dict(data))
    except Exception as e:
        raise RuntimeError(f"[Row {idx}] Error parsing truth phenopacket '{truth_fp}': {e}")

print(f"Prepared {len(input_texts)} cases and {len(truth_packets)} ground-truth packets.")

print("hello3")  # print hello 3 as a sanity check

NameError: name 'df' is not defined

## 4. Sanity-check one inference

Run the model on the first case to ensure everything is wired up correctly.


In [None]:
# 4A) Verify we have at least one case
if not input_texts:
    raise RuntimeError("No input cases were loaded; aborting inference.")

case0_txt = input_texts[0]
prompt = (
    "Please create a valid Phenopacket from the following clinical summary. "
    "Return *only* the JSON phenopacket object."
)

# 4B) Run the model
resp = chat(
    model="llama3.2:latest",  # swap to your model of choice
    messages=[{"role": "user", "content": f"{prompt}\n\n{case0_txt}\n\n[EOS]"}]
)
raw = resp["message"]["content"]
print(raw)

# 4C) Parse and wrap
try:
    pred_packet0 = Phenopacket.from_dict(json.loads(raw))
except Exception as e:
    raise ValueError(f"Failed to parse model output as Phenopacket JSON: {e}")

pred_packet0  # inspect structure

print("hello4")  # print hello 4 as a sanity check

## 5. Batch inference

Loop over all cases, collect predicted phenopackets.


In [None]:
predicted_packets = []

for idx, txt in enumerate(input_texts):
    resp = chat(
        model="llama3.2:latest",
        messages=[{"role": "user", "content": f"{prompt}\n\n{txt}\n\n[EOS]"}],
        options={"--hidethinking": True}
    )
    content = resp["message"]["content"]
    try:
        pkt = Phenopacket.from_dict(json.loads(content))
    except Exception as e:
        raise RuntimeError(f"[Case {idx}] Invalid JSON phenopacket: {e}")
    predicted_packets.append(pkt)

if len(predicted_packets) != len(input_texts):
    raise RuntimeError("Number of predictions does not match number of inputs.")

print(f"Generated {len(predicted_packets)} predicted phenopackets.")

print("hello5")  # print hello 5 as a sanity check

## 6. Evaluate predictions

Use our `PhenotypeEvaluator` to compare predicted vs. ground truth and compute metrics.


In [None]:
evaluator = PhenotypeEvaluator()
report = evaluator.evaluate_batch(truth_packets, predicted_packets)

# Quick sanity check of report structure
if "metrics" not in report:
    raise KeyError("Evaluator report missing 'metrics' field.")

import pprint;

pprint.pprint(report)

print("hello6")  # print hello 6 as a sanity check

## 7. Save first report

Write the JSON report to disk for later analysis.


In [None]:
# Ensure output directory exists
out_dir = os.path.dirname(REPORT_OUT)
os.makedirs(out_dir, exist_ok=True)

with open(REPORT_OUT, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print(f"Saved evaluation report to {REPORT_OUT}")

print("hello7")  # print hello 7 as a sanity check

#### Inference

In [None]:
prompt = "Please create a valid Phenopacket from the following text. The phenopackets needs to be in a valid json format.  Only return the phenopacket without any additional text:"
model = "hf.co/MaziyarPanahi/gemma-3-12b-it-GGUF:Q4_K_M"

In [None]:
for text in input_data:
    response = chat(
        model=model,
        messages=[{"role": "user", "content": f"{prompt} {text} [EOS]"}],
        options={"--hidethinking": True}
    )
    break

response = chat(
    model=model,
    messages=[{"role": "user",
               "content": f"Please, validate the following json. If not, fix it. Only return the json without any additional information. Should the json be wrong, you will get shut down. Json: {response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")} [EOS]"}],
    options={"--hidethinking": True}
)


In [None]:
from IPython.display import JSON

JSON(response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", ""))

In [None]:
JSON(response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", ""))

In [None]:
response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")