# Model Output Evaluation Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.


## 0. Imports, Path Discovery & Sanity Checks

Load all dependencies, define the data and report paths, generate the dataset CSV if it does not exist yet, and validate critical directories.


In [4]:
# 0. Basic Setup
import sys, os, glob, json, subprocess
import pandas as pd
from ollama import chat
from docling.document_converter import DocumentConverter

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# Put the repository root on sys.path so the project's helper modules resolve.
# The root is taken to be the parent of the current working directory and must
# contain a "notebooks" folder.
repo_root = os.path.abspath("..")
notebooks_dir = os.path.join(repo_root, "notebooks")
if os.path.isdir(notebooks_dir):
    sys.path.insert(0, repo_root)
else:
    raise FileNotFoundError(f"Expected notebook utils under {repo_root}/notebooks")

# Import the project helpers; surface a single readable message if that fails.
try:
    from notebooks.utils.phenopacket import Phenopacket
    from notebooks.utils.evaluation import PhenotypeEvaluator
except ImportError as import_err:
    raise ImportError(f"Could not import project utils: {import_err}")

# define all key paths
# Every dataset artefact lives in one folder below the repository root
# (repo_root is computed earlier in this cell).
PHENOPACKET_STORE = os.path.join(repo_root, "scripts", "data", "tmp", "phenopacket_store")
LLM_OUT_DIR = os.path.join(PHENOPACKET_STORE, "llm_output_dir")
GT_NOTEBKS = os.path.join(PHENOPACKET_STORE, "notebooks")
PMID_PDFS = os.path.join(PHENOPACKET_STORE, "pmid_pdfs")
CSV_PATH = os.path.join(PHENOPACKET_STORE, "phenopacket_dataset.csv")
# The dataset-loading cell and the summary print below use the name DATASET_CSV;
# bind it to the same file so neither raises NameError.
DATASET_CSV = CSV_PATH
REPORT_OUT = os.path.join(repo_root, "reports", "first_report.json")
os.makedirs(os.path.dirname(REPORT_OUT), exist_ok=True)


# If CSV doesn't exist, auto-generate it
if not os.path.isfile(CSV_PATH):
    # Both source directories are required by the generator script; fail early
    # with a specific message rather than letting the subprocess fail opaquely.
    if not os.path.isdir(LLM_OUT_DIR):
        raise FileNotFoundError(f"LLM output directory not found: {LLM_OUT_DIR}")
    if not os.path.isdir(GT_NOTEBKS):
        raise FileNotFoundError(f"Ground-truth notebooks dir not found: {GT_NOTEBKS}")
    subprocess.run(
        [
            sys.executable, "-m", "scripts.create_phenopacket_dataset",
            LLM_OUT_DIR,
            GT_NOTEBKS,
            CSV_PATH,
            "--recursive_input_dir", "True",
            "--recursive_ground_truth_dir", "True",
        ],
        check=True,
        # `python -m scripts....` resolves the package relative to the child's
        # working directory; the `scripts` folder sits under repo_root, not under
        # the notebook's directory. All path arguments are absolute, so changing
        # the working directory does not affect them.
        cwd=repo_root,
    )
    print(f"Created dataset CSV at {CSV_PATH}")


print(f"Using DATASET_CSV = {DATASET_CSV}")
print(f"Will write report to {REPORT_OUT}")

print("hello0")  # print hello 0 as a sanity check

FileNotFoundError: LLM output directory not found: /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/llm_output_dir

## 1. Load Dataset

Read the CSV of PMIDs, input paths, and truth paths into a DataFrame, dedupe, and verify columns.


In [None]:
## 1. Load datasets
# CSV_PATH is the dataset location defined in the setup cell.
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df)} rows from {CSV_PATH}")

# Check the schema BEFORE touching any column: deduplicating on "pmid" first
# would raise a bare KeyError and the descriptive message below would never show.
required = {"pmid", "input", "truth"}
missing = required - set(df.columns)
if missing:
    raise KeyError(f"Missing expected columns in dataset CSV: {missing}")

# Keep the first row for each PMID and renumber the index from 0.
df = df.drop_duplicates("pmid").reset_index(drop=True)

print("hello1")  # print hello 1 as a sanity check

# Must be the last expression in the cell for Jupyter to render the preview.
df.head()

## 1.5 Discover Phenopacket-Store Files

Locate all JSON phenopacket files in the `phenopacket_store` folder.


In [None]:
# 1.5. Find the ground truth phenopackets
# Files are matched as <store_root>/<any folder>/phenopackets/<any name>.json
store_root = os.path.join(
    repo_root, "scripts", "data", "tmp", "phenopacket_store", "notebooks"
)
if not os.path.isdir(store_root):
    raise FileNotFoundError(f"Phenopacket store directory not found: {store_root}")

pattern = os.path.join(store_root, "*", "phenopackets", "*.json")
packet_paths = glob.glob(pattern)
if len(packet_paths) == 0:
    raise FileNotFoundError(f"No phenopacket JSON files found with pattern: {pattern}")

# Parse every matched file into a Phenopacket object.
store_packets = []
for packet_path in packet_paths:
    with open(packet_path, "r", encoding="utf-8") as handle:
        packet_dict = json.load(handle)
    store_packets.append(Phenopacket.from_dict(packet_dict))

print(f"Discovered {len(store_packets)} phenopackets in store.")

print("hello1.5")  # print hello 1.5 as a sanity check

## 2. Prepare document-to-text helper

Convert either `.txt` or PDF into the raw case summary text for the LLM.


In [None]:
# Set up conversion of input material into plain text that can be sent to the LLM.

# Build one DocumentConverter for the whole notebook; load_text() below reuses
# this instance for every input that is not a .txt file.
converter = DocumentConverter()


def load_text(path: str) -> str:
    """
    Return the plain-text content of an input document for use in an LLM prompt.

    A file whose name ends in ``.txt`` (case-insensitive) is read as UTF-8 and
    only the part after the last ``[text]`` marker is returned; without a marker
    the whole file content is returned. Any other file is passed to the
    module-level ``converter`` and the converted document is exported as text.

    Raises:
        FileNotFoundError: if ``path`` does not point to an existing file.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Input file not found: {path}")

    is_plain_text = path.lower().endswith(".txt")
    if not is_plain_text:
        conversion = converter.convert(path)
        return conversion.document.export_to_text()

    with open(path, encoding="utf-8") as handle:
        contents = handle.read()
    # rpartition yields the text after the LAST marker, or the full string
    # when the marker does not occur at all.
    return contents.rpartition("[text]")[2]


print("hello2")  # progress marker: printed only if the helper definitions above ran without raising

## 3. Load Inputs & Ground Truth

Iterate the DataFrame, load each input summary and corresponding truth phenopacket.

- `input_texts`: raw clinical summaries, one string per dataset row
- `truth_packets`: parsed Phenopacket objects from JSON files


In [None]:
input_texts = []
truth_packets = []

# Walk the dataset one row at a time. Any failure aborts the cell with the row
# label in the message, so the two lists never end up with different lengths.
for idx, inp, truth_fp in zip(df.index, df["input"], df["truth"]):
    # 3A) Load and check input text
    try:
        case_text = load_text(inp)
    except Exception as load_err:
        raise RuntimeError(f"[Row {idx}] Error loading input '{inp}': {load_err}")
    else:
        input_texts.append(case_text)

    # 3B) Load and check truth phenopacket
    if not os.path.isfile(truth_fp):
        raise FileNotFoundError(f"[Row {idx}] Truth file not found: {truth_fp}")
    try:
        with open(truth_fp, "r", encoding="utf-8") as truth_file:
            truth_dict = json.load(truth_file)
        truth_packets.append(Phenopacket.from_dict(truth_dict))
    except Exception as parse_err:
        raise RuntimeError(f"[Row {idx}] Error parsing truth phenopacket '{truth_fp}': {parse_err}")

print(f"Prepared {len(input_texts)} cases and {len(truth_packets)} ground-truth packets.")

print("hello3")  # print hello 3 as a sanity check

## 4. Sanity-check one inference

Run the model on the first case to ensure everything is wired up correctly.


In [None]:
# 4A) Verify we have at least one case
if not input_texts:
    raise RuntimeError("No input cases were loaded; aborting inference.")

case0_txt = input_texts[0]
# Instruction text; the batch-inference cell reuses this same `prompt`.
prompt = (
    "Please create a valid Phenopacket v2.0 from the following clinical summary. "
    "Return *only* the JSON phenopacket object, ensuring correct HPO terms, IDs, and definitions"
)

# 4B) Run the model
resp = chat(
    model="llama3.2:latest",  # swap to your model of choice
    messages=[{"role": "user", "content": f"{prompt}\n\n{case0_txt}\n\n[EOS]"}]
)
raw = resp["message"]["content"]
print(raw)

# 4C) Parse and wrap
# Replies may carry a leading reasoning block ending in "</think>" and/or
# Markdown code fences around the JSON. Remove both before parsing, using the
# same clean-up the later inference cells in this notebook apply.
cleaned = raw.split("</think>")[-1].replace("```json", "").replace("```", "").strip()
try:
    pred_packet0 = Phenopacket.from_dict(json.loads(cleaned))
except Exception as e:
    # Chain the original exception so the underlying traceback stays visible.
    raise ValueError(f"Failed to parse model output as Phenopacket JSON: {e}") from e

print("hello4")  # print hello 4 as a sanity check

# Must be the last expression in the cell for Jupyter to render it.
pred_packet0  # inspect structure

## 5. Batch inference

Loop over all cases, collect predicted phenopackets.


In [None]:
predicted_packets = []

# One chat request per loaded case, in the same order as `input_texts`, so that
# predicted_packets[i] corresponds to input_texts[i].
for idx, txt in enumerate(input_texts):
    # The request matches the single-case sanity check: no `options` mapping.
    # "--hidethinking" is a command-line flag of `ollama run`, not a model
    # option, so it is not passed here; any reasoning block is stripped from the
    # reply text below instead.
    resp = chat(
        model="llama3.2:latest",
        messages=[{"role": "user", "content": f"{prompt}\n\n{txt}\n\n[EOS]"}],
    )
    content = resp["message"]["content"]
    # Drop a leading "...</think>" section and Markdown code fences so that only
    # the JSON payload reaches the parser.
    cleaned = content.split("</think>")[-1].replace("```json", "").replace("```", "").strip()
    try:
        pkt = Phenopacket.from_dict(json.loads(cleaned))
    except Exception as e:
        # Abort on the first unparsable reply; chain the cause for debugging.
        raise RuntimeError(f"[Case {idx}] Invalid JSON phenopacket: {e}") from e
    predicted_packets.append(pkt)

# Defensive invariant: one prediction per input.
if len(predicted_packets) != len(input_texts):
    raise RuntimeError("Number of predictions does not match number of inputs.")

print(f"Generated {len(predicted_packets)} predicted phenopackets.")

print("hello5")  # print hello 5 as a sanity check

## 6. Evaluate predictions

Use our `PhenotypeEvaluator` to compare predicted vs. ground truth and compute metrics.


In [None]:
import pprint

# Hand the ground-truth and predicted packets to the project's evaluator.
evaluator = PhenotypeEvaluator()
report = evaluator.evaluate_batch(truth_packets, predicted_packets)

# Quick sanity check of report structure: only print a report that carries
# a "metrics" entry, otherwise stop here.
if "metrics" in report:
    pprint.pprint(report)
else:
    raise KeyError("Evaluator report missing 'metrics' field.")

print("hello6")  # print hello 6 as a sanity check

## 7. Save first report

Write the JSON report to disk for later analysis.


In [None]:
# Create the report's parent folder when it is absent (no-op if it exists).
out_dir = os.path.dirname(REPORT_OUT)
os.makedirs(out_dir, exist_ok=True)

# Serialise the evaluation report as indented JSON.
with open(REPORT_OUT, mode="w", encoding="utf-8") as report_file:
    json.dump(report, report_file, indent=2)

print(f"Saved evaluation report to {REPORT_OUT}")

print("hello7")  # print hello 7 as a sanity check

#### Inference

In [None]:
# Instruction text and model tag for the experimental inference cells below.
# NOTE: this rebinds `prompt`, which the sanity-check cell also assigns and the
# batch-inference cell reads; re-running batch inference after this cell will
# therefore send this prompt instead of the earlier one.
prompt = "Please create a valid Phenopacket from the following text. The phenopackets needs to be in a valid json format.  Only return the phenopacket without any additional text:"
# Model identifier passed as `model=` to chat() in the next cell.
model = "hf.co/MaziyarPanahi/gemma-3-12b-it-GGUF:Q4_K_M"

In [None]:
# Two-step experiment on the first loaded case:
#   1) ask the model for a phenopacket,
#   2) feed the cleaned reply back and ask the model to validate or repair it.
# `input_texts` is the list of case summaries built in section 3; no variable
# called `input_data` exists in this notebook.
if not input_texts:
    raise RuntimeError("No input cases were loaded; aborting inference.")
text = input_texts[0]

# No `options` mapping is sent: "--hidethinking" is a flag of the `ollama run`
# command line, not a model option. Reasoning text is removed from the reply
# below instead.
response = chat(
    model=model,
    messages=[{"role": "user", "content": f"{prompt} {text} [EOS]"}],
)

# Keep only what follows the last "</think>" and drop Markdown code fences.
# Computing this outside the f-string also avoids nesting double quotes inside
# a double-quoted f-string, which is a SyntaxError before Python 3.12.
draft_json = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")

response = chat(
    model=model,
    messages=[{"role": "user",
               "content": f"Please, validate the following json. If not, fix it. Only return the json without any additional information. Should the json be wrong, you will get shut down. Json: {draft_json} [EOS]"}],
)


In [None]:
from IPython.display import JSON

# Strip any reasoning prefix and Markdown fences from the model reply, then
# parse it: the JSON display object is meant to receive a dict or list and
# emits a warning when handed an already-serialised JSON string.
reply_text = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")
JSON(json.loads(reply_text))

In [None]:
# Renders the current `response` as a JSON tree (same view as the cell above;
# useful after re-running the inference cell). The reply is parsed first because
# the JSON display object expects a dict or list, not a JSON string.
JSON(json.loads(response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")))

In [None]:
# Show the cleaned reply as a plain string: everything after the last
# "</think>", with Markdown code-fence markers removed.
reply_content = response["message"]["content"]
reply_content.split("</think>")[-1].replace("```json", "").replace("```", "")