# Model Output Evaluation Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.


## 0. Imports, Path Discovery & Sanity Checks

Load all dependencies, resolve the project paths, generate the dataset CSV if it does not exist yet, and validate critical directories.


In [3]:
# Basic Setup
import sys, os, glob, json, subprocess
import pprint
from typing import List

import pandas as pd
from docling.document_converter import DocumentConverter
from IPython.display import JSON
from ollama import chat

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# Make the project root importable so the `notebooks.utils` package resolves.
# ".." is resolved against the kernel's working directory, so this assumes the
# notebook is started from its own folder.
project_root = os.path.abspath("..")
utils_folder = os.path.join(project_root, "notebooks", "utils")
if not os.path.isdir(utils_folder):
    raise FileNotFoundError("Expected utils under %s" % utils_folder)
# Guard the insert so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from notebooks.utils.phenopacket import Phenopacket
    from notebooks.utils.report import Report
    from notebooks.utils.evaluation import PhenotypeEvaluator
except ImportError as e:
    # Chain the original error so the module that actually failed stays in the traceback.
    raise ImportError(f"Could not import project utils: {e}") from e

# --- Input locations (both live under scripts/data/tmp/phenopacket_store/) ---
_phenopacket_store_root = os.path.join(project_root, "scripts", "data", "tmp", "phenopacket_store")
# scripts/data/tmp/phenopacket_store/pmid_pdfs/
pdf_input_directory = os.path.join(_phenopacket_store_root, "pmid_pdfs")
# scripts/data/tmp/phenopacket_store/notebooks/
ground_truth_notebooks_directory = os.path.join(_phenopacket_store_root, "notebooks")

# --- Output locations: every experimental artefact goes under experimental-data/ ---
experimental_data_root = os.path.join(project_root, "experimental-data")
# intermediate .txt + raw JSON from the LLM
llm_output_directory = os.path.join(experimental_data_root, "llm_output_dir")
# final validated LLM phenopackets
validated_jsons_directory = os.path.join(experimental_data_root, "validated_jsons")

# Manifest of pmid -> input -> truth; it is written into llm_output_directory
dataset_csv_path = os.path.join(llm_output_directory, "phenopacket_dataset.csv")
# Evaluation metrics report
evaluation_report_output_path = os.path.join(project_root, "reports", "first_report.json")

# Create whichever output folders are still missing
for _output_folder in (
    llm_output_directory,
    validated_jsons_directory,
    os.path.dirname(evaluation_report_output_path),
):
    os.makedirs(_output_folder, exist_ok=True)


# Build the dataset manifest on first run only; an existing CSV is reused as-is.
if not os.path.isfile(dataset_csv_path):
    # Fail early with a clear message instead of letting the CLI die on a bad path.
    for _description, _required_directory in (
        ("PDF input directory", pdf_input_directory),
        ("Ground truth notebooks directory", ground_truth_notebooks_directory),
    ):
        if not os.path.isdir(_required_directory):
            raise FileNotFoundError("%s not found: %s" % (_description, _required_directory))

    subprocess.run(
        [
            sys.executable, "-m", "scripts.create_phenopacket_dataset",
            pdf_input_directory,
            ground_truth_notebooks_directory,
            dataset_csv_path,
            "--recursive_input_dir", "True",
            "--recursive_ground_truth_dir", "True",
        ],
        check=True,
        # `-m scripts....` is looked up from the child's working directory; the
        # sys.path tweak made in this kernel is not inherited by the subprocess.
        # All path arguments are built from the absolute project_root, so
        # changing the working directory does not affect them.
        cwd=project_root,
    )
    print("Created dataset CSV at %s" % dataset_csv_path)

# Echo every resolved location so a wrongly rooted checkout is obvious at a glance.
# (The former "hello0" debug marker was dropped: reaching these prints already
# shows that the cell ran to completion.)
for _path_line_format, _resolved_path in (
    ("PDF inputs folder:       %s", pdf_input_directory),
    ("Ground truth folder:     %s", ground_truth_notebooks_directory),
    ("LLM outputs folder:      %s", llm_output_directory),
    ("Dataset CSV path:        %s", dataset_csv_path),
    ("Validated JSONs folder:  %s", validated_jsons_directory),
    ("Evaluation report path:  %s", evaluation_report_output_path),
):
    print(_path_line_format % _resolved_path)

Created dataset CSV at /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/llm_output_dir/phenopacket_dataset.csv
PDF inputs folder:       /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/pmid_pdfs
Ground truth folder:     /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/notebooks
LLM outputs folder:      /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/llm_output_dir
Dataset CSV path:        /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/llm_output_dir/phenopacket_dataset.csv
Validated JSONs folder:  /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/P

## 1. Load Dataset

Read the CSV of PMIDs, input paths, and truth paths, then keep one row per PMID.


In [4]:
# Load the case manifest written by the setup cell
dataframe_cases_raw = pd.read_csv(dataset_csv_path)
print("Loaded %d rows from dataset CSV" % len(dataframe_cases_raw))

# Verify required columns *before* using them: deduplicating on a missing
# "pmid" column would otherwise fail first with a less helpful KeyError.
required_columns = {"pmid", "input", "truth"}
missing_columns = required_columns - set(dataframe_cases_raw.columns)
if missing_columns:
    raise KeyError("Missing required columns: %s" % missing_columns)

# Keep the first row seen for each PMID; the index is reset so that row
# positions and index labels coincide for the cells that follow.
dataframe_cases = dataframe_cases_raw.drop_duplicates("pmid").reset_index(drop=True)
print("After deduplication: %d unique PMID cases" % len(dataframe_cases))

# Preview first few rows (only rendered when it is the cell's last expression)
dataframe_cases.head()

Loaded 5135 rows from dataset CSV
After deduplication: 634 unique PMID cases
hello1


## 2. Discover Phenopacket-Store Files

Locate all ground-truth Phenopacket JSON files under the `phenopacket_store/notebooks/` directory.

In [5]:
# Matches <ground truth dir>/<one subfolder>/phenopackets/<file>.json.
# The pattern contains no "**" segment, so glob's recursive flag would change
# nothing and is left out.
search_pattern = os.path.join(ground_truth_notebooks_directory, "*", "phenopackets", "*.json")
# glob returns matches in arbitrary order; sort so repeated runs see the same sequence.
truth_json_filepaths = sorted(glob.glob(search_pattern))
if not truth_json_filepaths:
    raise FileNotFoundError("No ground-truth phenopacket JSON files found with pattern: %s" % search_pattern)

print("Discovered %d ground-truth JSON files" % len(truth_json_filepaths))

Discovered 7969 ground-truth JSON files
hello2


## 3. Prepare PDF-to-Text Converter and Helper Function

Instantiate DocumentConverter and define a helper function to load or convert the clinical PDFs for LLM input.


In [6]:
# Setup conversion for input material to LLM-compatible txt

# Build a single converter and reuse it for every document.
pdf_to_text_converter = DocumentConverter()

# Backward-compatible alias. This cell used to construct a second, separate
# DocumentConverter under this name, but nothing in the notebook reads it.
converter = pdf_to_text_converter

def load_clinical_summary(input_path: str) -> str:
    """
    Return the plain-text content of the clinical document at input_path.

    Files whose name ends in ".txt" (case-insensitive) are read as UTF-8 and
    only the part after the last "[text]" marker is returned. Every other
    file, not only PDFs, is handed to the module-level pdf_to_text_converter
    and exported as text.

    Raises FileNotFoundError if input_path is not an existing file.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError("Input file not found: %s" % input_path)

    if input_path.lower().endswith(".txt"):
        with open(input_path, encoding="utf-8") as text_file:
            raw_text = text_file.read()
        # Without a "[text]" marker, split() yields the whole content unchanged.
        return raw_text.split("[text]")[-1]

    # Anything that is not .txt goes through the document converter.
    conversion_result = pdf_to_text_converter.convert(input_path)
    return conversion_result.document.export_to_text()

hello3


## 4. Load Clinical Summaries and Ground-Truth Phenopackets

Iterate over each case, load the clinical summary text and the corresponding ground-truth Phenopacket object.

- `list_input_texts`: raw clinical summaries
- `list_truth_packets`: parsed Phenopacket objects from JSON files


In [7]:
list_input_texts = []
list_truth_packets = []

# Walk the three manifest columns in lockstep (no per-row Series from iterrows).
for pmid_value, input_path, truth_path in zip(
    dataframe_cases["pmid"], dataframe_cases["input"], dataframe_cases["truth"]
):
    try:
        # Load the clinical summary
        clinical_summary = load_clinical_summary(input_path)
        # Load and validate the ground-truth Phenopacket
        truth_packet = Phenopacket.load_from_file(truth_path)
    except Exception as error:
        # Name the failing case, in the same bracketed style the batch
        # inference cell uses, so a bad file is easy to find in a long run.
        raise RuntimeError(
            "[PMID %s] Could not load case (input=%s, truth=%s): %s"
            % (pmid_value, input_path, truth_path, error)
        ) from error

    # Append only once both halves loaded, so the two lists can never go out of sync.
    list_input_texts.append(clinical_summary)
    list_truth_packets.append(truth_packet)

# Later cells look PMIDs up by position, so all three lengths must agree.
assert len(list_input_texts) == len(list_truth_packets) == len(dataframe_cases)
print("Loaded %d clinical summaries and %d ground-truth packets" %
      (len(list_input_texts), len(list_truth_packets)))

Parameter `strict_text` has been deprecated and will be ignored.
Parameter `strict_text` has been deprecated and will be ignored.
Parameter `strict_text` has been deprecated and will be ignored.
Parameter `strict_text` has been deprecated and will be ignored.


KeyboardInterrupt: 

## 5. Sanity-check one inference

Run one LLM call on the first case to verify prompting and parsing work correctly.


In [None]:
# Define the LLM prompt
llm_prompt = (
    "Please create a valid Phenopacket v2.0 from the following clinical summary. "
    "Return only the JSON object without any extra text."
)

if not list_input_texts:
    raise ValueError("No clinical summaries loaded; cannot run the sanity-check inference.")

# Perform inference on the first clinical PDF
first_response = chat(
    model="llama3.2:latest",
    messages=[{"role": "user", "content": llm_prompt + "\n\n" + list_input_texts[0] + "\n\n[EOS]"}]
)
raw_prediction = first_response["message"]["content"]
print("Raw LLM output (first ~300 chars or so):")
print(raw_prediction[:300] + "...")

# The reply may carry a "</think>" preamble or a Markdown ```json fence.
# Strip both the same way the older inference cells in this notebook do,
# otherwise json.loads rejects a reply that is valid JSON underneath.
cleaned_prediction = (
    raw_prediction.split("</think>")[-1]
    .replace("```json", "")
    .replace("```", "")
    .strip()
)

# Parse and validate the first Phenopacket
try:
    predicted_first_packet = Phenopacket.from_dict(json.loads(cleaned_prediction))
    print("Parsed first prediction successfully:", predicted_first_packet)
except Exception as error:
    # Chain the cause so the JSON/validation traceback is not lost.
    raise ValueError("Failed to parse first LLM output: %s" % error) from error

## 6. Batch Inference and Save Validated Phenopackets

Loop over all cases, run LLM inference, validate each JSON as a Phenopacket, and save to disk under validated_jsons_directory.


In [None]:
predicted_packets = []

for idx, clinical_text in enumerate(list_input_texts):
    # idx is a position from enumerate(), so look the PMID up positionally
    # instead of relying on the DataFrame's index labels.
    pmid_value = dataframe_cases["pmid"].iloc[idx]
    # No `options` are passed: "--hidethinking" is a flag of the `ollama run`
    # command line, not a model option understood by the chat API. This also
    # keeps the call identical to the single-case sanity check.
    response = chat(
        model="llama3.2:latest",
        messages=[{"role": "user",
                   "content": llm_prompt + "\n\n" + clinical_text + "\n\n[EOS]"}],
    )
    llm_content = response["message"]["content"]
    # Drop a "</think>" preamble and Markdown code fences before parsing,
    # as the older inference cells in this notebook do.
    cleaned_content = (
        llm_content.split("</think>")[-1]
        .replace("```json", "")
        .replace("```", "")
        .strip()
    )
    # Parse the JSON into a Phenopacket
    try:
        phenopacket_pred = Phenopacket.from_dict(json.loads(cleaned_content))
    except Exception as error:
        # Abort rather than skip: evaluation pairs predictions with truths by position.
        raise RuntimeError(
            "[Case %d, PMID %s] Invalid Phenopacket JSON: %s" % (idx, pmid_value, error)
        ) from error

    predicted_packets.append(phenopacket_pred)

    # Write the validated JSON to disk
    output_filename = "%s.json" % pmid_value
    output_filepath = os.path.join(validated_jsons_directory, output_filename)
    with open(output_filepath, "w", encoding="utf-8") as out_f:
        # NOTE(review): if Phenopacket.to_json() returns an already-serialized
        # string, json.dump will write it as one quoted JSON string - confirm.
        json.dump(phenopacket_pred.to_json(), out_f, indent=2)
    print("Saved validated phenopacket for PMID %s to %s"
          % (pmid_value, output_filepath))

if len(predicted_packets) != len(list_input_texts):
    raise RuntimeError("Number of predictions does not match number of inputs.")

print(f"Generated {len(predicted_packets)} predicted phenopackets.")

## 7. Evaluate Predicted Phenopackets Against Ground Truth

Compare each predicted phenopacket to its ground truth using PhenotypeEvaluator, then generate a Report object with overall metrics.


In [None]:
# Monkey‐patch a convenience method onto PhenotypeEvaluator
def _evaluate_batch(
    self,
    list_truth_packets,
    list_predicted_packets,
    creator,
    experiment,
    model,
    zero_division=0.0
):
    """
    Run check_phenotypes over all truth/prediction pairs, then return a
    plain-dict report.

    Parameters:
        list_truth_packets: ground-truth packets, one per case.
        list_predicted_packets: predicted packets in the same order; each
            must provide list_phenotypes().
        creator, experiment, model, zero_division: forwarded unchanged to
            self.report().

    Returns:
        dict with the keys "confusion_matrix", "metrics",
        "classification_report" and "metadata", copied from the object
        returned by self.report().

    Raises:
        ValueError: if the two sequences differ in length. zip() would
            otherwise drop the unmatched tail without any warning.
    """
    truth_packets = list(list_truth_packets)
    predicted_packets_local = list(list_predicted_packets)
    # Check before accumulating so a mismatch cannot leave partial counts behind.
    if len(truth_packets) != len(predicted_packets_local):
        raise ValueError(
            "Cannot evaluate: %d ground-truth packets but %d predicted packets"
            % (len(truth_packets), len(predicted_packets_local))
        )

    # Accumulate counts
    for truth_pkt, pred_pkt in zip(truth_packets, predicted_packets_local):
        self.check_phenotypes(
            experimentally_extracted_phenotypes=pred_pkt.list_phenotypes(),
            # NOTE(review): the truth side is passed as the packet itself while
            # the prediction side is passed as list_phenotypes() - confirm that
            # check_phenotypes expects this asymmetry.
            ground_truth_phenotypes=truth_pkt
        )
    # Build a Report object
    rpt = self.report(
        creator=creator,
        experiment=experiment,
        model=model,
        zero_division=zero_division
    )
    # Return a dict for easy indexing
    return {
        "confusion_matrix": rpt.confusion_matrix,
        "metrics": rpt.metrics,
        "classification_report": rpt.classification_report,
        "metadata": rpt.metadata,
    }

# Attach to the class
PhenotypeEvaluator.evaluate_batch = _evaluate_batch

# Run the batch evaluation
evaluator = PhenotypeEvaluator()
batch_report = evaluator.evaluate_batch(
    list_truth_packets,
    predicted_packets,
    creator="Varenya",
    experiment="Phenopacket LLM Extraction",
    model="llama3.2:latest"
)

# Quick sanity check of the returned dict
if "metrics" not in batch_report:
    raise KeyError("Evaluator report missing 'metrics' field.")

# Pretty-print the report dict (pprint is imported in the top import cell)
pprint.pprint(batch_report)

## Old Save first report

Write the JSON report to disk for later analysis.


In [None]:
# Persist the evaluation dict built in section 7 to the report path defined in
# the setup cell. This cell previously referenced REPORT_OUT and report, which
# are not defined anywhere in this notebook, so it failed with NameError on a
# fresh kernel.
report_directory = os.path.dirname(evaluation_report_output_path)
os.makedirs(report_directory, exist_ok=True)

with open(evaluation_report_output_path, "w", encoding="utf-8") as report_file:
    # NOTE(review): the report values come from the project's Report object and
    # may not all be JSON-native (e.g. array-like matrices) - confirm. The
    # fallback unwraps anything exposing tolist() and stringifies the rest; it
    # is never called for values json can already encode.
    json.dump(
        batch_report,
        report_file,
        indent=2,
        default=lambda value: value.tolist() if hasattr(value, "tolist") else str(value),
    )

print(f"Saved evaluation report to {evaluation_report_output_path}")

## Old Inference Implementation

In [None]:
# Prompt and model tag used by the older two-pass inference cells below.
# The prompt is split across lines for readability only; the joined text is
# exactly the original string.
prompt = (
    "Please create a valid Phenopacket from the following text. "
    "The phenopackets needs to be in a valid json format.  "
    "Only return the phenopacket without any additional text:"
)
model = "hf.co/MaziyarPanahi/gemma-3-12b-it-GGUF:Q4_K_M"

In [None]:
# Older two-pass flow: generate a phenopacket for the first clinical summary,
# then ask the model to validate and repair its own JSON.
# `input_data` was not defined anywhere in this notebook; the summaries loaded
# in section 4 (list_input_texts) are used instead.
if not list_input_texts:
    raise ValueError("No clinical summaries loaded; nothing to run inference on.")
text = list_input_texts[0]

# No `options` are passed: "--hidethinking" is a flag of the `ollama run`
# command line, not a model option understood by the chat API. Any "</think>"
# preamble is stripped from the reply below instead.
response = chat(
    model=model,
    messages=[{"role": "user", "content": f"{prompt} {text} [EOS]"}],
)

# Clean the draft in its own statement: reusing double quotes inside a
# double-quoted f-string is a syntax error before Python 3.12.
draft_json = (
    response["message"]["content"]
    .split("</think>")[-1]
    .replace("```json", "")
    .replace("```", "")
)
validation_request = (
    "Please, validate the following json. If not, fix it. "
    "Only return the json without any additional information. "
    "Should the json be wrong, you will get shut down. "
    f"Json: {draft_json} [EOS]"
)

response = chat(
    model=model,
    messages=[{"role": "user", "content": validation_request}],
)


In [None]:
# Render the validated reply as an interactive JSON tree.
# IPython's JSON display expects a dict or list and warns when handed a raw
# JSON string, so the cleaned text is parsed first. (JSON is imported in the
# top import cell.)
validated_json_text = (
    response["message"]["content"]
    .split("</think>")[-1]
    .replace("```json", "")
    .replace("```", "")
)
JSON(json.loads(validated_json_text))

In [None]:
# Render the latest reply again as a JSON tree. The cleaned text is parsed
# first because IPython's JSON display expects a dict or list, not a JSON string.
latest_json_text = (
    response["message"]["content"]
    .split("</think>")[-1]
    .replace("```json", "")
    .replace("```", "")
)
JSON(json.loads(latest_json_text))

In [None]:
# Show the cleaned reply as a plain string: keep what follows the last
# "</think>" marker and drop the Markdown code fences.
reply_after_thinking = response["message"]["content"].split("</think>")[-1]
reply_after_thinking.replace("```json", "").replace("```", "")