# Demonstration Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.

# Step 0) Imports, Path Discovery, & Sanity Checks

Load all dependencies, set up the project paths, pull the ground-truth data and PDFs, build the dataset CSV if it is missing, and validate critical directories.

In [None]:
# Basic Setup
import sys, os, glob, json, subprocess, pickle, datetime, hashlib, warnings, random, requests
from pathlib import Path
import pandas as pd
from typing import List, Dict, Any
from ollama import chat
from docling.document_converter import DocumentConverter, ConversionError
from pypdfium2._helpers.misc import PdfiumError
from google.protobuf.json_format import ParseDict, ParseError
from phenopackets import Phenopacket as ProtoPhenopacket
from json.decoder import JSONDecodeError

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# Put the project's `src` and `notebooks/utils` folders on sys.path so that
# project-local modules can be imported from this notebook.
# The project root is taken to be the parent of the current working directory,
# so this cell has to be run from a direct child of the project root
# (e.g. the `notebooks/` folder).
project_root        = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder          = os.path.join(project_root, "src")
utils_folder        = os.path.join(project_root, "notebooks", "utils")

for path in (src_folder, utils_folder):
    # Fail fast and name the missing folder instead of surfacing a vaguer
    # ImportError further down.
    if not os.path.isdir(path):
        raise FileNotFoundError(f"Expected folder on PYTHONPATH : {path}")
    # Only prepend when absent, so re-running the cell does not stack duplicates.
    if path not in sys.path:
        sys.path.insert(0, path)

print("PYTHONPATH patched with:", src_folder, utils_folder)

print("Project Start:       %s" % project_root)
print("Source Folder:       %s" % src_folder)
print("Utilities Folder:    %s" % utils_folder)

# Import the project-local helpers (expected to live in the folders that were
# added to sys.path earlier in this cell).
try:
    from phenopacket import Phenopacket, InvalidPhenopacketError
    from report import Report
    from evaluation import PhenotypeEvaluator
except ImportError as e:
    # Chain the original exception explicitly so the traceback still shows
    # which module failed to import and why.
    raise ImportError(f"Could not import project utils: {e}") from e

# ---- Input locations -------------------------------------------------------
# Everything the helper scripts read or populate lives under
# <src>/P5/scripts/data/tmp.
_scripts_tmp_dir                    = os.path.join(src_folder, "P5", "scripts", "data", "tmp")
_phenopacket_store_dir              = os.path.join(_scripts_tmp_dir, "phenopacket_store")

pdf_input_directory                 = os.path.join(_phenopacket_store_dir, "pmid_pdfs")     # scripts/data/tmp/phenopacket_store/pmid_pdfs/
ground_truth_notebooks_directory    = os.path.join(_phenopacket_store_dir, "notebooks")     # scripts/data/tmp/phenopacket_store/notebooks/
dataset_csv_path                    = os.path.join(_scripts_tmp_dir, "PMID_PDF_Phenopacket_list_in_phenopacket_store.csv")
pmid_pkl_path                       = os.path.join(_scripts_tmp_dir, "pmids.pkl")           # pickle file of PMIDs

# ---- Output locations ------------------------------------------------------
# Experimental artefacts are grouped under <project_root>/experimental-data;
# the evaluation report goes to <project_root>/reports.
experimental_data_root              = os.path.join(project_root, "experimental-data")
llm_output_directory                = os.path.join(experimental_data_root, "llm_output_dir")    # intermediate .txt + raw JSON from LLM
validated_jsons_directory           = os.path.join(experimental_data_root, "validated_jsons")   # final validated LLM phenopackets
evaluation_report_output_path       = os.path.join(project_root, "reports", "first_report.json")   # the evaluation metrics report

# Make sure every directory (or parent directory, for file paths) exists.
for _required_dir in (
    pdf_input_directory,
    ground_truth_notebooks_directory,
    os.path.dirname(dataset_csv_path),
    llm_output_directory,
    validated_jsons_directory,
    os.path.dirname(evaluation_report_output_path),
):
    os.makedirs(_required_dir, exist_ok=True)

# TODO: Figure out why the pull only works when `ground_truth_notebooks_directory`
# is deleted again after having been created. Possibly git refuses to clone
# over an existing directory.
import shutil

# Start from a clean slate: remove any existing notebooks folder before pulling.
target_dir = ground_truth_notebooks_directory
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

# Stage 1: pull the "notebooks" folder of the "phenopacket-store" GitHub repo
# into scripts/data/tmp/phenopacket_store (the parent of `target_dir`).
pull_command = [
    sys.executable, "-m", "P5.scripts.pull_git_files",
    os.path.dirname(target_dir),
    "https://github.com/monarch-initiative/phenopacket-store.git",
    "notebooks",
]
subprocess.run(pull_command, check=True)

print(f"Stage 1 Complete, Produced {ground_truth_notebooks_directory}")

# Stage 2: scan the just-pulled notebooks for PMID_##### files and write the
# result to the PMIDs pickle file.
scan_command = [
    sys.executable, "-m", "P5.scripts.create_pmid_pkl",
    target_dir,
    pmid_pkl_path,
    "--recursive_dir_search",
]
subprocess.run(scan_command, check=True)

print("Stage 2 Complete")

# Stage 3: download PDFs for those PMIDs into the PDF input directory.
# NOTE(review): "10" is passed as the last argument; presumably a cap on the
# number of downloads, with "0" meaning unlimited — confirm against the
# pmid_downloader CLI.
download_command = [
    sys.executable, "-m", "P5.scripts.pmid_downloader",
    pmid_pkl_path,
    pdf_input_directory,
    "10",
]
subprocess.run(download_command, check=True)

print("Stage 3 Complete")

# Stage 4: build the CSV mapping PDFs to the ground-truth JSONs.
#
# Sanity-check the input directories first, on every run. The builder script
# consumes both of them, so a missing folder should be reported before the
# subprocess is launched, and also when an existing CSV is reused.
if not os.path.isdir(pdf_input_directory):
    raise FileNotFoundError("PDF input directory not found: %s" % pdf_input_directory)
if not os.path.isdir(ground_truth_notebooks_directory):
    raise FileNotFoundError("Ground truth notebooks directory not found: %s" % ground_truth_notebooks_directory)

# The CSV is only (re)built when it does not exist yet; delete the file to
# force a rebuild.
if not os.path.isfile(dataset_csv_path):
    subprocess.run([
        sys.executable, "-m", "P5.scripts.create_phenopacket_dataset",
        pdf_input_directory,
        ground_truth_notebooks_directory,
        dataset_csv_path,
        "--recursive_ground_truth_dir", "True"
    ], check=True)
    print(f"Created dataset CSV at {dataset_csv_path}")

    print("Stage 4 Complete")
else:
    # Make the skip visible so a stale CSV is not mistaken for a fresh one.
    print(f"Dataset CSV already exists at {dataset_csv_path}; skipping rebuild")

# Summary of every path set up by this cell.
print("Created the PDF inputs folder:                               %s" % pdf_input_directory)
print("Created the ground truth folder:                             %s" % ground_truth_notebooks_directory)
print("Created the PATH for the CSV of `phenopacket-store`:         %s" % dataset_csv_path)
print("Created the PATH for the experimentally generated files:     %s" % experimental_data_root)
print("Created the LLM outputs folder:                              %s" % llm_output_directory)
print("Created the validated JSONs folder:                          %s" % validated_jsons_directory)
print("Created the evaluation report path:                          %s" % evaluation_report_output_path)

print("hello0")  # print hello 0 as a sanity check