# Model Output Evaluation Notebook

This notebook runs LLM inference to predict HPO terms, compares them to ground truth phenopackets, and produces a summary report.


## 0. Imports, Path Discovery & Sanity Checks

Load all dependencies, discover the dataset CSV automatically, and validate critical directories.


In [35]:
# Basic Setup
import sys, os, glob, json, subprocess, pickle
import pandas as pd
from typing import List, Dict
from ollama import chat
from docling.document_converter import DocumentConverter, ConversionError

# Need this at least once for some reason:
# import .autonotebook
# from .autonotebook import tqdm as notebook_tqdm

# Make the project root importable so that `notebooks.utils.*` resolves.
# NOTE(review): this assumes the kernel's working directory sits one level
# below the project root (e.g. <project_root>/notebooks) -- confirm.
project_root = os.path.abspath("..")
utils_folder   = os.path.join(project_root, "notebooks", "utils")
if not os.path.isdir(utils_folder):
    raise FileNotFoundError("Expected utils under %s" % utils_folder)
# Guard the insert so re-running this cell does not stack duplicate entries
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from notebooks.utils.phenopacket import Phenopacket
    from notebooks.utils.report import Report
    from notebooks.utils.evaluation import PhenotypeEvaluator
except ImportError as e:
    # Chain explicitly so the original traceback stays attached
    raise ImportError(f"Could not import project utils: {e}") from e


# ---- Key paths (all derived from project_root) ----

# Input PDFs: scripts/data/tmp/phenopacket_store/pmid_pdfs/
pdf_input_directory = os.path.join(
    project_root, "scripts", "data", "tmp", "phenopacket_store", "pmid_pdfs"
)
# Ground-truth notebooks: scripts/data/tmp/phenopacket_store/notebooks/
ground_truth_notebooks_directory = os.path.join(
    project_root, "scripts", "data", "tmp", "phenopacket_store", "notebooks"
)

# Dataset CSV. It is generated by:
#   python -m scripts.create_phenopacket_dataset \
#       "scripts/data/tmp/phenopacket_store/pmid_pdfs" \
#       "scripts/data/tmp/phenopacket_store/notebooks" \
#       "scripts/data/tmp/PMID_PDF_Phenopacket_list_in_phenopacket_store.csv" \
#       --recursive_ground_truth_dir True
dataset_csv_path = os.path.join(
    project_root, "scripts", "data", "tmp", "PMID_PDF_Phenopacket_list_in_phenopacket_store.csv"
)

# Every experimental artefact is written below this root
experimental_data_root = os.path.join(project_root, "experimental-data")
# Intermediate .txt + raw JSON from the LLM
llm_output_directory = os.path.join(experimental_data_root, "llm_output_dir")
# Final validated LLM phenopackets
validated_jsons_directory = os.path.join(experimental_data_root, "validated_jsons")
# Evaluation metrics report
evaluation_report_output_path = os.path.join(project_root, "reports", "first_report.json")

# Make sure every output location exists before anything tries to write to it
for _required_directory in (
    os.path.dirname(dataset_csv_path),
    llm_output_directory,
    validated_jsons_directory,
    os.path.dirname(evaluation_report_output_path),
):
    os.makedirs(_required_directory, exist_ok=True)


# If dataset CSV does not exist, run the CLI to generate it.
# Both input directories are validated first so that a missing folder produces
# a clear error here instead of a failure inside the child process.
if not os.path.isfile(dataset_csv_path):
    if not os.path.isdir(pdf_input_directory):
        raise FileNotFoundError(
            "PDF input directory not found: %s" % pdf_input_directory
        )
    if not os.path.isdir(ground_truth_notebooks_directory):
        raise FileNotFoundError(
            "Ground truth notebooks directory not found: %s" % ground_truth_notebooks_directory)

    # `python -m scripts....` resolves the `scripts` package relative to the
    # child's working directory, so run it from the project root. The
    # sys.path tweak made in this kernel is not inherited by the subprocess.
    # All path arguments are absolute, so changing cwd does not affect them.
    subprocess.run([
        sys.executable, "-m", "scripts.create_phenopacket_dataset",
        pdf_input_directory,
        ground_truth_notebooks_directory,
        dataset_csv_path,
        "--recursive_ground_truth_dir", "True"
    ], check=True, cwd=project_root)
    print(f"Created dataset CSV at {dataset_csv_path}")

# Echo the resolved locations so a misconfigured path is visible immediately
print("PDF inputs folder:       %s" % pdf_input_directory)
print("Ground truth folder:     %s" % ground_truth_notebooks_directory)
print("Dataset CSV path:        %s" % dataset_csv_path)
print("LLM outputs folder:      %s" % llm_output_directory)
print("Validated JSONs folder:  %s" % validated_jsons_directory)
print("Evaluation report path:  %s" % evaluation_report_output_path)

print("hello0")  # print hello 0 as a sanity check

PDF inputs folder:       /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/pmid_pdfs
Ground truth folder:     /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/phenopacket_store/notebooks
Dataset CSV path:        /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/scripts/data/tmp/PMID_PDF_Phenopacket_list_in_phenopacket_store.csv
LLM outputs folder:      /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/llm_output_dir
Validated JSONs folder:  /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/experimental-data/validated_jsons
Evaluation report path:  /Users/varenya/Desktop/Illini+Uni/Personalized_Genomic_Medicine-Precision_Genomics_Laboratory/PreGen/P5/r

## 1. Load Dataset

Read the CSV of PMIDs, input paths, and truth paths.


In [36]:
# Load the dataset CSV
dataframe_cases = pd.read_csv(dataset_csv_path)
print("Loaded %d rows from dataset CSV" % len(dataframe_cases))

# No deduplication: we want one row in the CSV per input-truth pair.
# Dropping duplicate PMIDs (drop_duplicates("pmid")) was considered, but it
# may be too aggressive because it would discard rows.

# Verify required columns
required_columns = {"pmid", "input", "truth"}
missing_columns = required_columns - set(dataframe_cases.columns)
if missing_columns:
    raise KeyError("Missing required columns: %s" % missing_columns)

print("hello1")  # print hello 1 as a sanity check

# Preview first few rows. This must be the LAST expression in the cell,
# otherwise Jupyter does not render it.
dataframe_cases.head()

Loaded 5135 rows from dataset CSV
hello1


## 2. Discover Phenopacket-Store Files

Locate all ground-truth Phenopacket JSON files under the `phenopacket_store/notebooks/` directory.

In [37]:
# Ground-truth phenopackets are matched at a fixed depth:
#   <ground_truth_notebooks_directory>/<subdirectory>/phenopackets/<file>.json
search_pattern = os.path.join(ground_truth_notebooks_directory, "*", "phenopackets", "*.json")
# `recursive=True` was dropped: it only affects patterns containing "**", so it
# was a no-op here. Results are sorted because glob returns them in arbitrary
# order, which would otherwise vary between runs.
truth_json_filepaths = sorted(glob.glob(search_pattern))
if not truth_json_filepaths:
    raise FileNotFoundError("No ground-truth phenopacket JSON files found with pattern: %s" % search_pattern)

print("Discovered %d ground-truth JSON files" %len(truth_json_filepaths))

print("hello2")  # print hello 2 as a sanity check

Discovered 7969 ground-truth JSON files
hello2


## 3. Prepare PDF-to-Text Converter and Helper Function

Instantiate DocumentConverter and define a helper function to load or convert the clinical PDFs for LLM input.


In [38]:
# Setup conversion for input material to LLM-compatible txt

# Initialize converter once
pdf_to_text_converter = DocumentConverter()

# Path to persistent cache
cache_path = os.path.join(experimental_data_root, "text_cache.pkl")

# Load or initialize cache (maps input file path -> extracted text).
# SECURITY: pickle.load can execute arbitrary code. Only load a cache file
# that this pipeline wrote itself; never one obtained from an untrusted source.
_text_cache: Dict[str, str] = {}
if os.path.exists(cache_path):
    try:
        with open(cache_path, "rb") as f:
            _text_cache = pickle.load(f)
        print(f"Loaded text cache with {len(_text_cache)} entries")
    except (pickle.UnpicklingError, EOFError) as e:
        # A truncated or corrupt cache (e.g. an interrupted write) should not
        # block the notebook: start empty and let the texts be re-extracted.
        _text_cache = {}
        print(f"Ignoring unreadable text cache at {cache_path}: {e}")
else:
    print("Initialized empty text cache")

def load_clinical_pdf(input_path: str) -> str:
    """
    Return the plain-text content of the file at input_path.

    Lookup order:
      1. The in-memory `_text_cache` (keyed by input_path).
      2. For paths ending in ".txt" (case-insensitive): read the file as UTF-8
         and keep only what follows the last "[text]" marker, if any.
      3. For every other path: convert with `pdf_to_text_converter` and export
         the document to text.

    Newly extracted text is stored in `_text_cache`. The cache is only kept in
    memory here; this function does not write it to disk.

    Raises:
        FileNotFoundError: if input_path is not an existing file.
        ConversionError: if the converter cannot process the file.
    """
    # Return cache if it exists in memory
    if input_path in _text_cache:
        return _text_cache[input_path]

    # Ensure the file exists before we continue
    if not os.path.isfile(input_path):
        raise FileNotFoundError("Input file not found: %s" % input_path)

    if input_path.lower().endswith(".txt"):
        # Already plain text: read it and strip any leading header
        with open(input_path, encoding="utf-8") as f:
            raw_text = f.read()
        # split(...)[-1] returns the whole text when no "[text]" marker exists
        content = raw_text.split("[text]")[-1]
    else:
        try:
            # Convert the document to text and handle conversion failures
            doc = pdf_to_text_converter.convert(input_path)
            content = doc.document.export_to_text()
        except ConversionError as e:
            # Re-raise with the file name for context, keeping the original cause
            raise ConversionError(
                f"Could not convert {os.path.basename(input_path)}: {e}"
            ) from e

    # Save new text in memory (applies to both .txt and converted inputs)
    _text_cache[input_path] = content
    return content

print("hello3")  # print hello 3 as a sanity check

hello3


## 4. Load Clinical PDF and Ground-Truth Phenopackets

Iterate over each case and load the clinical PDF text and the corresponding ground-truth Phenopacket object.

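- `list_input_texts`: text extracted from the clinical PDFs (empty string for PDFs that could not be converted)
- `list_truth_packets`: parsed Phenopacket objects from JSON files
- `list_patient_ids`: patient identifiers (`subject.id`) read from each ground-truth Phenopacket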


In [39]:
# We already read dataframe_cases in Step 1, so just going to reuse it below.
import warnings

# Build the text cache for each UNIQUE input path to avoid reconverting.
# `_text_cache` is deliberately NOT reset here: re-initialising it would throw
# away the persistent cache loaded in Section 3 and force every PDF to be
# converted again. Paths already cached are returned by `load_clinical_pdf`
# without reconversion.
# Bad PDFs such as "scripts/data/tmp/phenopacket_store/pmid_pdfs/PMID_32325141.pdf"
# make `load_clinical_pdf` raise a `ConversionError`; they are skipped so that
# the whole cell does not fail.
for pdf_path in dataframe_cases["input"].unique():
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _text_cache[pdf_path] = load_clinical_pdf(pdf_path)
        print(f"Cached text for {os.path.basename(pdf_path)}")
    except ConversionError as e:
        # Log & store empty string so downstream code still lines up
        print(f"Skipping unreadable PDF {os.path.basename(pdf_path)}: {e}")
        _text_cache[pdf_path] = ""

# Now iterate over rows; each text is a plain dictionary lookup
list_input_texts    = []
list_truth_packets  = []
list_patient_ids    = []

for case in dataframe_cases.itertuples(index=False):
    pmid_value  = case.pmid
    pdf_path    = case.input
    truth_path  = case.truth

    # Look up the cached text for this row's input
    clinical_text = _text_cache[pdf_path]
    list_input_texts.append(clinical_text)

    # Load the ground-truth Phenopacket
    truth_packet = Phenopacket.load_from_file(truth_path)
    list_truth_packets.append(truth_packet)

    # Extract per-patient IDs from raw JSON
    raw_true_packet = truth_packet.to_json()
    patient_id = raw_true_packet["subject"]["id"]
    list_patient_ids.append(patient_id)

# All three lists are row-aligned with dataframe_cases
assert len(list_input_texts) == len(list_truth_packets) == len(list_patient_ids)

# Count distinct (pmid, patient id) pairs rather than rows, so the number
# reported as "unique patients" is not just the row count.
# NOTE(review): assumes subject ids are unique within a PMID -- TODO confirm.
unique_patient_count = len(set(zip(dataframe_cases["pmid"], list_patient_ids)))
print("Loaded %d clinical texts and %d ground-truth packets for %d unique patients" % (len(list_input_texts), len(list_truth_packets), unique_patient_count))

print("hello4")  # print hello 4 as a sanity check

Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37582359.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26008899.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28584669.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36355422.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35840178.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27148574.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30095615.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34904096.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30681580.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26162006.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29696776.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33209733.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38154379.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29491316.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37196654.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28482824.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33964205.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30859559.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25678555.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30057029.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34641913.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28132691.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27376152.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_18322662.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37963460.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25681079.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20799361.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30113454.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35638551.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35420632.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28509303.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27807076.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36074901.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34746378.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37352860.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28183707.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_9199560.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30147916.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28065471.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24498630.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27026770.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31279336.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37133451.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24066033.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_32325141.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_29801479.pdf
Skipping unreadable PDF PMID_32325141.pdf: Input document PMID_32325141.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21949523.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35991565.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32431071.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26358773.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31792352.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_18513678.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_19800048.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27843126.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39289723.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35386260.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28137957.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_16684884.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_19864672.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21683322.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26437932.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23972372.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28403827.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22772368.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28051070.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32853638.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_19776401.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31527767.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34421895.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27247962.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33776626.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30847374.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22541559.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33616882.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27290639.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37875108.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37598857.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30200888.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37167966.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27148570.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29149870.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26467218.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29644084.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29889099.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34778490.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26136118.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26453364.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21060763.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_16909392.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25516138.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36595822.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31044565.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23665959.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30847200.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28488678.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24665001.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35047859.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25480986.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28946922.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36103875.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30315159.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30057544.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21236492.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29110636.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36586412.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34354969.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28782633.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20937753.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_10874631.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29058101.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_8825048.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20461149.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32528524.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26924530.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29484404.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29051910.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28575651.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33890291.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34490705.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28163941.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30576320.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12058348.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28821231.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37962958.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22824774.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29217778.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28652255.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27652283.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_34875027.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_26567009.pdf
Skipping unreadable PDF PMID_34875027.pdf: Input document PMID_34875027.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30296944.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21660509.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20151160.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20493457.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33500254.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_17273972.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_11992252.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32303603.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36006710.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20729548.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28292286.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25658047.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23335590.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38013430.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20375004.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36731504.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30050362.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38054405.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36074124.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30631761.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_17056636.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36553465.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27259050.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32737437.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22150416.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36233161.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30733661.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35904126.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37183190.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37260585.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37771582.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33807164.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31782611.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12920062.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29379197.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38272031.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32869508.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31774634.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30246735.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33087723.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36067010.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25852890.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22541558.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35321494.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28061825.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27900365.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37433783.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26706854.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26242992.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_30137364.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_29330883.pdf
Skipping unreadable PDF PMID_30137364.pdf: Input document PMID_30137364.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20371544.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29198722.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28503313.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28513613.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_1907800.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28781842.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34552798.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34133408.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25817016.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30473892.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37077559.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29175559.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25845469.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37943620.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32730804.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29127725.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32376980.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31475485.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26667307.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26942284.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27656288.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38708366.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30356099.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37459438.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30580808.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36136249.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_9916936.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30400883.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33235621.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36444245.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28069640.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36917474.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30247636.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29193763.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22822385.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20089953.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26805781.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37349293.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35933355.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24894789.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34707299.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28413018.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31632679.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36745799.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22647861.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31569402.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36659944.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35923690.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36307226.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26915616.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_8755636.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30759870.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31068150.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24998929.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27066544.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37711075.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30083032.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33949769.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29230160.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33855675.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36299998.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28966590.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28841907.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33875846.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34521999.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27957444.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22241092.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25758857.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27536553.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_9312167.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25981959.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33898683.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24321194.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24932600.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22964873.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29749493.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36528028.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30104866.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37054711.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_28202457.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_11381124.pdf
Skipping unreadable PDF PMID_28202457.pdf: Input document PMID_28202457.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34363755.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34189097.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31392109.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35566429.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38105698.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_15148656.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21827697.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21677813.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27587992.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_7726174.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38230350.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32736544.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30538526.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35190816.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39359946.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37964426.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22986007.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28285769.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12920066.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30808312.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29078790.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22353940.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37384395.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26219450.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28289718.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38579670.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28629372.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20186813.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37071997.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34215294.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39177731.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25943428.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35484142.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29742735.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23993194.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21533187.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28600779.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30612693.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26590883.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37541189.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20932317.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20106987.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27087320.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33731876.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33308271.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23647072.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24791903.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_10851256.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31282990.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38366623.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25817014.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35668506.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33688495.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38527963.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27153400.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22701786.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23838601.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25868664.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30364145.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_19896112.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34602956.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24736735.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26047050.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38572164.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29283439.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25163805.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37821226.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30517146.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29030856.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32110744.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24889630.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32309624.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28757203.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27392076.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36996813.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29474920.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30243293.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22539873.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31949313.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24637876.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23407777.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35150594.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29575628.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36736301.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26922654.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35617047.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28148688.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38609546.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37845262.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29513881.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38436102.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24800029.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29333303.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24969041.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35743164.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25105228.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36256512.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24219130.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29914387.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24019847.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22726846.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21548011.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39013458.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28927821.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27040691.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38013626.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29050284.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27824329.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29755943.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34355836.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36303223.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38982897.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38459354.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30855487.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22772371.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29351582.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38503300.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34645992.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39135939.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_11950863.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_11179005.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24355708.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36454683.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36943452.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37684057.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33372375.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39188477.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36182950.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30891318.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36777704.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_8900230.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37985816.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26981933.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26734137.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_15710732.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30968594.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39170644.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23086397.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27753167.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31969900.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36965478.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38284454.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25469541.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31548836.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37710961.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_9106527.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29037160.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35137569.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_25835445.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_37372360.pdf
Skipping unreadable PDF PMID_25835445.pdf: Input document PMID_25835445.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_37843397.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_19781681.pdf
Skipping unreadable PDF PMID_37843397.pdf: Input document PMID_37843397.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36328362.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_15673476.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27672653.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30679813.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_11748311.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_17273967.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23826568.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32816001.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36189931.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31213928.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31727177.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32765928.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29053603.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35600075.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32851297.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33122583.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20015879.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30701076.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33350388.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22190901.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33674768.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23664117.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29482508.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_9854053.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36282599.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30249733.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24126608.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20358602.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36932076.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28331220.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39143735.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36420349.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_2022752.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37660254.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26029706.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32631816.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25802881.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38433265.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35979925.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22956686.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29469822.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22508010.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12161596.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34183838.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23273567.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28966547.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24073597.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29925855.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12750403.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35880319.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30072743.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33042910.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30936877.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20839288.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28686853.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23440193.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25896430.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27573763.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31000419.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26078953.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22560515.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27099744.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29606302.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28446873.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_25885527.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29297947.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35479066.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34950897.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31944623.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30208878.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_14628289.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23197950.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30046498.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34805998.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29482518.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24069336.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30847826.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30881852.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38991538.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36779064.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28569194.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32363625.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29186133.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37880421.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31251474.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24220024.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26739615.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30622101.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29379883.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36273129.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28132690.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23326516.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26770814.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27040692.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31021519.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31396399.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_36395340.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_34038384.pdf
Skipping unreadable PDF PMID_36395340.pdf: Input document PMID_36395340.pdf is not valid.
Skipping unreadable PDF PMID_30850397.pdf: Input document PMID_30850397.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30012084.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36440963.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28587322.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_22939636.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38441608.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36443312.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31182893.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26942287.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36157999.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34737720.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35962790.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27132592.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23993195.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28886341.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33574344.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37761890.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30225196.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12789647.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38411716.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23063621.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24190800.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21602930.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_33884299.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30658709.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_28698159.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_12632326.pdf
Skipping unreadable PDF PMID_28698159.pdf: Input document PMID_28698159.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29321044.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32274456.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_18682808.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35344616.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37951597.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31666973.pdf


Parameter `strict_text` has been deprecated and will be ignored.
An unexpected error occurred while opening the document PMID_22034507.pdf
Traceback (most recent call last):
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 150, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/datamodel/document.py", line 186, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/docling/backend/docling_parse_v4_backend.py", line 149, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/Users/varenya/miniconda3/envs/p5/lib/python3.13/site-packages/pypdfium2/_helpers/document.py", 

Cached text for PMID_22258530.pdf
Skipping unreadable PDF PMID_22034507.pdf: Input document PMID_22034507.pdf is not valid.


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36267862.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20887961.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36229627.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37075751.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26956144.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_11118249.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38325380.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_10077612.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31664948.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35047834.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37734847.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38423010.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28132693.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30642278.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30564627.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27146836.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39101447.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30249237.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28318500.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_8675681.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38141607.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38596211.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30477625.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31889758.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37947183.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27108798.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32154675.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30622725.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38157076.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31069201.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24769197.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_16962354.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_23643385.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21567932.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35911904.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_37467750.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29230214.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30729726.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_38099988.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27330822.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26908613.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30034812.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28371217.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29707406.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35308163.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21747628.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28830446.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32404357.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26843181.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_20577567.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39507621.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36463227.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_35716097.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27977582.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32847529.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26833330.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26404457.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30568144.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31597564.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36823193.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24966961.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31590245.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_32083401.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_10487826.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29569962.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_12446365.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27275012.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28007986.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28468610.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_27861123.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24403049.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28456785.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29706644.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29290338.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_36927955.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_39056049.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_26805782.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_29196670.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_24369382.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31068971.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_34415117.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_31347273.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_28540186.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_30498080.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_21165303.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_19883511.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Cached text for PMID_18828908.pdf


InvalidPhenopacketError: Failed to validate phenopacket: Message type "org.phenopackets.schema.v2.Phenopacket" has no field named "cells" at "Phenopacket".
 Available Fields(except extensions): "['id', 'subject', 'phenotypicFeatures', 'measurements', 'biosamples', 'interpretations', 'diseases', 'medicalActions', 'files', 'metaData']"

## 4.5. Define LLM Prompts

Define two prompts: one that extracts only the HPO terms, and one that generates the full phenopacket.


In [None]:
# Persist the text cache: pickle `_text_cache` into the file at `cache_path`,
# then report how many entries were written.
with open(cache_path, mode="wb") as cache_file:
    pickle.dump(_text_cache, cache_file)
print(f"Saved text cache with {len(_text_cache)} entries to {cache_path}")

# 1) Prompt for just HPO labels
# Each list entry is one line of the prompt; the entries are joined with "\n".
# Adjacent string literals are concatenated with NO separator, which would fuse
# the instructions, the numbered steps and the JSON example into a single
# run-on line (e.g. "...features.Instructions:1. Identify ...").
hpo_prompt = "\n".join([
    "You are a clinical NLP engine specialized in biomedical ontologies. Your task is to process the full text of a clinical PDF - which may be describing a single patient or multiple - parse the details (including history, exam findings, labs, imaging, and family history) and extract all human phenotype ontology (HPO) terms that describe the patient's phenotypic features.",
    "Instructions:",
    "1. Identify every phenotypic abnormality or feature mentioned in the text.",
    "2. For each feature, map it to the correct HPO identifier (e.g. 'HP:0001250'), label (e.g. 'Seizure'), and descriptor value (e.g. 'A seizure is an intermittent abnormality of nervous system physiology characterized by a transient occurrence of signs and/or symptoms due to abnormal excessive or synchronous neuronal activity in the brain.').",
    "3. Capture relevant qualifiers when present:",
    "- Onset: map to HPO onset terms (e.g. 'HP:0011463' for 'Childhood onset').",
    "- Severity: map to HPO severity terms (e.g. 'HP:0012829' for 'Profound').",
    "- Temporal pattern: include if specified (e.g. 'HP:0031796' for 'Recurrent', map to HPO frequency terms if available).",
    "4. For each term, include the exact text excerpt where it appears.",
    "5. Output exclusively a JSON array. Each element must be an object with the following fields:",
    "```json",
    "{",
    "'hpo_id': 'HP:000____',",
    "'hpo_label': 'Term label',",
    "'excerpt': 'Exact text from the PDF',",
    "'onset_id': 'HP:0XXXXX or null',",
    "'severity_id': 'HP:0XXXXX or null',",
    "'frequency_id': 'HP:0XXXXX or null'",
    "}",
    "```",
    "Do not include any explanatory text, only the JSON array.",
])

# 2) Prompt for full phenopacket
# Each list entry is one line of the prompt; the entries are joined with "\n"
# (adjacent string literals would otherwise be concatenated with no separator).
# The "{{...}}" markers are literal placeholder text: this is a plain string,
# not an f-string, so the double braces are kept verbatim.
# Fixes relative to the previous version:
#   * the closing "}" of the metadata input block was an unterminated string
#     literal, which made the whole cell a SyntaxError;
#   * "v  ital_status" typo corrected to "vital_status";
#   * the output field is named 'metaData', the spelling the phenopacket
#     validator reports among its available top-level fields.
# NOTE(review): the other field names under "Requirements" (e.g.
# 'ageAtLastEncounter', the shape of 'sex' and 'vitalStatus') were left as
# written; confirm them against the GA4GH Phenopacket v2 schema.
full_pp_prompt = "\n".join([
    "You are a biomedical data curation assistant. Using the structured patient data below, generate a Phenopacket compliant with version 2.0 of the GA4GH Phenopacket schema. Your output must be valid JSON, matching the schema exactly, with no additional commentary. Here are the minimum expected output criteria:",
    "",
    "Inputs:",
    "patient_id: '{{patient_id}}'",
    "sex: '{{sex}}'              // 'male' or 'female'",
    "age_years: {{age_in_years}} // integer",
    "vital_status: '{{vital_status}}' // 'alive' or 'deceased'",
    "phenotypic_features: {{phenotypic_features_json}} // JSON array from the HPO extraction prompt",
    "diseases: {{diseases_json}}         // optional, array of disease objects with MONDO or OMIM IDs",
    "measurements: {{measurements_json}} // optional, array of quantitative trait measurements",
    "metadata: {",
    "'created_by': '{{your_name_or_tool}}',",
    "'created_on': '{{YYYY-MM-DD}}'",
    "}",
    "",
    "Requirements:",
    "Top-level fields:",
    "'id': patient_id",
    "'subject': object with:",
    "'id': patient_id",
    "'sex': { 'id': 'PATO:0000383' or 'PATO:0000384', 'label': sex }",
    "'ageAtLastEncounter': { 'age': { 'years': age_years } }",
    "'vitalStatus': { 'value': vital_status }",
    "'phenotypicFeatures': use the phenotypic_features input; for each feature, map:",
    "```json",
    "{",
    "'type': { 'id': hpo_id, 'label': hpo_label },",
    "'negated': false,",
    "'onset': { 'term': { 'id': onset_id, 'label': (look up label) } },",
    "'severity': { 'term': { 'id': severity_id, 'label': (look up label) } },",
    "'frequency': { 'term': { 'id': frequency_id, 'label': (look up label) } }",
    "}",
    "```",
    "Include 'diseases' and 'measurements' only if provided, following the GA4GH schema.",
    "'metaData' must include:",
    "```json",
    "{",
    "'phenopacketSchemaVersion': '2.0.0',",
    "'created': '{{YYYY-MM-DD}}',",
    "'createdBy': '{{your_name_or_tool}}'",
    "}",
    "```",
    "'Do not add any extra fields. Output must be purely the JSON object.'",
])

## 5. Sanity-check one inference

Run one LLM call on the first case to verify prompting and parsing work correctly.


In [None]:
# Perform inference on the first clinical PDF
if not list_input_texts:
    raise ValueError("list_input_texts is empty; cannot run the sanity-check inference.")

first_response = chat(
    model="llama3.2:latest",
    messages=[{"role": "user", "content": hpo_prompt + "\n\n" + list_input_texts[0] + "\n\n[EOS]"}]
)
raw_prediction = first_response["message"]["content"]
print("Raw LLM output (first ~300 chars or so):")
print(raw_prediction[:300] + "...")

# Drop anything up to a closing </think> tag and remove markdown code fences
# before parsing: the prompt itself shows the expected output inside a
# ```json fence, and json.loads() rejects fenced text.
cleaned_prediction = (
    raw_prediction.split("</think>")[-1]
    .replace("```json", "")
    .replace("```", "")
    .strip()
)

# Parse and validate the first Phenopacket
# NOTE(review): hpo_prompt asks the model for a JSON *array* of HPO terms,
# while Phenopacket.from_dict presumably expects a phenopacket object;
# confirm that from_dict accepts this shape.
try:
    predicted_first_packet = Phenopacket.from_dict(json.loads(cleaned_prediction))
    print("Parsed first prediction successfully:", predicted_first_packet)
except Exception as error:
    # Chain the original exception so the underlying parse error stays visible.
    raise ValueError("Failed to parse first LLM output: %s" % error) from error

## 6. Batch Inference and Save Validated Phenopackets

Loop over all cases, run LLM inference, validate each JSON as a Phenopacket, and save it to disk under `validated_jsons_directory`.


In [None]:
# Predictions are collected in input order, one per entry of list_input_texts.
predicted_packets: List[Phenopacket] = []

# Which patient are we targeting?
for idx, clinical_text in enumerate(list_input_texts):
    # NOTE(review): .loc is label-based; this assumes dataframe_cases has a
    # default 0..n-1 index aligned with list_input_texts. Confirm.
    pmid_value = dataframe_cases.loc[idx, "pmid"]
    patient_id = list_patient_ids[idx]
    # Prompt the LLM to extract only that patient's HPO terms
    content = (hpo_prompt + f"\n\n*Extract only the HPO terms for patient* `{patient_id}` *in this clinical PDF.*\n\n" + clinical_text + "\n\n[EOS]")
    # NOTE(review): "--hidethinking" looks like a CLI flag rather than a model
    # option; confirm it has any effect when passed through `options`.
    response = chat(model="llama3.2:latest", messages=[{"role": "user", "content": content}], options={"--hidethinking": True})

    # Keep the reply as ONE string: json.loads() only accepts str/bytes, so the
    # previous .splitlines() (which yields a list) made every case fail.
    # Also drop anything up to a closing </think> tag and strip markdown fences.
    llm_content = (
        response["message"]["content"]
        .split("</think>")[-1]
        .replace("```json", "")
        .replace("```", "")
        .strip()
    )

    # Parse the JSON into a Phenopacket
    try:
        phenopacket_pred = Phenopacket.from_dict(json.loads(llm_content))
    except Exception as error:
        # Chain the original exception so the underlying parse error stays visible.
        raise RuntimeError("[Case %d, PMID %s] Invalid Phenopacket JSON: %s" % (idx, pmid_value, error)) from error

    predicted_packets.append(phenopacket_pred)

    # Write the predicted JSON to disk
    output_filename = f"{pmid_value}_{patient_id}.json"
    output_filepath = os.path.join(validated_jsons_directory, output_filename)
    with open(output_filepath, "w", encoding="utf-8") as out_f:
        # NOTE(review): if to_json() already returns a JSON string, json.dump
        # will double-encode it; confirm to_json()'s return type.
        json.dump(phenopacket_pred.to_json(), out_f, indent=2)
    print("Saved predicted phenopacket for PMID/Patient %s/%s to %s"
          % (pmid_value, patient_id, output_filepath))

# Every input must have produced exactly one prediction.
if len(predicted_packets) != len(list_input_texts):
    raise RuntimeError("Number of predictions does not match number of inputs.")

print(f"Generated {len(predicted_packets)} predicted phenopackets.")

## 7. Evaluate Predicted Phenopackets Against Ground Truth

Compare each predicted phenopacket to its ground truth using PhenotypeEvaluator, then generate a Report object with overall metrics.


In [None]:
# Monkey‐patch a convenience method onto PhenotypeEvaluator
def _evaluate_batch(
    self,
    list_truth_packets,
    list_predicted_packets,
    creator,
    experiment,
    model,
    zero_division=0.0
):
    """
    Run check_phenotypes over all truth/pred pairs, then return
    a plain‐dict report containing confusion_matrix, metrics,
    classification_report, and metadata.
    """
    # Accumulate counts
    for truth_pkt, pred_pkt in zip(list_truth_packets, list_predicted_packets):
        self.check_phenotypes(
            experimentally_extracted_phenotypes=pred_pkt.list_phenotypes(),
            ground_truth_phenotypes=truth_pkt
        )
    # Build a Report object
    rpt = self.report(
        creator=creator,
        experiment=experiment,
        model=model,
        zero_division=zero_division
    )
    # Return a dict for easy indexing
    return {
        "confusion_matrix": rpt.confusion_matrix,
        "metrics": rpt.metrics,
        "classification_report": rpt.classification_report,
        "metadata": rpt.metadata,
    }

import pprint

# Expose the helper as PhenotypeEvaluator.evaluate_batch
setattr(PhenotypeEvaluator, "evaluate_batch", _evaluate_batch)

# Evaluate all truth/prediction pairs with a fresh evaluator instance
evaluator = PhenotypeEvaluator()
batch_report = evaluator.evaluate_batch(
    list_truth_packets, predicted_packets,
    creator="Varenya", experiment="Phenopacket LLM Extraction", model="llama3.2:latest",
)

# The returned dict must carry a "metrics" entry; fail loudly otherwise
if not ("metrics" in batch_report):
    raise KeyError("Evaluator report missing 'metrics' field.")

# Dump the whole report dict in readable form
pprint.pprint(batch_report)

# Progress marker for this cell
print("hello7")

## Old Save first report

Write the JSON report to disk for later analysis.


In [None]:
# Ensure output directory exists.
# os.path.dirname() returns "" when REPORT_OUT has no directory component, and
# os.makedirs("") raises FileNotFoundError, so only create a non-empty path.
out_dir = os.path.dirname(REPORT_OUT)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): neither `report` nor `REPORT_OUT` is defined in the cells
# shown in this part of the notebook (the evaluation cell produces
# `batch_report`). Confirm both exist on a fresh kernel, and that `report`
# is JSON-serializable.
with open(REPORT_OUT, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print(f"Saved evaluation report to {REPORT_OUT}")

# Old Inference Implementation

In [None]:
# Instruction text that is prepended to each input text in the chat request.
prompt = (
    "Please create a valid Phenopacket from the following text. "
    "The phenopackets needs to be in a valid json format.  "
    "Only return the phenopacket without any additional text:"
)
# Model identifier passed to `chat(model=...)`.
model = "hf.co/MaziyarPanahi/gemma-3-12b-it-GGUF:Q4_K_M"

In [None]:
# Step 1: run the extraction prompt on the FIRST input text only.
# Taking the first element explicitly replaces the former for/break loop,
# which left `response` undefined (NameError below) when input_data was empty.
text = next(iter(input_data), None)
if text is None:
    raise ValueError("input_data is empty; nothing to run inference on.")

response = chat(
    model=model,
    messages=[{"role": "user", "content": f"{prompt} {text} [EOS]"}],
    options={"--hidethinking": True}
)

# Keep only the text after a closing </think> tag and strip markdown fences.
# This is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is only valid syntax on Python 3.12+.
draft_json = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")

# Step 2: ask the model to validate (and, if needed, repair) its own JSON.
# `response` is deliberately rebound to this second reply; the cells below
# display it.
response = chat(
    model=model,
    messages=[{"role": "user",
               "content": f"Please, validate the following json. If not, fix it. Only return the json without any additional information. Should the json be wrong, you will get shut down. Json: {draft_json} [EOS]"}],
    options={"--hidethinking": True}
)


In [None]:
from IPython.display import JSON

# Keep only the text after a closing </think> tag and strip markdown fences.
cleaned_reply = response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")

# IPython's JSON display object expects a dict or list, not an
# already-serialized JSON string, so decode the reply first. Invalid JSON
# raises json.JSONDecodeError here.
JSON(json.loads(cleaned_reply))

In [None]:
# Decode the cleaned reply before display: IPython's JSON display object
# expects a dict or list, not an already-serialized JSON string.
JSON(json.loads(response["message"]["content"].split("</think>")[-1].replace("```json", "").replace("```", "")))

In [None]:
# Show the reply as plain text: everything after the last "</think>" marker
# (or the whole reply if there is none), with markdown code fences removed.
response["message"]["content"].rpartition("</think>")[2].replace("```json", "").replace("```", "")