In [1]:
! pip install pfd_toolkit

Collecting pfd_toolkit
  Downloading pfd_toolkit-0.4.0-py3-none-any.whl.metadata (6.4 kB)
Collecting adjusttext>=1.3.0 (from pfd_toolkit)
  Downloading adjustText-1.3.0-py3-none-any.whl.metadata (3.1 kB)
Collecting backoff>=2.2.1 (from pfd_toolkit)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting dotenv>=0.9.9 (from pfd_toolkit)
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting pymupdf>=1.26.1 (from pfd_toolkit)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pfd_toolkit-0.4.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading adjustText-1.3.0-py3-none-any.whl (13 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
from pathlib import Path
import os

import pandas as pd
from dotenv import load_dotenv

from pfd_toolkit import LLM, Screener
from pfd_toolkit.config import GeneralConfig

# Excel workbook holding the human consensus annotations; read by load_reports().
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = Path("../../ons_replication/PFD Toolkit--Consensus Comparison.xlsx")
# CSV cache of per-model metrics: run_comparisons() reads it to skip models that
# already have a row and rewrites it after each newly evaluated model.
RESULTS_PATH = Path("model_comparison.csv")
# Worksheet inside DATA_PATH that load_reports() reads.
SHEET_NAME = "Consensus annotations"

In [3]:
# Model line-up for the benchmark. Every spec carries the model "name" and a
# "temperature"; the locally served models also carry connection settings.
MODEL_SPECS = [
    # OpenAI API models
    *(
        {"name": openai_model, "temperature": 0}
        for openai_model in ("gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano")
    ),
    # Ollama-hosted models
    *(
        {
            "name": ollama_model,
            "temperature": 0,
            "base_url": "http://localhost:11434/v1",
            # Presumably a placeholder key for the local endpoint - confirm.
            "api_key": "ollama",
            # Very large value so slow local inference is not cut off
            # (unit is whatever the LLM client expects - TODO confirm).
            "timeout": 10**9,
        }
        for ollama_model in (
            "mistral-nemo:12b",
            "mistral-small:22b",
            "mistral-small:24b",
        )
    ),
]

# Screening prompt that evaluate_model() passes to Screener.screen_reports().
# NOTE(review): the first paragraph says "aged 18 or younger" while the second
# says "under 18" - confirm which cutoff the consensus annotations used.
user_query = """
Identify cases where the deceased was aged 18 or younger *clearly at the time of death* **and** 
the death was due to suicide. If suicide is not explicitly stated, you can use a strict balance of 
probabilities threshold to determine it as such. 

Age may not be explicitly stated, but could be implied through references such as 
**recent** use of child or adolescent services (e.g. CAMHS), attending school years 
(e.g. “Year 10”), or similar contextual indicators of being under 18 (again, under a 
strict balance of probabilities threshold).
"""


In [4]:
def load_reports() -> pd.DataFrame:
    """Load the consensus-annotated reports in the column layout the toolkit uses.

    Reads sheet ``SHEET_NAME`` of the workbook at ``DATA_PATH``, renames the
    spreadsheet headers to the ``GeneralConfig`` column names and keeps only
    those columns plus the ground-truth label.

    Returns:
        A DataFrame with the ID, investigation, circumstances and concerns
        columns and a boolean ``consensus`` column that is True only where the
        post-consensus verdict reads "yes" (case and surrounding whitespace
        are ignored).
    """
    sheet = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)

    # Spreadsheet header -> column name used downstream (insertion order is
    # also the column order of the returned frame).
    header_to_column = {
        "Ref": GeneralConfig.COL_ID,
        "Investigation section": GeneralConfig.COL_INVESTIGATION,
        "Circumstances of death section": GeneralConfig.COL_CIRCUMSTANCES,
        "Matters of concern section": GeneralConfig.COL_CONCERNS,
        "Post-consensus verdict: Is this a child suicide case? (Yes or No)": "consensus",
    }
    selected = sheet.rename(columns=header_to_column)[list(header_to_column.values())]

    # Anything other than "yes" - blanks included - becomes a negative label.
    verdict_text = selected["consensus"].astype(str).str.strip().str.lower()
    return selected.assign(consensus=verdict_text == "yes")

In [5]:
def evaluate_model(spec: dict[str, object], reports: pd.DataFrame) -> dict[str, float]:
    """Screen every report with one model and score it against the consensus labels.

    Args:
        spec: Model settings. ``"name"`` is required; ``"temperature"`` is
            required unless the name starts with ``"gpt-5"`` (temperature is
            then forced to 1). ``"api_key"`` (default: the ``OPENAI_API_KEY``
            environment variable), ``"timeout"`` (default: 20) and
            ``"base_url"`` are optional.
        reports: Reports to classify; must include a ``consensus`` column
            holding the ground-truth labels.

    Returns:
        A dict with the model name under ``"model"`` (a string, despite the
        annotation) and the ``"accuracy"``, ``"sensitivity"`` and
        ``"specificity"`` of its predictions. A metric whose denominator is
        zero is reported as NaN.
    """

    def ratio(numerator, denominator):
        # NaN rather than a division by zero when a class is absent.
        return numerator / denominator if denominator else float("nan")

    model_name = spec["name"]

    # gpt-5* models are always run at temperature 1 - presumably because they
    # reject other values; confirm against the provider's documentation.
    if model_name.startswith("gpt-5"):
        temperature = 1
    else:
        temperature = spec["temperature"]

    client_options = dict(
        api_key=spec.get("api_key", os.getenv("OPENAI_API_KEY")),
        max_workers=8,
        model=model_name,
        seed=12345,
        timeout=spec.get("timeout", 20),
        temperature=temperature,
    )
    # Only override the endpoint for specs that name one (the local models).
    if "base_url" in spec:
        client_options["base_url"] = spec["base_url"]

    screener = Screener(
        llm=LLM(**client_options),
        reports=reports,
        include_investigation=True,
        include_circumstances=True,
        include_concerns=True,
    )

    # filter_df=False keeps every report so negatives can be scored too.
    screened = screener.screen_reports(
        search_query=user_query,
        filter_df=False,
        result_col_name="model_pred",
    )

    # NOTE(review): astype(bool) turns a NaN into True, so a report left
    # without a prediction would count as a positive - confirm that
    # screen_reports always fills the result column.
    predicted = screened["model_pred"].astype(bool)
    actual = screened["consensus"].astype(bool)

    true_pos = (predicted & actual).sum()
    true_neg = (~predicted & ~actual).sum()
    false_pos = (predicted & ~actual).sum()
    false_neg = (~predicted & actual).sum()

    n_scored = true_pos + true_neg + false_pos + false_neg

    return {
        "model": model_name,
        "accuracy": ratio(true_pos + true_neg, n_scored),
        "sensitivity": ratio(true_pos, true_pos + false_neg),
        "specificity": ratio(true_neg, true_neg + false_pos),
    }



In [6]:
def run_comparisons():
    """Evaluate every model in ``MODEL_SPECS`` that has no cached result yet.

    Loads API credentials from ``../../api.env``, loads the annotated reports,
    and reads any previous results from ``RESULTS_PATH``. Models whose name
    already appears in the cached ``model`` column are skipped. After each
    newly evaluated model the full results table is written back to
    ``RESULTS_PATH``, so an interrupted run can resume where it stopped.

    Returns:
        A DataFrame with one row per evaluated model and the columns
        ``model``, ``accuracy``, ``sensitivity`` and ``specificity``.
    """
    load_dotenv("../../api.env")
    reports = load_reports()

    if RESULTS_PATH.exists():
        results_df = pd.read_csv(RESULTS_PATH)
    else:
        results_df = pd.DataFrame(
            columns=["model", "accuracy", "sensitivity", "specificity"]
        )

    completed_models = set(results_df["model"].astype(str))
    models_to_run = [spec for spec in MODEL_SPECS if spec["name"] not in completed_models]

    if not models_to_run:
        print("All models already tested.")
        return results_df

    # Collect rows as plain records and rebuild the frame from them instead of
    # concatenating onto a possibly empty DataFrame: concat with empty entries
    # is deprecated in recent pandas and its result dtypes are set to change.
    records = results_df.to_dict("records")

    for spec in models_to_run:
        print(f"Testing model: {spec['name']}")
        records.append(evaluate_model(spec, reports))
        results_df = pd.DataFrame(records)
        # Checkpoint after every model; each evaluation is expensive.
        results_df.to_csv(RESULTS_PATH, index=False)

    return results_df


In [7]:
# Run (or resume) the benchmark and display the per-model metrics.
# All of the logic lives in run_comparisons(), which returns early on its own
# when every model already has a cached result, so this cell needs no
# early-exit statement.
results_df = run_comparisons()
results_df

FileNotFoundError: [Errno 2] No such file or directory: '../../ons_replication/PFD Toolkit--Consensus Comparison.xlsx'