# Open-weight model experiment

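Run the open-weight model comparison on the consensus dataset without relying on local data files. The notebook downloads the dataset directly from GitHub, asks for an OpenAI API key through a password widget (stored in the `OPENAI_API_KEY` environment variable for the current session), and saves the results to a local CSV file. The open-weight models are expected to be served by a local Ollama instance at `http://localhost:11434`.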


In [1]:
# Install dependencies
! pip install -q pfd_toolkit ipywidgets

In [2]:
# Get API key

import os
from ipywidgets import Password, Button, VBox, HTML

# Pieces of the key-entry form: a submit button, a status line and a masked text box.
button = Button(description="Set key")
status = HTML("")
api_key_input = Password(placeholder="Paste your key here", description="API key:")

def set_api_key(_):
    """Button callback: store the entered key in ``OPENAI_API_KEY`` for this session.

    The single argument is whatever ``on_click`` passes to the callback; it is unused.
    Leading/trailing whitespace (easy to pick up when pasting) is stripped, and an
    entry that is empty after stripping is reported as "no key" instead of being
    stored. The outcome is shown in the ``status`` widget.
    """
    key = (api_key_input.value or "").strip()
    # Always blank the field so neither the key nor stray whitespace stays in the widget.
    api_key_input.value = ""
    if key:
        os.environ["OPENAI_API_KEY"] = key
        status.value = "<b>API key stored in this session.</b>"
    else:
        status.value = "<b>No key entered.</b>"

# Stack the form vertically, hook the button up to the callback, and display the form
# as the cell's output.
ui = VBox(children=[api_key_input, button, status])
button.on_click(set_api_key)
ui


VBox(children=(Password(description='API key:', placeholder='Paste your key here'), Button(description='Set ke…

In [3]:
# Read the key stored by the widget above. If the button has not been clicked yet
# (e.g. under "Run All", where this cell executes right after the widget is shown),
# fall back to a hidden prompt instead of failing with a KeyError.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter OPENAI_API_KEY: ")

api_key = os.environ["OPENAI_API_KEY"]

In [6]:
from pathlib import Path
import os
import tempfile
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
import requests

from pfd_toolkit import LLM, Screener
from pfd_toolkit.config import GeneralConfig

# Working directory of the kernel (assumed to be the notebook's folder — this is the
# process cwd, not the notebook file's location). The results CSV path is built from it.
NOTEBOOK_DIR = Path.cwd()


from getpass import getpass

def ensure_secret(env_name: str) -> str:
    """Return the value of environment variable ``env_name``, prompting if needed.

    When the variable is missing or empty, the user is asked for it through a
    hidden ``getpass`` prompt and the answer is stored in ``os.environ`` before
    being returned.
    """
    current = os.environ.get(env_name)
    if current:
        return current

    os.environ[env_name] = getpass(f"Enter {env_name}: ")
    return os.environ[env_name]

# Key captured earlier in the notebook.
OPENAI_API_KEY = api_key

# Location of the consensus workbook on GitHub (file name is URL-quoted because it
# contains spaces), the sheet to read, and where the results CSV is written.
GITHUB_BASE = "https://raw.githubusercontent.com/Sam-Osian/PFD-toolkit/open-model-exp"
DATA_URL = "/".join(
    [GITHUB_BASE, "ons_replication", quote("PFD Toolkit--Consensus Comparison.xlsx")]
)
SHEET_NAME = "Consensus annotations"
RESULTS_PATH = NOTEBOOK_DIR.joinpath("model_comparison.csv")


In [7]:
# Download the workbook and keep a copy in the system temp directory.
data_path = Path(tempfile.gettempdir()).joinpath("pfd_toolkit_consensus.xlsx")
response = requests.get(DATA_URL, timeout=60)
response.raise_for_status()  # stop on HTTP errors rather than saving an error page
data_path.write_bytes(response.content)

print(f"Dataset downloaded to {data_path}")

# Spreadsheet headers -> GeneralConfig column names, plus the reference label.
column_map = {
    "Ref": GeneralConfig.COL_ID,
    "Investigation section": GeneralConfig.COL_INVESTIGATION,
    "Circumstances of death section": GeneralConfig.COL_CIRCUMSTANCES,
    "Matters of concern section": GeneralConfig.COL_CONCERNS,
    "Post-consensus verdict: Is this a child suicide case? (Yes or No)": "consensus",
}

df = pd.read_excel(data_path, sheet_name=SHEET_NAME)
renamed = df.rename(columns=column_map)

# Keep only the renamed columns, in the order they are listed above.
reports = renamed.loc[:, list(column_map.values())].copy()

# Only a case-insensitive "yes" counts as positive; any other value (including a
# blank cell) becomes False.
reports["consensus"] = reports["consensus"].astype(str).str.strip().str.lower().eq("yes")

reports.head()


Dataset downloaded to /tmp/pfd_toolkit_consensus.xlsx


Unnamed: 0,id,investigation,circumstances,concerns,consensus
0,2023-0452,On 12 August 2022 I commenced an investigation...,Madeleine Savory was 15 years old when they di...,"The availability, nationally, of Tier 4 beds i...",True
1,2023-0445,"On 15 May 2023, one of my assistant coroners, ...",Igor hanged himself in his room at the Depaul ...,"On 28 March 2023, Igor refused to get out of t...",True
2,2023-0313,On 3 August 2022 I commenced an investigation ...,Allison Aules was referred to the mental healt...,The Inquest identified multiple failings in th...,True
3,2023-0283,On 12 June 2019 I commenced an investigation i...,,A detailed review of the evidence in this case...,True
4,2023-0146,On 31 August 2022 I opened an investigation to...,On 27 August 2022 Callum Wong was found having...,1. Consideration for exceptions to patient con...,True


In [8]:
# Connection settings shared by every model served through the local Ollama endpoint.
# The API key is presumably just a placeholder for the local server, and the very large
# timeout effectively disables timing out (units as interpreted by the LLM client).
_OLLAMA_SETTINGS = {
    "base_url": "http://localhost:11434/v1",
    "api_key": "ollama",
    "timeout": 10**9,
}

# One dict per model to evaluate, in evaluation order.
MODEL_SPECS = [
    # OpenAI API models
    *(
        {"name": model_name, "temperature": 0}
        for model_name in ("gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano")
    ),
    # Ollama-hosted models
    *(
        {"name": model_name, "temperature": 0, **_OLLAMA_SETTINGS}
        for model_name in ("mistral-nemo:12b", "mistral-small:22b", "mistral-small:24b")
    ),
]

# Screening prompt; it is passed to the screener as the search query for every model.
# NOTE(review): the first paragraph says "aged 18 or younger" while the second says
# "under 18" — confirm which threshold the consensus annotations used before rewording.
# The \" escapes are redundant inside a triple-quoted string; they do not change the text.
user_query = """
Identify cases where the deceased was aged 18 or younger *clearly at the time of death* **and**
the death was due to suicide. If suicide is not explicitly stated, you can use a strict balance of
probabilities threshold to determine it as such.

Age may not be explicitly stated, but could be implied through references such as
recent use of child or adolescent services (e.g. CAMHS), attending school years
(e.g. \"Year 10\"), or similar contextual indicators of being under 18 (again, under a
strict balance of probabilities threshold).
"""


In [None]:
def _confusion_metrics(pred, truth):
    """Return accuracy, sensitivity and specificity for two aligned Series.

    Args:
        pred: Model predictions, one per report (cast to bool).
        truth: Reference labels aligned with ``pred`` (cast to bool).

    Returns:
        Dict with keys ``accuracy``, ``sensitivity`` and ``specificity``. A metric
        whose denominator is zero is reported as NaN.
    """
    # NOTE(review): astype(bool) turns a float NaN into True; this assumes the
    # prediction column never contains missing values — confirm against the screener.
    pred = pred.astype(bool)
    truth = truth.astype(bool)

    tp = int((pred & truth).sum())
    tn = int((~pred & ~truth).sum())
    fp = int((pred & ~truth).sum())
    fn = int((~pred & truth).sum())

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else float("nan")

    return {
        "accuracy": ratio(tp + tn, tp + tn + fp + fn),
        "sensitivity": ratio(tp, tp + fn),
        "specificity": ratio(tn, tn + fp),
    }


# Resume from a previous run if a results file already exists.
results_columns = ["model", "accuracy", "sensitivity", "specificity"]
if RESULTS_PATH.exists():
    results_df = pd.read_csv(RESULTS_PATH)
else:
    results_df = pd.DataFrame(columns=results_columns)

completed_models = set(results_df["model"].astype(str))

# Rows are accumulated as plain dicts and the frame is rebuilt from them, instead of
# concatenating onto a (possibly empty) DataFrame: that avoids pandas' FutureWarning
# about concatenating empty frames and keeps the metric columns numeric.
result_rows = results_df.to_dict("records")

for spec in MODEL_SPECS:
    if spec["name"] in completed_models:
        print(f"Skipping {spec['name']} (already evaluated)")
        continue

    print(f"Testing model: {spec['name']}")

    llm_kwargs = {
        # Per-model key if the spec provides one, otherwise the OpenAI key from the environment.
        "api_key": spec.get("api_key", os.getenv("OPENAI_API_KEY")),
        "max_workers": 8,
        "model": spec["name"],
        "seed": 12345,
        "timeout": spec.get("timeout", 20),
        # Names starting with "gpt-5" are always run at temperature 1, regardless of the spec.
        "temperature": 1 if spec["name"].startswith("gpt-5") else spec["temperature"],
    }

    if "base_url" in spec:
        llm_kwargs["base_url"] = spec["base_url"]

    llm_client = LLM(**llm_kwargs)
    screener = Screener(
        llm=llm_client,
        reports=reports,
        include_investigation=True,
        include_circumstances=True,
        include_concerns=True,
    )

    classified = screener.screen_reports(
        search_query=user_query,
        filter_df=False,
        result_col_name="model_pred",
    )

    metrics = _confusion_metrics(classified["model_pred"], classified["consensus"])
    result_rows.append({"model": spec["name"], **metrics})

    # Save after every model so an interrupted run can pick up where it stopped.
    results_df = pd.DataFrame(result_rows)
    results_df.to_csv(RESULTS_PATH, index=False)
    completed_models.add(spec["name"])

results_df


Testing model: gpt-4.1


Sending requests to the LLM: 100%|██████████| 146/146 [00:12<00:00, 11.39it/s]
  results_df = pd.concat(


Testing model: gpt-4.1-mini


Sending requests to the LLM: 100%|██████████| 146/146 [00:09<00:00, 15.91it/s]


Testing model: gpt-4.1-nano


Sending requests to the LLM: 100%|██████████| 146/146 [00:09<00:00, 15.52it/s]


Testing model: mistral-nemo:12b


Sending requests to the LLM:   0%|          | 0/146 [00:00<?, ?it/s]INFO:backoff:Backing off _parse_with_backoff(...) for 0.9s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.3s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.8s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.3s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.7s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.7s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.8s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 0.2s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 1.8s (openai.APIConnectionErro

KeyboardInterrupt: 

INFO:backoff:Backing off _parse_with_backoff(...) for 7.0s (openai.APIConnectionError: Connection error.)
INFO:backoff:Backing off _parse_with_backoff(...) for 5.6s (openai.APIConnectionError: Connection error.)
