# Open-weight model experiment

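Run the open-weight model comparison on the consensus dataset without relying on local data files. The notebook pulls the data directly from GitHub, asks for the OpenAI API key through a password widget (the key is kept only in this session's environment), serves the open-weight models through a local Ollama server, and saves the results to `model_comparison.csv` in the working directory.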


In [1]:
# Install dependencies
! pip install -q pfd_toolkit ipywidgets

In [5]:
# Get API key

import os
from ipywidgets import Password, Button, VBox, HTML

# Masked text field: the pasted key is not shown in clear text.
api_key_input = Password(placeholder="Paste your key here", description="API key:")
# Button that triggers storing the key, plus an HTML line for feedback (blank at first).
button = Button(description="Set key")
status = HTML("")

def set_api_key(_):
    """Store the pasted key in ``OPENAI_API_KEY`` for the current session.

    Used as the click handler of the "Set key" button; the single argument
    supplied by the widget framework is ignored.

    Surrounding whitespace (for example a trailing newline picked up when
    copying) is stripped before the key is stored, and an entry made only of
    whitespace is treated as "no key entered".
    """
    key = (api_key_input.value or "").strip()
    # Always blank the field so the secret does not linger in the widget state.
    api_key_input.value = ""
    if key:
        os.environ["OPENAI_API_KEY"] = key
        status.value = "<b>API key stored in this session.</b>"
    else:
        status.value = "<b>No key entered.</b>"

# Clicking the button runs the handler that stores the key.
button.on_click(set_api_key)

# Stack field, button and feedback vertically; the trailing expression renders the form.
ui = VBox(children=[api_key_input, button, status])
ui


VBox(children=(Password(description='API key:', placeholder='Paste your key here'), Button(description='Set ke…

In [6]:
# Pick up the key stored by the "Set key" button. Under "Run All" this cell executes
# before anyone can click the button, so fall back to an interactive prompt instead
# of failing with a bare KeyError.
from getpass import getpass

api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    api_key = getpass("Enter OPENAI_API_KEY: ").strip()
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is required to query the OpenAI models.")
    # Later cells read the key back from the environment, so persist it there too.
    os.environ["OPENAI_API_KEY"] = api_key

In [7]:
from pathlib import Path
import shutil
import os
import tempfile
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
import requests

from pfd_toolkit import LLM, Screener
from pfd_toolkit.config import GeneralConfig

# Current working directory at the time this cell runs (not necessarily the folder
# containing the notebook file); RESULTS_PATH is built from it.
NOTEBOOK_DIR: Path = Path.cwd()


from getpass import getpass

def ensure_secret(env_name: str) -> str:
    """Return the value of environment variable ``env_name``, prompting if needed.

    When the variable is unset or empty, the value is requested through a
    hidden ``getpass`` prompt and written back to ``os.environ`` before being
    returned.
    """
    current = os.environ.get(env_name)
    if not current:
        current = getpass(f"Enter {env_name}: ")
        os.environ[env_name] = current
    return current

# Key captured earlier in the session, exposed under a constant-style name.
OPENAI_API_KEY = api_key

# Location of the consensus workbook: raw file on the `open-model-exp` branch.
GITHUB_BASE = "https://raw.githubusercontent.com/Sam-Osian/PFD-toolkit/open-model-exp"
DATA_FILENAME = "PFD Toolkit--Consensus Comparison.xlsx"
# The file name contains spaces, so it is percent-encoded before joining.
DATA_URL = "/".join([GITHUB_BASE, "ons_replication", quote(DATA_FILENAME)])
SHEET_NAME = "Consensus annotations"

# Per-model metrics are written here.
RESULTS_PATH = NOTEBOOK_DIR.joinpath("model_comparison.csv")


## Start Ollama in Colab

Install and start the Ollama daemon locally so the open-weight models can be downloaded and served within this runtime.

In [1]:

import shutil
import subprocess
import time

import requests

# Port of the local Ollama server and the `/v1` base URL built from it.
OLLAMA_PORT = 11434
OLLAMA_BASE_URL = "http://localhost:{}/v1".format(OLLAMA_PORT)

if not shutil.which("ollama"):
    print("Installing Ollama...")
    !curl -fsSL https://ollama.com/install.sh | sh

# Start the Ollama server if it isn't already running
OLLAMA_LOG_PATH = "ollama_serve.log"  # server stdout/stderr end up here
OLLAMA_STARTUP_ATTEMPTS = 60  # one probe per second


def _ollama_is_up() -> bool:
    """Return True when the local Ollama HTTP API answers on ``/api/tags``.

    Any HTTP response counts as "up"; only transport-level failures
    (connection refused, timeout, ...) count as "down".
    """
    try:
        requests.get(f"http://localhost:{OLLAMA_PORT}/api/tags", timeout=2)
    except requests.RequestException:
        return False
    return True


if _ollama_is_up():
    print("Ollama is already running.")
else:
    print("Starting ollama serve...")
    # Keep the server output in a file so a failed start can be diagnosed.
    # The child process inherits its own copy of the descriptor, so the handle
    # can be closed here right after spawning.
    with open(OLLAMA_LOG_PATH, "ab") as ollama_log:
        ollama_process = subprocess.Popen(
            ["ollama", "serve"],
            stdout=ollama_log,
            stderr=subprocess.STDOUT,
        )
    for _ in range(OLLAMA_STARTUP_ATTEMPTS):
        if _ollama_is_up():
            print("Ollama is ready.")
            break
        # Fail fast if the server process has already exited.
        if ollama_process.poll() is not None:
            raise RuntimeError(
                f"ollama serve exited with code {ollama_process.returncode}. "
                f"Check {OLLAMA_LOG_PATH} for the server logs."
            )
        time.sleep(1)
    else:
        raise RuntimeError(
            f"Ollama failed to start. Check {OLLAMA_LOG_PATH} for the server logs."
        )


Installing Ollama...
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Starting ollama serve...
Ollama is ready.


In [2]:

# Very large request timeout used for the locally served models
# (presumably seconds, i.e. effectively "never time out" — TODO confirm the unit).
_UNBOUNDED_TIMEOUT = 10**9

# Open-weight models to pull and evaluate. Disabled candidates are kept as comments.
OLLAMA_MODELS = [
    dict(name="mistral-nemo:12b", temperature=0, timeout=_UNBOUNDED_TIMEOUT),
    # dict(name="mistral-small:22b", temperature=0, timeout=_UNBOUNDED_TIMEOUT),
    # dict(name="mistral-small:24b", temperature=0, timeout=_UNBOUNDED_TIMEOUT),
]

# Download each configured model through the Ollama CLI; `check=True` aborts the
# cell if a pull exits with a non-zero status.
for model_name in (entry["name"] for entry in OLLAMA_MODELS):
    print(f"Pulling {model_name} (this may take a few minutes)...")
    pull_command = ["ollama", "pull", model_name]
    subprocess.run(pull_command, check=True)


Pulling mistral-nemo:12b (this may take a few minutes)...


In [8]:
# Fetch the workbook from GitHub and keep a copy in the system temp directory.
data_path = Path(tempfile.gettempdir()).joinpath("pfd_toolkit_consensus.xlsx")

response = requests.get(DATA_URL, timeout=60)
response.raise_for_status()  # stop here on an HTTP error response

with open(data_path, "wb") as workbook_file:
    workbook_file.write(response.content)

print(f"Dataset downloaded to {data_path}")

import warnings

# Read the annotation sheet and map the workbook's headers onto the column names
# defined by pfd_toolkit's GeneralConfig, plus a short name for the ground truth.
df = pd.read_excel(data_path, sheet_name=SHEET_NAME)
renamed = df.rename(
    columns={
        "Ref": GeneralConfig.COL_ID,
        "Investigation section": GeneralConfig.COL_INVESTIGATION,
        "Circumstances of death section": GeneralConfig.COL_CIRCUMSTANCES,
        "Matters of concern section": GeneralConfig.COL_CONCERNS,
        "Post-consensus verdict: Is this a child suicide case? (Yes or No)": "consensus",
    }
)

# Keep only the report id, the three text sections and the consensus label.
reports = renamed[
    [
        GeneralConfig.COL_ID,
        GeneralConfig.COL_INVESTIGATION,
        GeneralConfig.COL_CIRCUMSTANCES,
        GeneralConfig.COL_CONCERNS,
        "consensus",
    ]
].copy()

# Normalise the Yes/No label to a boolean. Anything that is not "yes" becomes False,
# including blank cells (which stringify to "nan"), so surface such rows instead of
# letting them pass silently as negatives.
consensus_labels = reports["consensus"].astype(str).str.strip().str.lower()
unexpected_labels = consensus_labels[~consensus_labels.isin(["yes", "no"])]
if not unexpected_labels.empty:
    warnings.warn(
        f"{len(unexpected_labels)} consensus label(s) are neither 'yes' nor 'no' "
        f"({sorted(unexpected_labels.unique())}); they are scored as negatives."
    )
reports["consensus"] = consensus_labels == "yes"

reports.head()


Dataset downloaded to /tmp/pfd_toolkit_consensus.xlsx


Unnamed: 0,id,investigation,circumstances,concerns,consensus
0,2023-0452,On 12 August 2022 I commenced an investigation...,Madeleine Savory was 15 years old when they di...,"The availability, nationally, of Tier 4 beds i...",True
1,2023-0445,"On 15 May 2023, one of my assistant coroners, ...",Igor hanged himself in his room at the Depaul ...,"On 28 March 2023, Igor refused to get out of t...",True
2,2023-0313,On 3 August 2022 I commenced an investigation ...,Allison Aules was referred to the mental healt...,The Inquest identified multiple failings in th...,True
3,2023-0283,On 12 June 2019 I commenced an investigation i...,,A detailed review of the evidence in this case...,True
4,2023-0146,On 31 August 2022 I opened an investigation to...,On 27 August 2022 Callum Wong was found having...,1. Consideration for exceptions to patient con...,True


In [9]:

# OpenAI API models, all run at temperature 0.
OPENAI_MODEL_NAMES = ("gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano")
MODEL_SPECS = [
    {"name": model_name, "temperature": 0} for model_name in OPENAI_MODEL_NAMES
]

# Ollama-hosted models: same fields plus the shared base URL, a placeholder API key
# and a per-model timeout (20 when the entry does not define one).
MODEL_SPECS.extend(
    {
        "name": ollama_spec["name"],
        "temperature": ollama_spec["temperature"],
        "base_url": OLLAMA_BASE_URL,
        "api_key": "ollama",
        "timeout": ollama_spec.get("timeout", 20),
    }
    for ollama_spec in OLLAMA_MODELS
)

# Screening criteria sent to every model; passed as `search_query` to
# `Screener.screen_reports` in the evaluation cell.
# NOTE(review): the first paragraph says "aged 18 or younger" while the second says
# "under 18" — confirm which threshold the consensus annotations used before
# changing the wording, since any edit alters what the models are asked.
user_query = """
Identify cases where the deceased was aged 18 or younger *clearly at the time of death* **and**
the death was due to suicide. If suicide is not explicitly stated, you can use a strict balance of
probabilities threshold to determine it as such.

Age may not be explicitly stated, but could be implied through references such as
recent use of child or adolescent services (e.g. CAMHS), attending school years
(e.g. "Year 10"), or similar contextual indicators of being under 18 (again, under a
strict balance of probabilities threshold).
"""


In [10]:
results_columns = ["model", "accuracy", "sensitivity", "specificity"]


def _confusion_metrics(pred: pd.Series, truth: pd.Series) -> dict:
    """Compute accuracy, sensitivity and specificity from boolean Series.

    Args:
        pred: Boolean model predictions, aligned with ``truth``.
        truth: Boolean ground-truth labels.

    Returns:
        Dict with keys ``accuracy``, ``sensitivity`` and ``specificity``; a
        metric whose denominator is zero is reported as NaN.
    """
    tp = (pred & truth).sum()
    tn = (~pred & ~truth).sum()
    fp = (pred & ~truth).sum()
    fn = (~pred & truth).sum()

    total = tp + tn + fp + fn
    return {
        "accuracy": (tp + tn) / total if total else float("nan"),
        "sensitivity": tp / (tp + fn) if (tp + fn) else float("nan"),
        "specificity": tn / (tn + fp) if (tn + fp) else float("nan"),
    }


# Resume from an earlier run when a results file is present.
# Results are keyed by model name only: after changing `user_query` or the dataset,
# delete the CSV to force a re-evaluation.
if RESULTS_PATH.exists():
    results_df = pd.read_csv(RESULTS_PATH)
else:
    results_df = pd.DataFrame(columns=results_columns)

completed_models = set(results_df["model"].astype(str))

for spec in MODEL_SPECS:
    if spec["name"] in completed_models:
        print(f"Skipping {spec['name']} (already evaluated)")
        continue

    print(f"Testing model: {spec['name']}")

    llm_kwargs = {
        # Ollama specs carry their own placeholder key; others use the session key.
        "api_key": spec.get("api_key", os.getenv("OPENAI_API_KEY")),
        "max_workers": 8,
        "model": spec["name"],
        "seed": 12345,
        "timeout": spec.get("timeout", 20),
        # Models whose name starts with "gpt-5" are always run at temperature 1.
        "temperature": 1 if spec["name"].startswith("gpt-5") else spec["temperature"],
    }

    if "base_url" in spec:
        llm_kwargs["base_url"] = spec["base_url"]

    llm_client = LLM(**llm_kwargs)
    screener = Screener(
        llm=llm_client,
        reports=reports,
        include_investigation=True,
        include_circumstances=True,
        include_concerns=True,
    )

    # filter_df=False: every report is kept, with the verdict in `model_pred`.
    classified = screener.screen_reports(
        search_query=user_query,
        filter_df=False,
        result_col_name="model_pred",
    )

    # NOTE(review): astype(bool) maps NaN to True; if screen_reports can leave
    # `model_pred` missing for failed calls, those rows would count as positive
    # predictions — confirm against pfd_toolkit's behaviour.
    pred = classified["model_pred"].astype(bool)
    truth = classified["consensus"].astype(bool)

    new_row = pd.DataFrame(
        [{"model": spec["name"], **_confusion_metrics(pred, truth)}],
        columns=results_columns,
    )
    # Concatenating onto an empty frame triggers a pandas FutureWarning and yields
    # object-dtype columns, so the first result simply becomes the frame.
    if results_df.empty:
        results_df = new_row
    else:
        results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Persist after every model so an interrupted run can be resumed.
    results_df.to_csv(RESULTS_PATH, index=False)
    completed_models.add(spec["name"])

results_df


Skipping gpt-4.1 (already evaluated)
Skipping gpt-4.1-mini (already evaluated)
Skipping gpt-4.1-nano (already evaluated)
Testing model: mistral-nemo:12b


Sending requests to the LLM: 100%|██████████| 146/146 [04:49<00:00,  1.98s/it]


Unnamed: 0,model,accuracy,sensitivity,specificity
0,gpt-4.1,0.910959,0.897059,0.923077
1,gpt-4.1-mini,0.917808,0.911765,0.923077
2,gpt-4.1-nano,0.90411,0.941176,0.871795
3,mistral-nemo:12b,0.90411,0.838235,0.961538
