In [1]:
from pathlib import Path
import pandas as pd 
import ast

base_path = Path().resolve() / 'files'

prompts_df = pd.read_excel(base_path / 'prompts_omim_pmid_updated_file_final1.xlsx', index_col='File Name')

prompts_df = prompts_df.dropna()
prompts_df['OMIM'] = prompts_df['OMIM'].astype(int)

# --- if the column is already a real list you can skip this helper ----
def ensure_list(x):
    """Turn stringified list / set into a real Python list."""
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        if not x:
            return []
        try:                                     # try "[...]"  or "{...}"
            obj = ast.literal_eval(x)
            if isinstance(obj, (list, set, tuple)):
                return list(obj)
        except Exception:
            pass
        # fall-back: split on commas
        return [y.strip() for y in x.split(",") if y.strip()]
    return []

preds = pd.read_csv('all_predictions_with_hpo_prompts.csv')
preds["patient_hpo_names"] = preds["patient_hpo_names"].apply(ensure_list)
preds = preds[preds["patient_hpo_names"].apply(len) > 2].copy()

prompts_df = prompts_df.loc[preds['file_name'].unique()]
prompts_df

Unnamed: 0_level_0,Case Description,Correct Diagnosis,OMIM,PMID
File Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PMID_10571775_KSN_II_1_en-prompt.txt,The proband was a female. Disease onset was no...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_KSN_II_1_en-prompt.txt
PMID_10571775_YAT_II_1_en-prompt.txt,The proband was a male. Disease onset was not ...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_YAT_II_1_en-prompt.txt
PMID_10580070_A_III_11_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_11_en-prompt.txt
PMID_10580070_A_III_13_en-prompt.txt,The proband was a male. Disease onset occurred...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_13_en-prompt.txt
PMID_10580070_A_III_5_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_5_en-prompt.txt
...,...,...,...,...
STX_Syrbe_3_en-prompt.txt,"The proband was a 5-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_3_en-prompt.txt
STX_Syrbe_4_en-prompt.txt,"The proband was a 6-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_4_en-prompt.txt
STX_Syrbe_5_en-prompt.txt,"The proband was a 4-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_5_en-prompt.txt
STX_Syrbe_6_en-prompt.txt,"The proband was a 2-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_6_en-prompt.txt


In [2]:
known_imds = pd.read_csv(base_path / "reconws_diseases.csv")
known_imds

Unnamed: 0,id,diseaseAbbr,diseaseName,diseaseSyn,diseaseSource,diseaseDescription,diseaseType,icimdNosologyNumber,gene_id,omimGene,...,gard,genereviews,clingendosage,igsr1000genoms,gwascataloge,gwascentral,geno2mp,clinvar,lovd,malacard
0,1,GCH1A,GTP cyclohydrolase 1 deficiency,,ICIMD,,Inherited metabolic disease,21.1.02.01,2643,600225,...,,,GCH1,,GCH1,GCH1,GCH1,600225,GCH1,
1,2,GCH1,DOPA-responsive dystonia,Segawa disease,ICIMD,,Inherited metabolic disease,21.1.03.01,2643,,...,,,,,,,,,,
2,3,PTS,6-pyruvoyl-tetrahydropterin synthase deficiency,PTS-associated atypical phenylketonuria,ICIMD,,Inherited metabolic disease,21.1.04.01,5805,612719,...,,,PTS,,PTS,PTS,PTS,612719,PTS,
3,4,SPR,Sepiapterin reductase deficiency,SPR-associated atypical phenylketonuria,ICIMD,,Inherited metabolic disease,21.1.05.01,6697,182125,...,,,SPR,,SPR,SPR,SPR,182125,SPR,
4,5,TETB,Dihydropteridine reductase deficiency,QDPR-associated atypical phenylketonuria,ICIMD,,Inherited metabolic disease,21.1.06.01,5860,612676,...,5682.0,,QDPR,ENSG00000151552,QDPR,QDPR,QDPR,612676,QDPR,tetrahydrobiopterin_deficiency
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,2353,TDH6,Thyroid dyshormonogenesis 6,Familial thyroid dyshormonogenesis,OMIM,Familial thyroid dyshormonogenesis is a type o...,,,50506,606759,...,16843.0,,,,,,,,,
2353,2354,VDDR1A,Hypocalcemic vitamin D-dependent rickets,,OMIM,An early-onset hereditary vitamin D metabolism...,,,1594,609506,...,17319.0,,,,,,,,,
2354,2355,VDDR3,"Vitamin D-dependent rickets, type 3",Hypocalcemic vitamin D-resistant rickets,OMIM,Hypocalcemic vitamin D-resistant rickets (HVDR...,,,1576,124010,...,16805.0,,,,,,,,,
2355,2356,VDEGS,Van den Ende-Gupta syndrome,,OMIM,Van den Ende Gupta syndrome is present at birt...,,,91179,613619,...,3382.0,,,,,,,,,


In [3]:
known_imds = pd.read_csv(base_path / "reconws_diseases.csv")
known_imds = known_imds[~known_imds['omimDisease'].isna()]
known_imds['omimDisease'] = known_imds['omimDisease'].astype(int)
known_imds = known_imds[(known_imds['diseaseSource'] == 'ICIMD') | (known_imds['diseaseSource'] == 'IEMBASE')]
known_imds = known_imds['omimDisease'].unique()

# -----------------------------------------------
# known_imds  =  list / set of OMIM integers
# prompts_df  =  your original DataFrame
# -----------------------------------------------

# 1) how many rows for each diagnosis
counts = prompts_df["Correct Diagnosis"].value_counts()

# 2) first (or only) OMIM code associated with each diagnosis
diag2omim = (
    prompts_df
        .groupby("Correct Diagnosis", as_index=True)["OMIM"]
        .first()                       # <-- pick the first non-NA per group
)

# 3) build the final table
counts_df = (
    counts
        .to_frame("count")             # make it a DataFrame, column name = "count"
        .join(diag2omim)               # adds a second column called "OMIM"
        .assign(in_known_imd = lambda t: t["OMIM"].isin(known_imds))
)

print(counts_df.head())

                                                    count    OMIM  \
Correct Diagnosis                                                   
Developmental and epileptic encephalopathy 4          411  612164   
KBG syndrome                                          308  148050   
Developmental and epileptic encephalopathy 11         265  613721   
Glass syndrome                                        143  612313   
Mitochondrial DNA depletion syndrome 13 (enceph...     92  615471   

                                                    in_known_imd  
Correct Diagnosis                                                 
Developmental and epileptic encephalopathy 4                True  
KBG syndrome                                               False  
Developmental and epileptic encephalopathy 11              False  
Glass syndrome                                             False  
Mitochondrial DNA depletion syndrome 13 (enceph...          True  


In [22]:
df = counts_df

latex_table = (
    df.reset_index()                       # 1. move index → column
      .rename(columns={
          "index": "Diagnosis",            # new text column
          "count": "Count",
          "in_known_imd": "Known IMD"      # rename the flag
      })
      .drop(columns=["OMIM"])              # (optional) omit OMIM if you prefer
      .assign(**{                          # 2. prettify the flag
          "Known IMD": lambda t: t["Known IMD"].map({True: "Yes", False: "No"})
      })
      .to_latex(
          index=False, escape=False,
          longtable=True,
          column_format="p{7cm}rr",
          caption="Disease frequencies and IMD status",
          label="tab:disease_counts"
      )
)

print(latex_table)

\begin{longtable}{p{7cm}rr}
\caption{Disease frequencies and IMD status} \label{tab:disease_counts} \\
\toprule
Correct Diagnosis & Count & Known IMD \\
\midrule
\endfirsthead
\caption[]{Disease frequencies and IMD status} \\
\toprule
Correct Diagnosis & Count & Known IMD \\
\midrule
\endhead
\midrule
\multicolumn{3}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
Developmental and epileptic encephalopathy 4  & 411 & Yes \\
KBG syndrome  & 308 & No \\
Developmental and epileptic encephalopathy 11  & 265 & No \\
Glass syndrome  & 143 & No \\
Mitochondrial DNA depletion syndrome 13 (encephalomyopathic type)  & 92 & Yes \\
Holt-Oram syndrome  & 72 & No \\
Neurodevelopmental disorder with coarse facies and mild distal skeletal abnormalities  & 65 & No \\
Kabuki Syndrome 1  & 65 & Yes \\
Coffin-Siris syndrome 8  & 62 & No \\
Jacobsen syndrome  & 60 & No \\
Houge-Janssen syndrome 2  & 53 & No \\
ZTTK SYNDROME  & 50 & No \\
Mitochondrial DNA depletion syndrome 6 (hepa

In [2]:
# Count how many times each disease appears
disease_counts = prompts_df['Correct Diagnosis'].value_counts()

# Calculate mean and standard deviation
mean_per_disease = disease_counts.mean()
std_per_disease = disease_counts.std()

print(f"Average entries per disease: {mean_per_disease:.2f}")
print(f"Standard deviation: {std_per_disease:.2f}")

Average entries per disease: 12.24
Standard deviation: 32.86


In [4]:
prompts_df = prompts_df.dropna()
prompts_df['OMIM'] = prompts_df['OMIM'].astype(int)
prompts_df

Unnamed: 0_level_0,Case Description,Correct Diagnosis,OMIM,PMID
File Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PMID_10571775_KSN_II_1_en-prompt.txt,The proband was a female. Disease onset was no...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_KSN_II_1_en-prompt.txt
PMID_10571775_YAT_II_1_en-prompt.txt,The proband was a male. Disease onset was not ...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_YAT_II_1_en-prompt.txt
PMID_10580070_A_III_11_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_11_en-prompt.txt
PMID_10580070_A_III_13_en-prompt.txt,The proband was a male. Disease onset occurred...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_13_en-prompt.txt
PMID_10580070_A_III_5_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_5_en-prompt.txt
...,...,...,...,...
STX_Syrbe_3_en-prompt.txt,"The proband was a 5-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_3_en-prompt.txt
STX_Syrbe_4_en-prompt.txt,"The proband was a 6-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_4_en-prompt.txt
STX_Syrbe_5_en-prompt.txt,"The proband was a 4-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_5_en-prompt.txt
STX_Syrbe_6_en-prompt.txt,"The proband was a 2-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_6_en-prompt.txt


In [5]:
prompts_df = prompts_df.loc[preds['file_name'].unique()]

In [37]:
# %% [markdown]
# # Minimal GPT‑4o → OMIM demo
# One test case, no synonym table.

# %%
# 0  Install deps  (uncomment the next line the very first time)
# !pip install --quiet openai==1.* tqdm pandas

# %%
import json, os, math
import pandas as pd
import openai
from tqdm.auto import tqdm

openai.api_key = os.getenv("OPENAI_KEY")   # ← set this in your shell:  export OPENAI_API_KEY="sk‑..."

MODEL_NAME    = "gpt-4o"   # or "gpt-4o" / "gpt-4o-128k"
TEMPERATURE   = 0               # deterministic runs
MAX_WORDS     = 150             # rationale limit

# %%
# 1  Define the function‑calling schema
function_def = {
    "name": "diagnose_patient",
    "description": "Return the OMIM diagnosis for one patient vignette.",
    "parameters": {
        "type": "object",
        "properties": {
            "omim_id":   { "type": "string",
                           "description": "6‑digit OMIM identifier you judge most likely" },
            "diagnosis": { "type": "string",
                           "description": "Human‑readable disease name chosen" },
            "rationale": { "type": "string",
                           "description": f"Explanation ≤{MAX_WORDS} words" },
            "confidence":{ "type": "number",
                           "description": "0–1 confidence this is correct" }
        },
        "required": ["omim_id"]
    }
}

# %%
# 2  Prompt template
prompt_tmpl = f"""You are an expert clinical geneticist.
Analyse the patient vignette and call the function `diagnose_patient`.

Guidelines:
• Pick **one** Mendelian disease (the single most likely).
• If uncertain, still return your best guess and set confidence <0.3.
• The rationale must be ≤{MAX_WORDS} words.

<CASE>
{{vignette}}
</CASE>
"""

# %%
# 3  Helper to call GPT‑4o
def gpt_diagnose(vignette):
    messages=[{"role":"system","content":prompt_tmpl.format(vignette=vignette)}]
    rsp = openai.chat.completions.create(
        model       = MODEL_NAME,
        temperature = TEMPERATURE,
        messages    = messages,
        functions   = [function_def],
    )
    fc = rsp.choices[0].message.function_call
    return json.loads(fc.arguments) if fc else {"omim_id":"","diagnosis":"", "rationale":"","confidence":0}

# %%
# 4  YOUR *single* test case goes here  ↓↓↓
# test_cases = [
#     {
#         "vignette": """
# A 15‑year‑old tall male with hyperextensible joints, high‑arched palate, 
# pectus excavatum, bilateral lens dislocation, and aortic root dilation 
# (4.8 cm). Family history reveals similarly affected father who died at 34 
# from aortic dissection.
# """,
#         "omim": "154700"          # ← the correct answer you expect (Marfan syndrome)
#     }
# ]

# %%
# 5  Run the benchmark
results = []
total_cases = 50
for case in tqdm(prompts_df.sample(total_cases).iterrows(), total=total_cases):
    pred = gpt_diagnose(case[1]["Case Description"])
    correct = str(pred["omim_id"]) == str(case[1]["OMIM"])
    results.append({
        "gold":       case[1]["OMIM"],
        "pred_id":    pred["omim_id"],
        "diagnosis":  pred.get("diagnosis",""),
        "correct diagnosis": case[1]["Correct Diagnosis"],
        "confidence": pred.get("confidence", math.nan),
        "correct":    correct,
        "rationale":  pred.get("rationale","")
    })

pd.DataFrame(results)

  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,gold,pred_id,diagnosis,correct diagnosis,confidence,correct,rationale
0,102370,156200,Léri-Weill dyschondrosteosis,Acromicric dysplasia,0.3,False,"The proband's presentation of short stature, s..."
1,604377,601238,Spinocerebellar Ataxia Type 3 (Machado-Joseph ...,Mitochondrial complex IV deficiency nuclear ty...,0.25,False,"The proband's symptoms, including tremor, stra..."
2,617402,219200,"Congenital Disorder of Glycosylation, Type II",Cutis laxa autosomal recessive type IIC,0.7,False,The proband's symptoms align with a Congenital...
3,613721,308350,Allan-Herndon-Dudley syndrome,Developmental and epileptic encephalopathy 11,0.6,False,The clinical presentation of the patient is su...
4,148050,610443,SYNGAP1-related intellectual disability,KBG syndrome,0.3,False,The proband's symptoms of intellectual disabil...
5,620535,194050,Williams Syndrome,Developmental delay dysmorphic facies and brai...,0.8,False,The clinical features presented in the vignett...
6,268310,129400,"Robinow syndrome, autosomal recessive",Robinow syndrome autosomal recessive,0.8,False,The clinical features presented in the vignett...
7,209900,209900,Bardet-Biedl syndrome,Bardet-Biedl syndrome 1,0.6,True,"The proband's symptoms, including postaxial po..."
8,620511,300624,Mowat-Wilson syndrome,Fliedner-Zweier syndrome,0.6,False,"The proband's symptoms, including global devel..."
9,618362,616780,"Ohdo syndrome, SBBYS variant",Coffin-Siris syndrome 8,0.3,False,"The proband's features, including macrocephaly..."


In [6]:
import chromadb

chroma_client = chromadb.PersistentClient(
                                    path=str('/Users/timhulshof/Documents/test_chatimd_latest/chatimd_interface/backend/chatimd_backend/databases/hpo_synonym_db')
                                )

collection = chroma_client.get_collection(name="hpo_synonym")

new_client = chromadb.PersistentClient(
                                    path=str(base_path / 'synthetic_patients')
                                )
new_client = chromadb.PersistentClient(
                                    path=str('/Users/timhulshof/Documents/test_chatimd_latest/chatimd_interface/backend/chatimd_backend/databases/synthetic_patients')
                                )

profiles = new_client.get_collection(name="synthetic_disease_profiles_v1")

In [7]:
import pickle

relevant_hpos = pickle.load(open(base_path / 'relevant_hpo_terms.pkl', 'rb'))

In [8]:
from utils.hpo_ontology import load_ontology

# Load the HPO ontology
graph, ic_dict = load_ontology(annotations='OMIM')

In [9]:
from oaklib import get_adapter

adapter = get_adapter(f"simpleobo:{base_path}/mondo.obo")

In [10]:
with open(base_path / "depth_cache.pkl", "rb") as f:
    depth_cache = pickle.load(f)

In [32]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# Initialize dictionary to store times
execution_times = {}

# TOM
_, execution_times["TOM"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    # index=index,
    # orpha=False,
    collection=collection,
    weighted_score_active=True,
    # id_to_disease=id_to_disease,
    log_file_path=results_path / "evaluation_log_TOM_filtered_2.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_filtered_2.npz"
)

  0%|          | 0/4419 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_filtered_2.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_filtered_2.npz
Time taken: 26166.23 seconds


In [28]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)
import utils.overlap_method
# make changes to example.py file
importlib.reload(utils.overlap_method)


from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# TOM
_, execution_times["TOM_SA"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    # index=index,
    collection=collection,
    semantic_similarity=True,
    weighted_score_active=True,
    # id_to_disease=id_to_disease,
    log_file_path=results_path / "evaluation_log_TOM_SA_filtered_2.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA_filtered_2.npz"
)

  0%|          | 0/4419 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_filtered_2.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_filtered_2.npz
Time taken: 29924.86 seconds


NameError: name 'execution_times' is not defined

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re, ast
from pathlib import Path
import pandas as pd

# ------------------------------------------------------------------
# 0.  FILE LOCATIONS  ------------------------------------------------
# results_path = Path(".")          #  <- adjust as needed
# base_path    = Path(".")          #  <- adjust as needed
results_path = Path().resolve() / 'results'
base_path    = Path().resolve() / 'files'

RAW_TXT = results_path / "evaluation_log_case_description_SA.txt"
HPO_OBO = base_path   / "hp.obo"

# ------------------------------------------------------------------
# 1.  REGEX PATTERNS  ------------------------------------------------
re_file    = re.compile(r'^File:\s*(.*)')
re_correct = re.compile(r'^Correct diagnosis:\s*(.*)')
# re_omim = re.compile(r'^ID\s+(\S+)') 
re_omim = re.compile(r'^ID.*?(\S+)$') 

metric_patterns = {
    "number_hpo_terms"                  : re.compile(r'^Number HPO terms\s+([\d.]+)'),
    "new_metric"                        : re.compile(r'^New Metric\s+([\d.]+)'),
    "number_matched_hpo_terms"          : re.compile(r'^Number matched HPO terms\s+([\d.]+)'),
    "fraction_overlap_search_hpo_terms" : re.compile(r'^Fraction overlapping search HPO terms\s+([\d.]+)'),
    "fraction_matched_disease_hpo_terms": re.compile(r'^Fraction matched disease HPO terms\s+([\d.]+)'),
    "weighted_score"                    : re.compile(r'^Weighted score\s+([\d.]+)'),
    "similarity"                        : re.compile(r'^Similarity\s+([\d.]+)'),
    "score"                             : re.compile(r'^Score\s+([\d.]+)'),
    "probability"                       : re.compile(r'^Probability\s+([\d.]+)'),
    "fdr"                               : re.compile(r'^FDR\s+([\d.]+)')
}

# block delimiter: any one of these starts the NEXT disease / ends current
re_block_end = re.compile(
    r'^(Error processing disease:|Name: OMIM:|Exact match|No match)'
)

# ------------------------------------------------------------------
# 2.  LOAD hp.obo  (ID → name)  -------------------------------------
def load_hpo_dict(obo_path: Path) -> dict:
    hpo_name, current = {}, None
    with obo_path.open(encoding="utf-8") as fh:
        for line in fh:
            if line.startswith("id: HP:"):
                current = line.split()[1].strip()
            elif line.startswith("name:") and current:
                hpo_name[current] = line[5:].strip()
                current = None
    return hpo_name

hpo_dict = load_hpo_dict(HPO_OBO)
print(f"Loaded {len(hpo_dict):,} HPO terms")

# robust “IDs → names”
def ids_to_names(raw):
    if raw in (None, '', []):
        ids = []
    elif isinstance(raw, (list, set, tuple)):
        ids = list(raw)
    elif isinstance(raw, str):
        try:
            maybe = ast.literal_eval(raw)
            if isinstance(maybe, (list, set, tuple)):
                ids = list(maybe)
            else:
                ids = [str(maybe)]
        except (ValueError, SyntaxError):
            ids = [x.strip() for x in raw.split(",") if x.strip()]
    else:
        ids = []
    return [hpo_dict.get(x, x) for x in ids]

# split simple comma-separated string into list of IDs
split_ids = lambda s: [x.strip() for x in s.split(",") if x.strip()]

# ------------------------------------------------------------------
# 3.  PARSE LOG FILE  -----------------------------------------------
rows, disease, current_file, correct_dx = [], {}, None, None

def flush():
    if not disease:
        return
    disease["file_name"]         = current_file
    disease["correct_diagnosis"] = correct_dx
    for k in metric_patterns:                     # ensure all metric cols exist
        disease.setdefault(k, None)
    for k in ("patient_hpo_ids","overlap_hpo_ids","disease_hpo_ids"):
        disease.setdefault(k, [])
    rows.append(disease.copy())
    disease.clear()

# def collect_until_close(initial, close_char, it):
#     buff = initial
#     for nxt in it:                # safe; stops at EOF
#         buff += " " + nxt.rstrip()
#         if close_char in buff:
#             break
#     return buff.split(close_char)[0]

def collect_until_close(initial_fragment, close_char, it):
    """
    Return the substring inside the list delimiters.
    *initial_fragment* is the text after the opening '[' or '{'.
    If the close_char is already present, we never advance the iterator.
    """
    buff = initial_fragment
    if close_char not in buff:                 # only read more if needed
        for nxt in it:                         # stops automatically at EOF
            buff += " " + nxt.rstrip()
            if close_char in buff:
                break
    return buff.split(close_char)[0]

with RAW_TXT.open(encoding="utf-8") as fh:
    for raw in fh:
        line = raw.rstrip()
        if not line:
            continue

        # new FILE block
        m = re_file.match(line)
        if m:
            flush()
            current_file = m.group(1).strip()
            correct_dx   = None
            continue

        # correct diagnosis
        m = re_correct.match(line)
        if m:
            correct_dx = m.group(1).strip()
            continue

        # explicit end-of-disease markers
        if re_block_end.match(line):
            flush()
            continue

        # first free-text line → disease_name
        if "disease_name" not in disease:
            disease["disease_name"] = line.strip()
            continue

        # e) OMIM ID  ------------------------------------------------------
        m = re.search(r'OMIM:\d+', line)          # find OMIM:<digits> anywhere
        if m:
            disease["omim_id"] = m.group()        # e.g. "OMIM:107480"
            continue

        # numeric metrics
        for key, rx in metric_patterns.items():
            m = rx.match(line)
            if m:
                val = m.group(1)
                disease[key] = int(val) if val.isdigit() else float(val)
                break
        else:   # no metric matched → maybe HPO lines
            if line.startswith("Patient HPOs"):
                first = line.split("{", 1)[1] if "{" in line else line.split("[",1)[1]
                full  = collect_until_close(first, "}" if "{" in line else "]", fh)
                disease["patient_hpo_ids"] = split_ids(full)
                continue
            if line.startswith("Overlap HPOs"):
                first = line.split("{", 1)[1]
                full  = collect_until_close(first, "}", fh)
                disease["overlap_hpo_ids"] = split_ids(full)
                continue
            if line.startswith("Disease HPOs"):
                first = line.split("{", 1)[1] if "{" in line else line.split("[",1)[1]
                full  = collect_until_close(first, "}" if "{" in line else "]", fh)
                disease["disease_hpo_ids"] = split_ids(full)
                continue

# flush last disease at EOF
flush()

# ------------------------------------------------------------------
# 4.  BUILD DATAFRAME & TRANSLATE NAMES  ----------------------------
df = pd.DataFrame(rows)

for col in ("patient_hpo_ids","overlap_hpo_ids","disease_hpo_ids"):
    name_col = col.replace("_ids","_names")
    df[name_col] = df[col].apply(ids_to_names)

front = ["file_name","correct_diagnosis","disease_name","omim_id","new_metric"]
df = df[front + [c for c in df.columns if c not in front]]

print(df.head(8))
# df.to_pickle(".pkl")
df.to_csv   ("all_predictions_with_hpo_prompts_case_descriptions.csv", index=False)
print("\n✓  Saved tidy dataframe with full HPO IDs and names")

Loaded 19,434 HPO terms
  file_name      correct_diagnosis  \
0         1  DRAVET SYNDROME; DRVT   
1         1  DRAVET SYNDROME; DRVT   
2         1  DRAVET SYNDROME; DRVT   
3         2  DRAVET SYNDROME; DRVT   
4         2  DRAVET SYNDROME; DRVT   
5         2  DRAVET SYNDROME; DRVT   
6         2  DRAVET SYNDROME; DRVT   
7         2  DRAVET SYNDROME; DRVT   

                                        disease_name      omim_id  new_metric  \
0          Mental retardation, autosomal dominant 42  OMIM:616973    0.456696   
1  Multiple congenital anomalies-hypotonia-seizur...  OMIM:300868    0.445298   
2  Epileptic encephalopathy, early infantile, 6 (...  OMIM:607208    0.443617   
3     Developmental and epileptic encephalopathy 112  OMIM:620537    0.454924   
4          Mental retardation, autosomal dominant 42  OMIM:616973    0.405977   
5      Developmental and epileptic encephalopathy 91  OMIM:617711    0.402312   
6       Developmental delay with or without epilepsy  OMIM:620540 

In [1]:
#!/usr/bin/env python3
"""
Jupyter‑ready helper to **parse the SA evaluation log** and attach the
original case descriptions from *LiteratureCases_12_5_25.csv*.

Drop the whole cell into a notebook, adjust the three paths below if
needed, run, and you’ll get a tidy `pandas.DataFrame` plus a CSV on
disk.
"""

# ---------------------------------------------------------------------
# 0️⃣  CONFIG — adjust paths if your folders differ
# ---------------------------------------------------------------------
from pathlib import Path
import re, ast, pandas as pd

LOG_PATH   = Path("results/evaluation_log_case_description_SA.txt")
CASES_CSV  = Path("files/LiteratureCases_12_5_25.csv")
OBO_PATH   = Path("files/hp.obo")          # optional – for name mapping
OUT_FILE   = "all_predictions_with_hpo_prompts_case_descriptions.csv"

# ---------------------------------------------------------------------
# 1️⃣  HPO ID → name (only if hp.obo is available)
# ---------------------------------------------------------------------
def load_hpo_names(obo_path: Path) -> dict:
    """Return {"HP:0000001": "All" , ...} (skips obsolete terms)."""
    if not obo_path.exists():
        return {}
    names, current = {}, None
    with obo_path.open(encoding="utf-8") as fh:
        for line in fh:
            if line.startswith("id: HP:"):
                current = line.split()[1]
            elif line.startswith("name:") and current:
                names[current] = line.split("name:",1)[1].strip()
                current = None
    return names

HPO_NAME = load_hpo_names(OBO_PATH)
ids2name = lambda x: HPO_NAME.get(x, x)

# ---------------------------------------------------------------------
# 2️⃣  REGEX TEMPLATES
# ---------------------------------------------------------------------
RE_FILE     = re.compile(r"^File:\s*(\d+)")
RE_CORRECT  = re.compile(r"^Correct diagnosis:\s*(.*)")
RE_EXACT    = re.compile(r"^Exact match Rank:\s*(\d+)")
RE_DEEPEST  = re.compile(r"^Deep(est| ancestor) match Rank:\s*(\d+)")

METRIC_RX   = {
    "number_hpo_terms"                  : re.compile(r"^Number HPO terms\s+([\d.]+)"),
    "number_matched_hpo_terms"          : re.compile(r"^Number matched HPO terms\s+([\d.]+)"),
    "fraction_overlap_search_hpo_terms" : re.compile(r"^Fraction overlapping search HPO terms\s+([\d.]+)"),
    "fraction_matched_disease_hpo_terms": re.compile(r"^Fraction matched disease HPO terms\s+([\d.]+)"),
    "weighted_score"                    : re.compile(r"^Weighted score\s+([\d.]+)"),
    "similarity"                        : re.compile(r"^Similarity\s+([\d.]+)"),
    "score"                             : re.compile(r"^Score\s+([\d.]+)"),
    "probability"                       : re.compile(r"^Probability\s+([\d.]+)"),
    "fdr"                               : re.compile(r"^FDR\s+([\d.]+)")
}

END_BLOCK  = re.compile(r"^(Error processing disease:|Name: OMIM:|Exact match|Deepest ancestor match|No match|File:)\b")

# helpers for HPO set capture -------------------------------------------------
ID_SPLIT = lambda s: [x.strip() for x in s.split(',') if x.strip()]

def collect_until_close(first_fragment: str, close_char: str, iterator):
    buff = first_fragment
    if close_char not in buff:
        for nxt in iterator:
            buff += ' ' + nxt.rstrip()
            if close_char in buff:
                break
    return buff.split(close_char)[0]

# ---------------------------------------------------------------------
# 3️⃣  PARSE THE LOG
# ---------------------------------------------------------------------
rows, disease = [], {}
cur_file = cur_dx = None
with LOG_PATH.open(encoding='utf-8') as fh:
    for raw in fh:
        line = raw.rstrip()
        if not line:
            continue

        # New FILE delimiter --------------------------------------------------
        m = RE_FILE.match(line)
        if m:
            # flush any existing disease before starting a new file block
            if disease:
                rows.append(disease)
                disease = {}
            cur_file = int(m.group(1))
            cur_dx   = None
            continue

        # Correct diagnosis
        m = RE_CORRECT.match(line)
        if m:
            cur_dx = m.group(1).strip()
            continue

        # Explicit end‑of‑disease markers
        if END_BLOCK.match(line):
            if disease:
                rows.append(disease)
                disease = {}
            continue

        # First free‑text line = disease candidate name
        if "disease_name" not in disease:
            disease["disease_name"] = line
            continue

        # Ranks ---------------------------------------------------------------
        m = RE_EXACT.match(line)
        if m:
            disease["exact_match_rank"] = int(m.group(1))
            continue
        m = RE_DEEPEST.match(line)
        if m:
            disease["deepest_ancestor_rank"] = int(m.group(2))
            continue

        # OMIM ID anywhere in line
        m = re.search(r"OMIM:\d+", line)
        if m:
            disease["omim_id"] = m.group()
            continue

        # Numeric metrics -----------------------------------------------------
        for key, rx in METRIC_RX.items():
            mm = rx.match(line)
            if mm:
                val = mm.group(1)
                disease[key] = float(val) if "." in val else int(val)
                break
        else:  # HPO sets ------------------------------------------------------
            if line.startswith("Patient HPOs"):
                first = line.split("{",1)[1]
                ids   = collect_until_close(first, "}", fh)
                disease["patient_hpo_ids"] = ID_SPLIT(ids)
            elif line.startswith("Overlap HPOs"):
                first = line.split("{",1)[1]
                ids   = collect_until_close(first, "}", fh)
                disease["overlap_hpo_ids"] = ID_SPLIT(ids)
            elif line.startswith("Disease HPOs"):
                first = line.split("{",1)[1]
                ids   = collect_until_close(first, "}", fh)
                disease["disease_hpo_ids"] = ID_SPLIT(ids)

    # flush last block at EOF
    if disease:
        rows.append(disease)

# ---------------------------------------------------------------------
# 4️⃣  BUILD DATAFRAME AND MERGE CASE DESCRIPTIONS
# ---------------------------------------------------------------------
log_df = pd.DataFrame(rows)
log_df["id"] = log_df["file_name"] = log_df.pop("file_name") if "file_name" in log_df else log_df.index + 1
log_df["correct_diagnosis"] = cur_dx

# add name‑translated HPO columns (optional)
for col in ("patient_hpo_ids","overlap_hpo_ids","disease_hpo_ids"):
    if col in log_df:
        log_df[col.replace("_ids","_names")] = log_df[col].apply(lambda lst: [ids2name(x) for x in lst])

# Case descriptions -----------------------------------------------------------
case_df = pd.read_csv(CASES_CSV, usecols=["id","caseDescription"])
full_df = log_df.merge(case_df, on="id", how="left")

# Convenience boolean flags
full_df["exact_match_found"]     = full_df["exact_match_rank"].notna()
full_df["deepest_ancestor_found"] = full_df["deepest_ancestor_rank"].notna()

# Save & preview --------------------------------------------------------------
full_df.to_csv(OUT_FILE, index=False)
print(f"✓  Saved {len(full_df):,} rows → {OUT_FILE}")
full_df.head()


TypeError: 'float' object is not iterable

In [31]:
import utils.score_diagnosis 
import importlib
import utils.overlap_method
# make changes to example.py file
importlib.reload(utils.score_diagnosis)
importlib.reload(utils.overlap_method)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# TOM
_, execution_times["TOM_SA_HNSW_WHOLE"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    index=profiles,
    collection=collection,
    semantic_similarity=True,
    hnsw=True,
    k=1000,
    id_to_disease=None,
    weighted_score_active=True,
    log_file_path=results_path / "evaluation_log_TOM_SA_HNSW_smaller_k=1000.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA_HNSW_smaller_k=1000.npz"
)

  0%|          | 0/4419 [00:00<?, ?it/s]

INFO:backoff:Backing off send_request(...) for 1.0s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Read timed out. (read timeout=15))


Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_HNSW_smaller_k=1000.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_HNSW_smaller_k=1000.npz
Time taken: 17915.90 seconds


NameError: name 'execution_times' is not defined

In [26]:
from pathlib import Path
import pronto   # pip install pronto

# 1) load the ontology (only once)
hpo = pronto.Ontology(base_path / "hp.obo")   # path to your hp.obo

# 2) function to translate
def hpo_id_to_name(hpo_id: str) -> str:
    """Return the primary term name (or the code itself if not found)."""
    try:
        return hpo[hpo_id].name
    except KeyError:
        return hpo_id

# 3) translate your list
codes = [
    "HP:0000822", "HP:0001945", "HP:0001903", "HP:0001974", "HP:0007663",
    "HP:0001824", "HP:0002027", "HP:0003259", "HP:0002315", "HP:0025406",
    "HP:0000790", "HP:0005978", "HP:0002716", "HP:0002013", "HP:0002014"
]

for code in codes:
    print(f"{code}  →  {hpo_id_to_name(code)}")

  hpo = pronto.Ontology(base_path / "hp.obo")   # path to your hp.obo


HP:0000822  →  Hypertension
HP:0001945  →  Fever
HP:0001903  →  Anemia
HP:0001974  →  Leukocytosis
HP:0007663  →  Reduced visual acuity
HP:0001824  →  Weight loss
HP:0002027  →  Abdominal pain
HP:0003259  →  Elevated circulating creatinine concentration
HP:0002315  →  Headache
HP:0025406  →  Asthenia
HP:0000790  →  Hematuria
HP:0005978  →  Type II diabetes mellitus
HP:0002716  →  Lymphadenopathy
HP:0002013  →  Vomiting
HP:0002014  →  Diarrhea


In [57]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# TOM
_, execution_times["TOM_SA_HNSW_WHOLE"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    index=index,
    collection=collection,
    semantic_similarity=True,
    hnsw=True,
    k=int(len(final_profile_df) / 2),
    id_to_disease=id_to_disease,
    weighted_score_active=False,
    log_file_path=results_path / "evaluation_log_TOM_SA_HNSW_WHOLE_HALF.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA_HNSW_WHOLE_HALF.npz"
)

  0%|          | 0/5212 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_HNSW_WHOLE_HALF.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_HNSW_WHOLE_HALF.npz
Time taken: 35315.28 seconds


In [None]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate
import matplotlib.pyplot as plt

results_path = Path().resolve() / 'results'

# Initialize dictionary to store times
execution_times = {}

# TOM
_, execution_times["TOM"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    index=index,
    collection=collection,
    id_to_disease=id_to_disease,
    log_file_path=results_path / "evaluation_log_TOM.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM.npz"
)

# TOM_SA
_, execution_times["TOM_SA"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    index=index,
    id_to_disease=id_to_disease,
    collection=collection,
    semantic_similarity=True,
    log_file_path=results_path / "evaluation_log_TOM_SA.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA.npz"
)

# Print summary of execution times
print("\nSummary of Execution Times:")
for method, elapsed_time in execution_times.items():
    print(f"{method}: {elapsed_time:.2f} seconds")

# Extract method names and their corresponding execution times
methods = list(execution_times.keys())
times = list(execution_times.values())

# Create the bar chart
plt.figure(figsize=(8, 6))
bars = plt.bar(methods, times, color='skyblue')

# Add labels above each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f} s', ha='center', va='bottom')

# Add title and labels
plt.title("Execution Times for Different Methods")
plt.xlabel("Methods")
plt.ylabel("Execution Time (seconds)")

# Show the chart
plt.tight_layout()
plt.show()

  0%|          | 0/5145 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM.npz
Time taken: 31057.76 seconds


  0%|          | 0/5145 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA.npz
Time taken: 33124.02 seconds


  0%|          | 0/5145 [00:00<?, ?it/s]

RuntimeError: Input vector data wrong shape. Number of dimensions 0. Data must be a 1D or 2D array.

In [25]:
import pickle 

with open(results_path / 'saved_times.pkl', 'wb') as f:
    pickle.dump(execution_times, f)

In [30]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

execution_times = {}

# TOM_SA_HNSW
_, execution_times["TOM_SA_HNSW_higher_ef_m"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    index=index,
    id_to_disease=id_to_disease,
    collection=collection,
    semantic_similarity=True,
    diseases_to_consider=diseases_in_index,
    hnsw=True,
    log_file_path=results_path / "evaluation_log_TOM_SA_HNSW_higher_ef_m.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA_HNSW_higher_ef_m.npz"
)

# Print summary of execution times
print("\nSummary of Execution Times:")
for method, elapsed_time in execution_times.items():
    print(f"{method}: {elapsed_time:.2f} seconds")

# Extract method names and their corresponding execution times
methods = list(execution_times.keys())
times = list(execution_times.values())

# Create the bar chart
plt.figure(figsize=(8, 6))
bars = plt.bar(methods, times, color='skyblue')

# Add labels above each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f} s', ha='center', va='bottom')

# Add title and labels
plt.title("Execution Times for Different Methods")
plt.xlabel("Methods")
plt.ylabel("Execution Time (seconds)")

# Show the chart
plt.tight_layout()
plt.show()
# 1hr 38min


  0%|          | 0/5145 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_HNSW_higher_ef_m.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_HNSW_higher_ef_m.npz
Time taken: 6725.57 seconds


NameError: name 'execution_times' is not defined

In [31]:
# TOM_SA
execution_times = {}

_, execution_times["TOM_SA_index_diseases"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    index=index,
    id_to_disease=id_to_disease,
    collection=collection,
    semantic_similarity=True,
    diseases_to_consider=diseases_in_index,
    log_file_path=results_path / "evaluation_log_TOM_SA_index_diseases.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA_index_diseases.npz")

  0%|          | 0/5145 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_index_diseases.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_index_diseases.npz
Time taken: 14928.63 seconds


In [9]:
from utils.hdc import load_hpo_terms, create_hpo_vector_lookup, load_phenotype_hpoa, build_disease_vectors, create_disease_name_mapping

# Load data
hpo_terms = load_hpo_terms(base_path / 'hp.obo')
hpo_vectors = create_hpo_vector_lookup(hpo_terms)
hpoa_df = load_phenotype_hpoa(base_path / 'phenotype.hpoa')
hpoa_df = hpoa_df[hpoa_df['database_id'].str.startswith('OMIM')]
disease_vectors = build_disease_vectors(hpoa_df, hpo_vectors)
disease_names = create_disease_name_mapping(hpoa_df)

Building disease vectors...


In [11]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# TOM
_, execution_times["TOM_SA"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    index=None,
    collection=collection,
    semantic_similarity=True,
    weighted_score_active=False,
    id_to_disease=None,
    hdc=True, 
    disease_vectors=disease_vectors,
    disease_names=disease_names,
    hpo_vectors=hpo_vectors,
    log_file_path=results_path / "evaluation_log_HDC.txt", 
    output_ranks_path=results_path / "evaluation_ranks_HDC.npz")

  0%|          | 0/5212 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_HDC.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_HDC.npz
Time taken: 18121.96 seconds


NameError: name 'execution_times' is not defined