In [1]:
from pathlib import Path
import pandas as pd 
import ast

base_path = Path().resolve() / 'files'

prompts_df = pd.read_excel(base_path / 'prompts_omim_pmid_updated_file_final1.xlsx', index_col='File Name')

prompts_df = prompts_df.dropna()
prompts_df['OMIM'] = prompts_df['OMIM'].astype(int)

# --- if the column is already a real list you can skip this helper ----
def ensure_list(x):
    """Turn stringified list / set into a real Python list."""
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        if not x:
            return []
        try:                                     # try "[...]"  or "{...}"
            obj = ast.literal_eval(x)
            if isinstance(obj, (list, set, tuple)):
                return list(obj)
        except Exception:
            pass
        # fall-back: split on commas
        return [y.strip() for y in x.split(",") if y.strip()]
    return []

preds = pd.read_csv('all_predictions_with_hpo_prompts.csv')
preds["patient_hpo_names"] = preds["patient_hpo_names"].apply(ensure_list)
preds = preds[preds["patient_hpo_names"].apply(len) > 2].copy()

prompts_df = prompts_df.loc[preds['file_name'].unique()]
prompts_df

Unnamed: 0_level_0,Case Description,Correct Diagnosis,OMIM,PMID
File Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PMID_10571775_KSN_II_1_en-prompt.txt,The proband was a female. Disease onset was no...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_KSN_II_1_en-prompt.txt
PMID_10571775_YAT_II_1_en-prompt.txt,The proband was a male. Disease onset was not ...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_YAT_II_1_en-prompt.txt
PMID_10580070_A_III_11_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_11_en-prompt.txt
PMID_10580070_A_III_13_en-prompt.txt,The proband was a male. Disease onset occurred...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_13_en-prompt.txt
PMID_10580070_A_III_5_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_5_en-prompt.txt
...,...,...,...,...
STX_Syrbe_3_en-prompt.txt,"The proband was a 5-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_3_en-prompt.txt
STX_Syrbe_4_en-prompt.txt,"The proband was a 6-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_4_en-prompt.txt
STX_Syrbe_5_en-prompt.txt,"The proband was a 4-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_5_en-prompt.txt
STX_Syrbe_6_en-prompt.txt,"The proband was a 2-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_6_en-prompt.txt


In [2]:
known_imds = pd.read_csv(base_path / "reconws_diseases.csv")
known_imds

Unnamed: 0,id,diseaseAbbr,diseaseName,diseaseSyn,diseaseSource,diseaseDescription,diseaseType,icimdNosologyNumber,gene_id,omimGene,...,gard,genereviews,clingendosage,igsr1000genoms,gwascataloge,gwascentral,geno2mp,clinvar,lovd,malacard
0,1,GCH1A,GTP cyclohydrolase 1 deficiency,,ICIMD,,Inherited metabolic disease,21.1.02.01,2643,600225,...,,,GCH1,,GCH1,GCH1,GCH1,600225,GCH1,
1,2,GCH1,DOPA-responsive dystonia,Segawa disease,ICIMD,,Inherited metabolic disease,21.1.03.01,2643,,...,,,,,,,,,,
2,3,PTS,6-pyruvoyl-tetrahydropterin synthase deficiency,PTS-associated atypical phenylketonuria,ICIMD,,Inherited metabolic disease,21.1.04.01,5805,612719,...,,,PTS,,PTS,PTS,PTS,612719,PTS,
3,4,SPR,Sepiapterin reductase deficiency,SPR-associated atypical phenylketonuria,ICIMD,,Inherited metabolic disease,21.1.05.01,6697,182125,...,,,SPR,,SPR,SPR,SPR,182125,SPR,
4,5,TETB,Dihydropteridine reductase deficiency,QDPR-associated atypical phenylketonuria,ICIMD,,Inherited metabolic disease,21.1.06.01,5860,612676,...,5682.0,,QDPR,ENSG00000151552,QDPR,QDPR,QDPR,612676,QDPR,tetrahydrobiopterin_deficiency
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,2353,TDH6,Thyroid dyshormonogenesis 6,Familial thyroid dyshormonogenesis,OMIM,Familial thyroid dyshormonogenesis is a type o...,,,50506,606759,...,16843.0,,,,,,,,,
2353,2354,VDDR1A,Hypocalcemic vitamin D-dependent rickets,,OMIM,An early-onset hereditary vitamin D metabolism...,,,1594,609506,...,17319.0,,,,,,,,,
2354,2355,VDDR3,"Vitamin D-dependent rickets, type 3",Hypocalcemic vitamin D-resistant rickets,OMIM,Hypocalcemic vitamin D-resistant rickets (HVDR...,,,1576,124010,...,16805.0,,,,,,,,,
2355,2356,VDEGS,Van den Ende-Gupta syndrome,,OMIM,Van den Ende Gupta syndrome is present at birt...,,,91179,613619,...,3382.0,,,,,,,,,


In [3]:
known_imds = pd.read_csv(base_path / "reconws_diseases.csv")
known_imds = known_imds[~known_imds['omimDisease'].isna()]
known_imds['omimDisease'] = known_imds['omimDisease'].astype(int)
known_imds = known_imds[(known_imds['diseaseSource'] == 'ICIMD') | (known_imds['diseaseSource'] == 'IEMBASE')]
known_imds = known_imds['omimDisease'].unique()

# -----------------------------------------------
# known_imds  =  list / set of OMIM integers
# prompts_df  =  your original DataFrame
# -----------------------------------------------

# 1) how many rows for each diagnosis
counts = prompts_df["Correct Diagnosis"].value_counts()

# 2) first (or only) OMIM code associated with each diagnosis
diag2omim = (
    prompts_df
        .groupby("Correct Diagnosis", as_index=True)["OMIM"]
        .first()                       # <-- pick the first non-NA per group
)

# 3) build the final table
counts_df = (
    counts
        .to_frame("count")             # make it a DataFrame, column name = "count"
        .join(diag2omim)               # adds a second column called "OMIM"
        .assign(in_known_imd = lambda t: t["OMIM"].isin(known_imds))
)

print(counts_df.head())

                                                    count    OMIM  \
Correct Diagnosis                                                   
Developmental and epileptic encephalopathy 4          411  612164   
KBG syndrome                                          308  148050   
Developmental and epileptic encephalopathy 11         265  613721   
Glass syndrome                                        143  612313   
Mitochondrial DNA depletion syndrome 13 (enceph...     92  615471   

                                                    in_known_imd  
Correct Diagnosis                                                 
Developmental and epileptic encephalopathy 4                True  
KBG syndrome                                               False  
Developmental and epileptic encephalopathy 11              False  
Glass syndrome                                             False  
Mitochondrial DNA depletion syndrome 13 (enceph...          True  


In [2]:
# Count how many times each disease appears
disease_counts = prompts_df['Correct Diagnosis'].value_counts()

# Calculate mean and standard deviation
mean_per_disease = disease_counts.mean()
std_per_disease = disease_counts.std()

print(f"Average entries per disease: {mean_per_disease:.2f}")
print(f"Standard deviation: {std_per_disease:.2f}")

Average entries per disease: 12.24
Standard deviation: 32.86


In [4]:
prompts_df = prompts_df.dropna()
prompts_df['OMIM'] = prompts_df['OMIM'].astype(int)
prompts_df

Unnamed: 0_level_0,Case Description,Correct Diagnosis,OMIM,PMID
File Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PMID_10571775_KSN_II_1_en-prompt.txt,The proband was a female. Disease onset was no...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_KSN_II_1_en-prompt.txt
PMID_10571775_YAT_II_1_en-prompt.txt,The proband was a male. Disease onset was not ...,Distal renal tubular acidosis 4 with hemolytic...,611590,PMID_10571775_YAT_II_1_en-prompt.txt
PMID_10580070_A_III_11_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_11_en-prompt.txt
PMID_10580070_A_III_13_en-prompt.txt,The proband was a male. Disease onset occurred...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_13_en-prompt.txt
PMID_10580070_A_III_5_en-prompt.txt,The proband was a female. Disease onset occurr...,Cardiomyopathy dilated 1A,115200,PMID_10580070_A_III_5_en-prompt.txt
...,...,...,...,...
STX_Syrbe_3_en-prompt.txt,"The proband was a 5-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_3_en-prompt.txt
STX_Syrbe_4_en-prompt.txt,"The proband was a 6-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_4_en-prompt.txt
STX_Syrbe_5_en-prompt.txt,"The proband was a 4-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_5_en-prompt.txt
STX_Syrbe_6_en-prompt.txt,"The proband was a 2-year, 0-month old child. D...",Developmental and epileptic encephalopathy 4,612164,STX_Syrbe_6_en-prompt.txt


  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,gold,pred_id,diagnosis,correct diagnosis,confidence,correct,rationale
0,102370,156200,Léri-Weill dyschondrosteosis,Acromicric dysplasia,0.3,False,"The proband's presentation of short stature, s..."
1,604377,601238,Spinocerebellar Ataxia Type 3 (Machado-Joseph ...,Mitochondrial complex IV deficiency nuclear ty...,0.25,False,"The proband's symptoms, including tremor, stra..."
2,617402,219200,"Congenital Disorder of Glycosylation, Type II",Cutis laxa autosomal recessive type IIC,0.7,False,The proband's symptoms align with a Congenital...
3,613721,308350,Allan-Herndon-Dudley syndrome,Developmental and epileptic encephalopathy 11,0.6,False,The clinical presentation of the patient is su...
4,148050,610443,SYNGAP1-related intellectual disability,KBG syndrome,0.3,False,The proband's symptoms of intellectual disabil...
5,620535,194050,Williams Syndrome,Developmental delay dysmorphic facies and brai...,0.8,False,The clinical features presented in the vignett...
6,268310,129400,"Robinow syndrome, autosomal recessive",Robinow syndrome autosomal recessive,0.8,False,The clinical features presented in the vignett...
7,209900,209900,Bardet-Biedl syndrome,Bardet-Biedl syndrome 1,0.6,True,"The proband's symptoms, including postaxial po..."
8,620511,300624,Mowat-Wilson syndrome,Fliedner-Zweier syndrome,0.6,False,"The proband's symptoms, including global devel..."
9,618362,616780,"Ohdo syndrome, SBBYS variant",Coffin-Siris syndrome 8,0.3,False,"The proband's features, including macrocephaly..."


In [6]:
import chromadb

chroma_client = chromadb.PersistentClient(
                                    path=str('/Users/timhulshof/Documents/test_chatimd_latest/chatimd_interface/backend/chatimd_backend/databases/hpo_synonym_db')
                                )

collection = chroma_client.get_collection(name="hpo_synonym")

new_client = chromadb.PersistentClient(
                                    path=str(base_path / 'synthetic_patients')
                                )
new_client = chromadb.PersistentClient(
                                    path=str('/Users/timhulshof/Documents/test_chatimd_latest/chatimd_interface/backend/chatimd_backend/databases/synthetic_patients')
                                )

profiles = new_client.get_collection(name="synthetic_disease_profiles_v1")

In [7]:
import pickle

relevant_hpos = pickle.load(open(base_path / 'relevant_hpo_terms.pkl', 'rb'))

In [8]:
from utils.hpo_ontology import load_ontology

# Load the HPO ontology
graph, ic_dict = load_ontology(annotations='OMIM')

In [9]:
from oaklib import get_adapter

adapter = get_adapter(f"simpleobo:{base_path}/mondo.obo")

In [10]:
with open(base_path / "depth_cache.pkl", "rb") as f:
    depth_cache = pickle.load(f)

In [None]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# Initialize dictionary to store times
execution_times = {}

# TOM
_, execution_times["TOM"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    # index=index,
    # orpha=False,
    collection=collection,
    weighted_score_active=True,
    # id_to_disease=id_to_disease,
    log_file_path=results_path / "evaluation_log_TOM.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM.npz"
)

  0%|          | 0/4419 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_filtered_2.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_filtered_2.npz
Time taken: 26166.23 seconds


In [None]:
import utils.score_diagnosis 
import importlib

# make changes to example.py file
importlib.reload(utils.score_diagnosis)
import utils.overlap_method
# make changes to example.py file
importlib.reload(utils.overlap_method)


from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# TOM
_, execution_times["TOM_SA"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    # index=index,
    collection=collection,
    semantic_similarity=True,
    weighted_score_active=True,
    # id_to_disease=id_to_disease,
    log_file_path=results_path / "evaluation_log_TOM_SA.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA.npz"
)

  0%|          | 0/4419 [00:00<?, ?it/s]

Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_filtered_2.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_filtered_2.npz
Time taken: 29924.86 seconds


NameError: name 'execution_times' is not defined

In [None]:
import utils.score_diagnosis 
import importlib
import utils.overlap_method
# make changes to example.py file
importlib.reload(utils.score_diagnosis)
importlib.reload(utils.overlap_method)

from utils.score_diagnosis import evaluate_predictions_and_save_logs, timed_evaluate

results_path = Path().resolve() / 'results'

# TOM
_, execution_times["TOM_SA_HNSW_WHOLE"] = timed_evaluate(
    evaluate_predictions_and_save_logs,
    relevant_hpos=relevant_hpos, 
    prompts_df=prompts_df, 
    adapter=adapter, 
    graph=graph, 
    ic_dict=ic_dict, 
    depth_cache=depth_cache,
    index=profiles,
    collection=collection,
    semantic_similarity=True,
    hnsw=True,
    k=1000,
    id_to_disease=None,
    weighted_score_active=True,
    log_file_path=results_path / "evaluation_log_TOM_SA_HNSW.txt", 
    output_ranks_path=results_path / "evaluation_ranks_TOM_SA_HNSW.npz"
)

  0%|          | 0/4419 [00:00<?, ?it/s]

INFO:backoff:Backing off send_request(...) for 1.0s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Read timed out. (read timeout=15))


Logs saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_log_TOM_SA_HNSW_smaller_k=1000.txt
Ranks saved to /Users/timhulshof/Documents/analysis_chatimd/results/evaluation_ranks_TOM_SA_HNSW_smaller_k=1000.npz
Time taken: 17915.90 seconds


NameError: name 'execution_times' is not defined