In [26]:
import pandas as pd
import sqlite3
from math import factorial
import numpy as np
import math
from utils.hpo_ontology import sim_gic
from pathlib import Path

def predict_disease_from_hpo(hpos,
                             graph,
                             ic_dict,
                             OMIM_or_ORPHA='OMIM',
                             semantic_similarity=False,
                             pot_diseases=None,
                             weighted_score_active=False,
                             neg_hpos=None,  # Negative HPO parameter
                             extra_diseases=None    
                             ):
    """
    1) If 'pot_diseases' is not provided, generate it by collecting diseases associated
       with the given HPOs.
    2) Score each disease based on:
       - Overlap with the patient's HPOs (positive contribution)
       - Overlap with the negative HPOs (penalty)
       - Weighted by inverse frequency (IC).
    """

    # Convert neg_hpos to a set for efficient intersection checks
    if neg_hpos is None:
        neg_hpos = set()
    else:
        neg_hpos = set(neg_hpos)

    # Use pathlib to construct the file paths
    base_path = Path().resolve() / 'files'

    # Establish a connection to your database
    conn = sqlite3.connect(base_path / 'disease_db.sqlite')

    associated_diseases_with_hpo = {}
    associated_database_ids = {}

# ---------------------------------------------------------------------------
# 1.  pot_diseases IS NOT PROVIDED – FAST PATH
# ---------------------------------------------------------------------------
    if pot_diseases is None:
        # --- 1·1  Fetch EVERY (hpo_id, database_id) pair for the patient's HPOs —
        placeholders = ", ".join(["?"] * len(hpos))
        big_query = f"""
            SELECT hpo_id, database_id
            FROM hpo_data
            WHERE hpo_id IN ({placeholders})
            AND database_id LIKE ?
            AND database_id NOT LIKE '%NOT%'
        """
        like_pattern = f"%{OMIM_or_ORPHA}%"
        df_pairs = pd.read_sql_query(big_query, conn, params=list(hpos) + [like_pattern])

        if df_pairs.empty:
            # no diseases share any of the patient's HPOs
            return pd.DataFrame(columns=[
                'ID', 'Disease name', 'Number HPO terms', 'New Metric',
                'Number matched HPO terms', 'Fraction overlapping search HPO terms',
                'Fraction matched disease HPO terms', 'Weighted score',
                'Similarity', 'Score', 'Probability', 'FDR',
                'Patient HPOs', 'Overlap HPOs', 'Disease HPOs'
            ])

        # --- 1·2  Build helper dicts in vectorised pandas ------------------------
        #   {hpo -> set(disease_ids)}  and  {hpo -> count_of_diseases}
        hpo_to_diseases = (
            df_pairs.groupby("hpo_id")["database_id"]
                    .agg(set)
                    .to_dict()
        )
        associated_diseases_with_hpo = {h: len(s) for h, s in hpo_to_diseases.items()}

        # sort HPOs by rarity (ascending disease count)
        sorted_hpo = sorted(associated_diseases_with_hpo.items(), key=lambda t: t[1])

        # --- 1·3  Collect the union of all disease IDs, preserving rarity order ---
        pot_diseases_ordered = []
        seen = set()
        for hpo, _ in sorted_hpo:
            for db in hpo_to_diseases[hpo]:
                if db not in seen:
                    seen.add(db)
                    pot_diseases_ordered.append(db)

        # --- 1·4  Fetch *all* HPO terms for *all* potential diseases at once ------
        placeholders = ", ".join(["?"] * len(pot_diseases_ordered))
        all_hpo_query = f"""
            SELECT database_id, hpo_id
            FROM hpo_data
            WHERE database_id IN ({placeholders})
            AND database_id LIKE ?
            AND database_id NOT LIKE '%NOT%'
        """
        df_hpo_full = pd.read_sql_query(all_hpo_query, conn,
                                        params=pot_diseases_ordered + [like_pattern])

        hpo_per_disease = (
            df_hpo_full.groupby("database_id")["hpo_id"]
                    .agg(set)
                    .to_dict()
        )

        pot_diseases = pot_diseases_ordered

    # ---------------------------------------------------------------------------
    # 2.  WE ALREADY HAVE A LIST OF POTENTIAL DISEASE NAMES – FAST PATH
    # ---------------------------------------------------------------------------
    else:
        # --- 2·1  Resolve the user-supplied names -> database_id ----------------
        placeholders = ", ".join(["?"] * len(pot_diseases))
        name2id_query = f"""
            SELECT DISTINCT database_id
            FROM hpo_data
            WHERE disease_name IN ({placeholders})
        """
        pot_db_ids = pd.read_sql_query(name2id_query, conn, params=pot_diseases)["database_id"].tolist()

        if not pot_db_ids:                       # nothing matched → bail early
            return pd.DataFrame(columns=[
                'ID', 'Disease name', 'Number HPO terms', 'New Metric',
                'Number matched HPO terms', 'Fraction overlapping search HPO terms',
                'Fraction matched disease HPO terms', 'Weighted score',
                'Similarity', 'Score', 'Probability', 'FDR',
                'Patient HPOs', 'Overlap HPOs', 'Disease HPOs'
            ])

        # --- 2·2  Fetch *all* HPO terms for *all* those diseases in ONE query ---
        placeholders = ", ".join(["?"] * len(pot_db_ids))
        big_query = f"""
            SELECT database_id, hpo_id
            FROM hpo_data
            WHERE database_id IN ({placeholders})
            AND database_id LIKE ?
            AND database_id NOT LIKE '%NOT%'
        """
        like_pattern = f"%{OMIM_or_ORPHA}%"
        df_hpo = pd.read_sql_query(big_query, conn, params=pot_db_ids + [like_pattern])

        # --- 2·3  Build {database_id: set_of_hpo_terms} in vectorised pandas ----
        hpo_per_disease = (
            df_hpo.groupby("database_id")["hpo_id"]
                .agg(set)
                .to_dict()
        )

        # --- 2·4  Keep only diseases that share ≥1 patient HPO ------------------
        patient_hpo_set = set(hpos)
        filtered_pot_diseases = [
            db_id for db_id, termset in hpo_per_disease.items()
            # if patient_hpo_set & termset          # set intersection is non-empty
        ]

        pot_diseases = filtered_pot_diseases

    #---------------------------------------------------------------------------
    # 3. SCORING: REWARD MATCHES AND PENALIZE NEGATIVE HPOs
    #---------------------------------------------------------------------------
    data = []

    # Count total diseases to compute inverse frequency
    query_total_diseases = f"""
        SELECT COUNT(DISTINCT database_id)
        FROM hpo_data
        WHERE database_id LIKE '%{OMIM_or_ORPHA}%'
          AND database_id NOT LIKE '%NOT%'
    """
    total_diseases = pd.read_sql_query(query_total_diseases, conn).iloc[0, 0]

    # We'll define a penalty factor here (tweak as needed)
    penalty_factor = 0.5

    # Ensure uniqueness
    pot_diseases = list(set(pot_diseases))

    for name, termset in extra_diseases.items():
        # Skip if caller passed an empty HPO list
        if not termset:
            continue

        # If the disease name already exists (same spelling as DB),
        # merge the term sets; else add as entirely new entry.
        if name in hpo_per_disease:
            hpo_per_disease[name].update(termset)
        else:
            hpo_per_disease[name] = termset
            pot_diseases.append(name)

    for disease in pot_diseases:
        hpos_for_disease = hpo_per_disease[disease]

        # Positive overlap between patient HPOs and disease HPOs
        overlap = set(hpos).intersection(hpos_for_disease)
        overlap_count = len(overlap)

        # If no overlap or no disease HPOs, skip
        if overlap_count == 0 or len(hpos_for_disease) == 0:
            continue

        perc_with_disease = overlap_count / len(hpos_for_disease)
        perc_with_hpos = overlap_count / len(hpos)

        #---------------------
        # Calculate Weighted Score
        #---------------------
        weighted_score = 0.0
        sum_hpo_per_disease = 0

        # (a) Reward for matching patient HPOs
        for hpo in overlap:
            query_hpo_count = f"""
                SELECT COUNT(*) 
                FROM hpo_data
                WHERE hpo_id = ?
                  AND database_id LIKE '%{OMIM_or_ORPHA}%'
                  AND database_id NOT LIKE '%NOT%'
            """
            count_hpo = pd.read_sql_query(query_hpo_count, conn, params=(hpo,)).iloc[0, 0]
            if count_hpo == 0:
                continue

            sum_hpo_per_disease += count_hpo
            # Inverse frequency weight
            weight = np.log(total_diseases / count_hpo)
            weighted_score += weight

        if sum_hpo_per_disease == 0:
            continue

        # (b) Penalty for negative HPOs present in the disease
        negative_overlap = set(hpos_for_disease).intersection(neg_hpos)
        for hpo_n in negative_overlap:
            query_hpo_count_neg = f"""
                SELECT COUNT(*) 
                FROM hpo_data
                WHERE hpo_id = ?
                  AND database_id LIKE '%{OMIM_or_ORPHA}%'
                  AND database_id NOT LIKE '%NOT%'
            """
            count_hpo_neg = pd.read_sql_query(query_hpo_count_neg, conn, params=(hpo_n,)).iloc[0, 0]

            if count_hpo_neg == 0:
                continue

            # Same IC approach for negative terms
            weight_neg = np.log(total_diseases / count_hpo_neg)
            # Subtract some fraction as penalty
            weighted_score -= (weight_neg * penalty_factor)

        #---------------------
        # Probability-based metric
        #---------------------
        score = overlap_count / sum_hpo_per_disease
        prob = 1 / (
            factorial(len(hpos_for_disease)) /
            (factorial(overlap_count) * factorial(len(hpos_for_disease) - overlap_count))
        )
        fdr = prob * len(pot_diseases)

        try:
            # Retrieve disease name
            query_disease_name = f"""
                SELECT disease_name
                FROM hpo_data
                WHERE database_id = ?
                AND database_id LIKE '%{OMIM_or_ORPHA}%'
                AND database_id NOT LIKE '%NOT%'
                LIMIT 1
            """
            disease_name = pd.read_sql_query(query_disease_name, conn, params=(disease,)).iloc[0, 0]
        except IndexError:
            # If no disease name found, skip this entry
            disease_name = disease

        #---------------------
        # (Optional) Semantic Similarity
        #---------------------
        hpos_without_overlap = set(hpos) - overlap
        hpos_for_disease_without_overlap = set(hpos_for_disease) - overlap

        if semantic_similarity:
            similarity = sim_gic(
                list(hpos_without_overlap),
                list(hpos_for_disease_without_overlap),
                graph=graph,
                ic_dict=ic_dict
            )
        else:
            similarity = 0

        new_metric = perc_with_hpos + (1 - perc_with_hpos) * similarity

        # Filter by significance
        if prob < 0.05:
            data.append([
                disease,                        # 'ID'
                disease_name,                   # 'Disease name'
                len(hpos_for_disease),          # 'Number HPO terms'
                new_metric,                     # 'New Metric'
                overlap_count,                  # 'Number matched HPO terms'
                perc_with_disease,              # 'Fraction overlapping search HPO terms'
                perc_with_hpos,                 # 'Fraction matched disease HPO terms'
                weighted_score,                 # 'Weighted score' (with negative penalty)
                similarity,                     # 'Similarity'
                score,                          # 'Score'
                prob,                           # 'Probability'
                fdr,                             # 'FDR'
                set(hpos),                           # 'Patient HPOs'
                overlap,             # 'Overlap HPOs'
                set(hpos_for_disease)      # 'Disease HPOs'
            ])

    conn.close()

    #---------------------
    # Assemble and Sort DataFrame
    #---------------------
    columns = [
        'ID', 'Disease name', 'Number HPO terms', 'New Metric',
        'Number matched HPO terms', 'Fraction overlapping search HPO terms',
        'Fraction matched disease HPO terms', 'Weighted score',
        'Similarity', 'Score', 'Probability', 'FDR', 'Patient HPOs', 'Overlap HPOs', 'Disease HPOs'
    ]
    df = pd.DataFrame(data, columns=columns)
    df.index = df['ID']

    if weighted_score_active:
        df_sorted = df.sort_values(
            by=['New Metric', 'Weighted score', 'Fraction matched disease HPO terms', 'Fraction overlapping search HPO terms'],
            ascending=[False, False, False, False]
        )
    else:
        df_sorted = df.sort_values(
            by=['New Metric', 'Fraction matched disease HPO terms'],
            ascending=[False, False]
        )

    return df_sorted

In [27]:
from pathlib import Path
import pandas as pd

base_path = Path().resolve() / 'files'

cleaned_cases = pd.read_csv(f"{base_path}/../results/all_predictions_case_descriptions_cleaned_with_collector_combined.csv")
cleaned_cases = cleaned_cases.drop_duplicates(subset="file_id", keep="first")
cleaned_cases

Unnamed: 0,file_id,rank,predicted_id,predicted_name,correct_diagnosis_id,correct_diagnosis_name,case_description,patient_hpo_terms,disease_hpo_terms,overlap_hpo_terms,new_metric,fraction_overlap,fraction_matched,exact_match,deepest_ancestor,collector,responsibleGene,reference
0,2,1,OMIM:620537,Developmental and epileptic encephalopathy 112,OMIM:607208,DRAVET SYNDROME; DRVT,"The patient, a female, who is currently 11 yea...","{'HP:0012758', 'HP:0001328', 'HP:0002133', 'HP...","{'HP:0001270', 'HP:0032712', 'HP:0002133', 'HP...","{'HP:0000750', 'HP:0002069', 'HP:0001263', 'HP...",0.682135,0.125000,0.444444,False,False,IT,SCN1A,https://onlinelibrary.wiley.com/doi/full/10.11...
20,3,1,OMIM:620655,Alfadhel syndrome,OMIM:204750,ALPHA-AMINOADIPIC AND ALPHA-KETOADIPIC ACIDURI...,"patient presented with microcephaly, mild moto...","{'HP:0010863', 'HP:0001270', 'HP:0003355', 'HP...","{'HP:0000085', 'HP:0030863', 'HP:0001762', 'HP...","{'HP:0010863', 'HP:0001270', 'HP:0000252', 'HP...",0.627590,0.161290,0.625000,False,False,IT,DHTKD1,https://omim.org/entry/204750
40,4,1,OMIM:230650,"GM1-gangliosidosis, type III",OMIM:257220,"NIEMANN-PICK DISEASE, TYPE C1; NPC1",a 43-year-old man who presented with splenomeg...,"{'HP:0003651', 'HP:0000511', 'HP:0004356', 'HP...","{'HP:0000750', 'HP:0001332', 'HP:0002506', 'HP...","{'HP:0001256', 'HP:0001744', 'HP:0003651'}",0.501384,0.103448,0.428571,False,False,IT,NPC1,https://omim.org/entry/257220?search=DISEASE&h...
42,5,1,OMIM:261630,"Hyperphenylalaninemia, bh4-deficient, C",OMIM:257220,"NIEMANN-PICK DISEASE, TYPE C1; NPC1",A 25-year-old man presented with a 14-year his...,"{'HP:0002312', 'HP:0001332', 'HP:0001249', 'HP...","{'HP:0002514', 'HP:0004923', 'HP:0001332', 'HP...","{'HP:0001332', 'HP:0001249', 'HP:0002015', 'HP...",0.403764,0.238095,0.357143,False,False,IT,NPC1,https://pubmed.ncbi.nlm.nih.gov/30119649/
47,6,1,OMIM:256550,Neuraminidase deficiency,OMIM:257220,"NIEMANN-PICK DISEASE, TYPE C1; NPC1","The proband, a 28-year-old woman, presented wi...","{'HP:0025233', 'HP:0004333', 'HP:0001922', 'HP...","{'HP:0004333', 'HP:0000943', 'HP:0000518', 'HP...","{'HP:0001922', 'HP:0001744', 'HP:0004333', 'HP...",0.400000,0.117647,0.400000,False,False,IT,NPC1,https://pubmed.ncbi.nlm.nih.gov/30119649/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10062,974,1,OMIM:613869,"Myopathy, myofibrillar, fatal infantile hypert...",OMIM:613752,S-Adenosylhomocysteine hydrolase deficiency,The proband male was born at 34 weeks through ...,"{'HP:0003236', 'HP:0011410', 'HP:0012498', 'HP...","{'HP:0003236', 'HP:0000007', 'HP:0001612', 'HP...","{'HP:0003236', 'HP:0002878', 'HP:0001612', 'HP...",0.273596,0.307692,0.235294,False,False,Marina Wasef,AHCY,PMID: 35789945
10082,975,1,OMIM:614300,Hypermethioninemia due to adenosine kinase def...,OMIM:613752,S-Adenosylhomocysteine hydrolase deficiency,The first child is a female born at 37 weeks o...,"{'HP:0003236', 'HP:0008151', 'HP:0004798', 'HP...","{'HP:0001508', 'HP:0034730', 'HP:0002059', 'HP...","{'HP:0003235', 'HP:0003236', 'HP:0008151', 'HP...",0.333333,0.117647,0.333333,False,False,Marina Wasef,AHCY,PMID: 39512434
10090,976,1,OMIM:616606,Ring chromosome 14 syndrome,OMIM:613752,S-Adenosylhomocysteine hydrolase deficiency,An 18-month-old female patient was referred to...,"{'HP:0001182', 'HP:0002160', 'HP:0002240', 'HP...","{'HP:0000750', 'HP:0002384', 'HP:0000463', 'HP...","{'HP:0000316', 'HP:0001290', 'HP:0000286', 'HP...",0.366569,0.285714,0.216216,False,False,Marina Wasef,AHCY,PMID: 39634240
10110,977,1,OMIM:614498,"Rigidity and multifocal seizure syndrome, leth...",OMIM:616586,"P5CS deficiency, cutis laxa phenotype",The patient was born to a 23-year-old G2P1L1 m...,"{'HP:0001562', 'HP:0009110', 'HP:0001511', 'HP...","{'HP:0002169', 'HP:0000023', 'HP:0002104', 'HP...","{'HP:0000252', 'HP:0000023', 'HP:0000347'}",0.402169,0.083333,0.081081,False,False,Marina Wasef,ALDH18A1,PMID: 21739576


In [28]:
clin_resp = pd.read_csv(f"{base_path}/ClinResponses_with_caseDescInformation.csv")
clin_resp_lit_cases = clin_resp[clin_resp['ReportType'] == 'literature']['caseID'].unique()
clin_resp_lit_cases

array([158, 130,  97,  49,  40,  69,  45,  70,  26, 121,  84, 189, 124,
       168, 136,  25, 147, 120, 125,  61,  81, 139, 156,   4, 154, 161,
       112, 191, 175,  18,  34,  76,  80, 169, 108, 131,  67,  12, 117,
       106,  92, 182, 148,  75])

In [29]:
clinician_cases = cleaned_cases[cleaned_cases['file_id'].isin(clin_resp_lit_cases)]
cleaned_cases = cleaned_cases[~cleaned_cases['file_id'].isin(clin_resp_lit_cases)]
cleaned_cases.index = cleaned_cases['file_id']
cleaned_cases

Unnamed: 0_level_0,file_id,rank,predicted_id,predicted_name,correct_diagnosis_id,correct_diagnosis_name,case_description,patient_hpo_terms,disease_hpo_terms,overlap_hpo_terms,new_metric,fraction_overlap,fraction_matched,exact_match,deepest_ancestor,collector,responsibleGene,reference
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,2,1,OMIM:620537,Developmental and epileptic encephalopathy 112,OMIM:607208,DRAVET SYNDROME; DRVT,"The patient, a female, who is currently 11 yea...","{'HP:0012758', 'HP:0001328', 'HP:0002133', 'HP...","{'HP:0001270', 'HP:0032712', 'HP:0002133', 'HP...","{'HP:0000750', 'HP:0002069', 'HP:0001263', 'HP...",0.682135,0.125000,0.444444,False,False,IT,SCN1A,https://onlinelibrary.wiley.com/doi/full/10.11...
3,3,1,OMIM:620655,Alfadhel syndrome,OMIM:204750,ALPHA-AMINOADIPIC AND ALPHA-KETOADIPIC ACIDURI...,"patient presented with microcephaly, mild moto...","{'HP:0010863', 'HP:0001270', 'HP:0003355', 'HP...","{'HP:0000085', 'HP:0030863', 'HP:0001762', 'HP...","{'HP:0010863', 'HP:0001270', 'HP:0000252', 'HP...",0.627590,0.161290,0.625000,False,False,IT,DHTKD1,https://omim.org/entry/204750
5,5,1,OMIM:261630,"Hyperphenylalaninemia, bh4-deficient, C",OMIM:257220,"NIEMANN-PICK DISEASE, TYPE C1; NPC1",A 25-year-old man presented with a 14-year his...,"{'HP:0002312', 'HP:0001332', 'HP:0001249', 'HP...","{'HP:0002514', 'HP:0004923', 'HP:0001332', 'HP...","{'HP:0001332', 'HP:0001249', 'HP:0002015', 'HP...",0.403764,0.238095,0.357143,False,False,IT,NPC1,https://pubmed.ncbi.nlm.nih.gov/30119649/
6,6,1,OMIM:256550,Neuraminidase deficiency,OMIM:257220,"NIEMANN-PICK DISEASE, TYPE C1; NPC1","The proband, a 28-year-old woman, presented wi...","{'HP:0025233', 'HP:0004333', 'HP:0001922', 'HP...","{'HP:0004333', 'HP:0000943', 'HP:0000518', 'HP...","{'HP:0001922', 'HP:0001744', 'HP:0004333', 'HP...",0.400000,0.117647,0.400000,False,False,IT,NPC1,https://pubmed.ncbi.nlm.nih.gov/30119649/
7,7,1,OMIM:117360,"Spinocerebellar ataxia 29, congenital nonprogr...",OMIM:257220,"NIEMANN-PICK DISEASE, TYPE C1; NPC1 (adult form)",This male patient had no familial or personal ...,"{'HP:0000750', 'HP:0010845', 'HP:0200136', 'HP...","{'HP:0000666', 'HP:0001270', 'HP:0002384', 'HP...","{'HP:0002071', 'HP:0001332', 'HP:0001249', 'HP...",0.520000,0.092199,0.520000,False,False,IT,NPC1,https://academic.oup.com/brain/article/130/1/1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,974,1,OMIM:613869,"Myopathy, myofibrillar, fatal infantile hypert...",OMIM:613752,S-Adenosylhomocysteine hydrolase deficiency,The proband male was born at 34 weeks through ...,"{'HP:0003236', 'HP:0011410', 'HP:0012498', 'HP...","{'HP:0003236', 'HP:0000007', 'HP:0001612', 'HP...","{'HP:0003236', 'HP:0002878', 'HP:0001612', 'HP...",0.273596,0.307692,0.235294,False,False,Marina Wasef,AHCY,PMID: 35789945
975,975,1,OMIM:614300,Hypermethioninemia due to adenosine kinase def...,OMIM:613752,S-Adenosylhomocysteine hydrolase deficiency,The first child is a female born at 37 weeks o...,"{'HP:0003236', 'HP:0008151', 'HP:0004798', 'HP...","{'HP:0001508', 'HP:0034730', 'HP:0002059', 'HP...","{'HP:0003235', 'HP:0003236', 'HP:0008151', 'HP...",0.333333,0.117647,0.333333,False,False,Marina Wasef,AHCY,PMID: 39512434
976,976,1,OMIM:616606,Ring chromosome 14 syndrome,OMIM:613752,S-Adenosylhomocysteine hydrolase deficiency,An 18-month-old female patient was referred to...,"{'HP:0001182', 'HP:0002160', 'HP:0002240', 'HP...","{'HP:0000750', 'HP:0002384', 'HP:0000463', 'HP...","{'HP:0000316', 'HP:0001290', 'HP:0000286', 'HP...",0.366569,0.285714,0.216216,False,False,Marina Wasef,AHCY,PMID: 39634240
977,977,1,OMIM:614498,"Rigidity and multifocal seizure syndrome, leth...",OMIM:616586,"P5CS deficiency, cutis laxa phenotype",The patient was born to a 23-year-old G2P1L1 m...,"{'HP:0001562', 'HP:0009110', 'HP:0001511', 'HP...","{'HP:0002169', 'HP:0000023', 'HP:0002104', 'HP...","{'HP:0000252', 'HP:0000023', 'HP:0000347'}",0.402169,0.083333,0.081081,False,False,Marina Wasef,ALDH18A1,PMID: 21739576


In [30]:
import json

# Step 1: Load your JSON file
with open(base_path / "cleaned_case_description_hpos.json", "r") as f:
    case_hpos_old = json.load(f)

with open(base_path / "cleaned_case_description_hpos_new.json", "r") as f:
    case_hpos_new = json.load(f)

case_hpos = case_hpos_old | case_hpos_new
len(case_hpos)

770

In [6]:
# name = "Galactosemia, classical"
name = "WOLF-HIRSCHHORN SYNDROME"
# name = "NIEMANN-PICK DISEASE, TYPE C1; NPC1"
cases = cleaned_cases[cleaned_cases['correct_diagnosis_name'] == name]
# cases = cleaned_cases[cleaned_cases['correct_diagnosis_name'].str.contains('niemann', case=False, na=False)]
cases.index = cases['file_id']
cases

Unnamed: 0_level_0,file_id,rank,predicted_id,predicted_name,correct_diagnosis_id,correct_diagnosis_name,case_description,patient_hpo_terms,disease_hpo_terms,overlap_hpo_terms,new_metric,fraction_overlap,fraction_matched,exact_match,deepest_ancestor,collector,responsibleGene,reference
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
536,536,1,OMIM:217980,"Corpus callosum, agenesis of, with facial anom...",OMIM:194190,WOLF-HIRSCHHORN SYNDROME,Baby boy M. J. was born at 36 weeks' gestation...,"{'HP:0030864', 'HP:0000377', 'HP:0001511', 'HP...","{'HP:0000377', 'HP:0001545', 'HP:0003196', 'HP...","{'HP:0000377', 'HP:0000347', 'HP:0000047', 'HP...",0.628602,0.159091,0.333333,False,False,FKM,IC,PMID: 25137600 DOI: 10.1097/ANC.0000000000000116
537,537,1,OMIM:235730,Mowat-Wilson syndrome,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,"A 13-year-old boy, the first child of a young,...","{'HP:0000174', 'HP:0006698', 'HP:0000614', 'HP...","{'HP:0000518', 'HP:0000565', 'HP:0001270', 'HP...","{'HP:0001631', 'HP:0000047', 'HP:0001270', 'HP...",0.449221,0.117647,0.444444,False,True,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...
538,538,1,OMIM:147920,Kabuki Syndrome 1,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,"A 4-year-old girl, the only child of non-relat...","{'HP:0000164', 'HP:0001999', 'HP:0005268', 'HP...","{'HP:0000851', 'HP:0000358', 'HP:0003196', 'HP...","{'HP:0000164', 'HP:0001631', 'HP:0000960', 'HP...",0.594731,0.054645,0.588235,False,False,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...
539,539,1,OMIM:216340,Yunis-Varon syndrome,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,"A 2-year-old boy, the second child of an unrel...","{'HP:0007598', 'HP:0001156', 'HP:0001511', 'HP...","{'HP:0007333', 'HP:0005461', 'HP:0000737', 'HP...","{'HP:0007598', 'HP:0001511', 'HP:0000047', 'HP...",0.573786,0.081818,0.5625,False,False,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...
541,541,1,OMIM:609029,Emanuel syndrome,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,"A 1-year-old boy, the only child of an unrelat...","{'HP:0001344', 'HP:0001999', 'HP:0001631', 'HP...","{'HP:0001562', 'HP:0001511', 'HP:0000400', 'HP...","{'HP:0001631', 'HP:0002205', 'HP:0000960', 'HP...",0.614114,0.088235,0.6,False,False,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...
542,542,1,OMIM:614866,Peroxisome biogenesis disorder 5A (Zellweger),OMIM:194190,WOLF-HIRSCHHORN SYNDROME,"A 5-year-old girl, the first child of an unrel...","{'HP:0001999', 'HP:0001511', 'HP:0001631', 'HP...","{'HP:0000311', 'HP:0001433', 'HP:0001762', 'HP...","{'HP:0001511', 'HP:0001631', 'HP:0001252', 'HP...",0.440545,0.08046,0.4375,False,False,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...
544,544,1,OMIM:618356,Neurodevelopmental disorder with central and p...,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,A female patient was observed with significant...,"{'HP:0000316', 'HP:0001511', 'HP:0000347', 'HP...","{'HP:0000878', 'HP:0002075', 'HP:0000237', 'HP...","{'HP:0000316', 'HP:0011968', 'HP:0000347', 'HP...",0.7,0.189189,0.7,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830
545,545,1,OMIM:615802,Neurodevelopmental disorder with dysmorphic fe...,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,This female patient had notable intrauterine g...,"{'HP:0001511', 'HP:0003202', 'HP:0001252', 'HP...","{'HP:0000582', 'HP:0000395', 'HP:0000400', 'HP...","{'HP:0003202', 'HP:0001252', 'HP:0000733', 'HP...",0.745815,0.118644,0.7,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830
546,546,1,OMIM:264470,Peroxisomal acyl-CoA oxidase deficiency,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,A female patient presented with severe develop...,"{'HP:0008619', 'HP:0002839', 'HP:0010864', 'HP...","{'HP:0000654', 'HP:0000737', 'HP:0002415', 'HP...","{'HP:0010864', 'HP:0008619', 'HP:0011344'}",0.6,0.078947,0.6,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830
547,547,1,OMIM:612513,Chromosome 2p16.1-p15 deletion syndrome,OMIM:194190,WOLF-HIRSCHHORN SYNDROME,This male patient had growth retardation and a...,"{'HP:0002353', 'HP:0011968', 'HP:0011471', 'HP...","{'HP:0000232', 'HP:0000506', 'HP:0001166', 'HP...","{'HP:0002079', 'HP:0002353', 'HP:0011968', 'HP...",0.667718,0.065574,0.666667,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830


In [7]:
cleaned_cases = cleaned_cases.drop(cases.index)

In [8]:
cases['file_id'] = cases['file_id'].astype(str)
cases['HPO_terms'] = cases['file_id'].map(case_hpos)
cleaned_cases['file_id'] = cleaned_cases['file_id'].astype(str)
cleaned_cases['HPO_terms'] = cleaned_cases['file_id'].map(case_hpos)
len(cases.dropna()), len(cleaned_cases.dropna())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases['file_id'] = cases['file_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases['HPO_terms'] = cases['file_id'].map(case_hpos)


(20, 708)

In [9]:
# import chromadb
# from tqdm.auto import tqdm
# from utils.synthetic_dataset_utils import aggregate_embeddings_average

# chroma_client = chromadb.PersistentClient(
#                                     path=str('/Users/timhulshof/Documents/test_chatimd_latest/chatimd_interface/backend/chatimd_backend/databases/hpo_synonym_db')
#                                 )

# collection = chroma_client.get_collection(name="hpo_synonym")

# # Wrap `tqdm` around `apply` with `aggregate_embeddings_average`
# tqdm.pandas(desc="Calculating Embeddings")

# cases['Embeddings'] = cases['HPO_terms'].progress_apply(lambda x: aggregate_embeddings_average(x, collection))
# cases

In [10]:
cases["correct_diagnosis_name"] = cases["correct_diagnosis_name"].str.replace(name, f"{name} atypical")
cleaned_cases["correct_diagnosis_name"] = (cleaned_cases["correct_diagnosis_name"] + " atypical")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cases["correct_diagnosis_name"] = cases["correct_diagnosis_name"].str.replace(name, f"{name} atypical")


In [11]:
extra_diseases = {}
for disease_name in cleaned_cases['correct_diagnosis_name'].unique():
    sub_df = cleaned_cases[cleaned_cases['correct_diagnosis_name'] == disease_name]
    atypical_terms = set(t for sublist in sub_df["HPO_terms"] for t in sublist)
    extra_diseases[disease_name] = atypical_terms

In [22]:
# Random states 42, 64, 128
cases_sample = cases.sample(10, random_state=64)
cases_remaining = cases.drop(cases_sample.index)

In [23]:
from utils.synthetic_dataset_utils import get_disease_profile

atypical_terms = set(t for sublist in cases_sample["HPO_terms"] for t in sublist)

cases_remaining["atypical_terms"] = [list(atypical_terms)] * len(cases_remaining)

cases_remaining["overlap_count_atypical"] = cases_remaining["HPO_terms"].apply(
    lambda terms: len(set(terms) & atypical_terms)
)

hpo_data = pd.read_csv(base_path / 'phenotype.hpoa', delimiter='\t', comment='#', low_memory=False)

omim = cases_remaining.iloc[0]['correct_diagnosis_id']
classical_hpos = set(get_disease_profile(hpo_data, disease_id=omim).keys())

cases_remaining["overlap_count_classical"] = cases_remaining["HPO_terms"].apply(
    lambda terms: len(set(terms) & classical_hpos)
)

cases_remaining["overlap_terms_atypical"] = cases_remaining["HPO_terms"].apply(
    lambda terms: list(set(terms) & atypical_terms)
)

# cases_remaining.drop(columns=["HPO_terms"], inplace=True)

cases_remaining

Unnamed: 0_level_0,file_id,rank,predicted_id,predicted_name,correct_diagnosis_id,correct_diagnosis_name,case_description,patient_hpo_terms,disease_hpo_terms,overlap_hpo_terms,...,exact_match,deepest_ancestor,collector,responsibleGene,reference,HPO_terms,atypical_terms,overlap_count_atypical,overlap_count_classical,overlap_terms_atypical
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
538,538,1,OMIM:147920,Kabuki Syndrome 1,OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,"A 4-year-old girl, the only child of non-relat...","{'HP:0000164', 'HP:0001999', 'HP:0005268', 'HP...","{'HP:0000851', 'HP:0000358', 'HP:0003196', 'HP...","{'HP:0000164', 'HP:0001631', 'HP:0000960', 'HP...",...,False,False,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...,"[HP:0000164, HP:0000252, HP:0000519, HP:000096...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",10,9,"[HP:0001629, HP:0004322, HP:0001631, HP:000126..."
541,541,1,OMIM:609029,Emanuel syndrome,OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,"A 1-year-old boy, the only child of an unrelat...","{'HP:0001344', 'HP:0001999', 'HP:0001631', 'HP...","{'HP:0001562', 'HP:0001511', 'HP:0000400', 'HP...","{'HP:0001631', 'HP:0002205', 'HP:0000960', 'HP...",...,False,False,FKM,IC,PMID: 34572183 PMCID: PMC8471045 DOI: 10.3390/...,"[HP:0000252, HP:0000750, HP:0000960, HP:000125...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",9,4,"[HP:0001631, HP:0000960, HP:0001250, HP:000075..."
544,544,1,OMIM:618356,Neurodevelopmental disorder with central and p...,OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A female patient was observed with significant...,"{'HP:0000316', 'HP:0001511', 'HP:0000347', 'HP...","{'HP:0000878', 'HP:0002075', 'HP:0000237', 'HP...","{'HP:0000316', 'HP:0011968', 'HP:0000347', 'HP...",...,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0000252, HP:0000316, HP:0000347, HP:000125...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",5,6,"[HP:0011968, HP:0001511, HP:0000347, HP:000125..."
546,546,1,OMIM:264470,Peroxisomal acyl-CoA oxidase deficiency,OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A female patient presented with severe develop...,"{'HP:0008619', 'HP:0002839', 'HP:0010864', 'HP...","{'HP:0000654', 'HP:0000737', 'HP:0002415', 'HP...","{'HP:0010864', 'HP:0008619', 'HP:0011344'}",...,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0002839, HP:0008619, HP:0010864, HP:001134...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",2,1,"[HP:0010864, HP:0011344]"
548,548,1,OMIM:620465,"Epilepsy, early-onset, 3, with or without deve...",OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A female patient exhibited a characteristic cr...,"{'HP:0001344', 'HP:0000316', 'HP:0001631', 'HP...","{'HP:0007359', 'HP:0002384', 'HP:0000006', 'HP...","{'HP:0001344', 'HP:0010864'}",...,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0000316, HP:0000426, HP:0001250, HP:000134...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",5,4,"[HP:0010864, HP:0001631, HP:0001250, HP:001134..."
552,552,1,OMIM:613385,"Autoimmune disease, multisystem, with facial d...",OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A male patient had profound developmental dela...,"{'HP:0002353', 'HP:0002205', 'HP:0001290', 'HP...","{'HP:0000331', 'HP:0000358', 'HP:0001270', 'HP...","{'HP:0001999', 'HP:0001252', 'HP:0001290', 'HP...",...,False,True,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0001252, HP:0001290, HP:0001999, HP:000220...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",6,2,"[HP:0012736, HP:0001252, HP:0002353, HP:000220..."
553,553,1,OMIM:616462,"Acrofacial dysostosis, Cincinnati type",OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,"This female patient exhibited scoliosis, kypho...","{'HP:0000924', 'HP:0001166', 'HP:0001155', 'HP...","{'HP:0009110', 'HP:0008807', 'HP:0001511', 'HP...","{'HP:0002650', 'HP:0001250'}",...,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0000924, HP:0001155, HP:0001166, HP:000125...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",2,2,"[HP:0001250, HP:0000924]"
554,554,1,OMIM:618454,Developmental delay with or without dysmorphic...,OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A female patient exhibited a distinct facial p...,"{'HP:0001344', 'HP:0000431', 'HP:0000010', 'HP...","{'HP:0002164', 'HP:0000358', 'HP:0003196', 'HP...","{'HP:0001344', 'HP:0000431', 'HP:0000076', 'HP...",...,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0000010, HP:0000076, HP:0000431, HP:000134...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",4,2,"[HP:0002553, HP:0012736, HP:0001344, HP:0001627]"
555,555,1,OMIM:105830,Angelman syndrome,OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A female patient exhibited delayed physical an...,"{'HP:0031058', 'HP:0002353', 'HP:0001270', 'HP...","{'HP:0001270', 'HP:0002650', 'HP:0000639', 'HP...","{'HP:0002353', 'HP:0001270', 'HP:0031936', 'HP...",...,False,True,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0001250, HP:0001263, HP:0001270, HP:000235...","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",5,3,"[HP:0001270, HP:0031058, HP:0002353, HP:000126..."
557,557,1,OMIM:619854,"Neurodevelopmental disorder with hypotonia, im...",OMIM:194190,WOLF-HIRSCHHORN SYNDROME atypical,A female patient exhibited profound developmen...,"{'HP:0012736', 'HP:0031936', 'HP:0001250', 'HP...","{'HP:0000729', 'HP:0001182', 'HP:0001847', 'HP...","{'HP:0031936', 'HP:0001250', 'HP:0011968'}",...,False,False,FKM,IC,PMID: 10103318 DOI: 10.1542/peds.103.4.830,"[HP:0001250, HP:0011968, HP:0012736, HP:0031936]","[HP:0000967, HP:0001629, HP:0001156, HP:041003...",3,1,"[HP:0012736, HP:0011968, HP:0001250]"


In [14]:
# pd.concat([pd.read_csv('../../Downloads/niemann_pick_remaining.csv'), pd.read_csv('../../Downloads/galactosemia_remaining.csv'), pd.read_csv('../../Downloads/wolf_remaining.csv')])

In [24]:
from utils.hpo_ontology import load_ontology

atypical = {f'{name} atypical': atypical_terms}
extra_diseases = extra_diseases | atypical

# Load the HPO ontology
graph, ic_dict = load_ontology(annotations='OMIM')

print(f"{name} cases:")
for i in range(len(cases_remaining)):

    patient_hpos     = cases_remaining.iloc[i]['HPO_terms']  # list of lists of HPO IDs
    patient_idx      = cases_remaining.iloc[i]['file_id']

    pred_df = predict_disease_from_hpo(patient_hpos,
                             graph,
                             ic_dict,
                             OMIM_or_ORPHA='OMIM',
                             semantic_similarity=True,
                             pot_diseases=None,
                             weighted_score_active=True,
                             neg_hpos=None,  # Negative HPO parameter
                             extra_diseases=extra_diseases
                             )
    
    row_num_atypical = pred_df.reset_index(drop=True).index[pred_df["ID"] == f"{name} atypical"]
    row_num_classical = pred_df.reset_index(drop=True).index[pred_df["ID"] == omim]

    if not row_num_atypical.empty:
        if row_num_classical.empty:
            print(f"Patient {patient_idx:<3} | classical → atypical rank : "
                f"{'Not Found':>10} → {row_num_atypical[0] + 1:<3}")
        else:
            print(f"Patient {patient_idx:<3} | classical → atypical rank : "
                f"{row_num_classical[0] + 1:<3} → {row_num_atypical[0] + 1:<3}")
    else:
        print(f"Patient {patient_idx:<3} | classical → atypical rank : "
            f"{'Value not found':>20}")


WOLF-HIRSCHHORN SYNDROME cases:
Patient 538 | classical → atypical rank : 6   → 1  
Patient 541 | classical → atypical rank : 161 → 1  
Patient 544 | classical → atypical rank : 8   → 62 
Patient 546 | classical → atypical rank : 384 → 53 
Patient 548 | classical → atypical rank : 57  → 6  
Patient 552 | classical → atypical rank : 440 → 1  
Patient 553 | classical → atypical rank : 60  → 329
Patient 554 | classical → atypical rank : 32  → 5  
Patient 555 | classical → atypical rank : 121 → 1  
Patient 557 | classical → atypical rank : 1168 → 39 


In [None]:
pred_df

Unnamed: 0_level_0,ID,Disease name,Number HPO terms,New Metric,Number matched HPO terms,Fraction overlapping search HPO terms,Fraction matched disease HPO terms,Weighted score,Similarity,Score,Probability,FDR,Patient HPOs,Overlap HPOs,Disease HPOs
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
OMIM:117360,OMIM:117360,"Spinocerebellar ataxia 29, congenital nonprogr...",141,0.592173,14,0.099291,0.583333,50.155368,0.021215,0.002436,1.383660e-19,4.424945e-16,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...","{HP:0001249, HP:0001332, HP:0001251, HP:000126...","{HP:0011094, HP:0001251, HP:0004322, HP:003312..."
OMIM:256731,OMIM:256731,"Ceroid lipofuscinosis, neuronal, 5",25,0.522518,9,0.360000,0.375000,30.365214,0.236030,0.001872,4.894823e-07,1.565364e-03,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...","{HP:0001249, HP:0001251, HP:0002376, HP:000131...","{HP:0001249, HP:0000007, HP:0001251, HP:000192..."
"NIEMANN-PICK DISEASE, TYPE C1; NPC1 atypical","NIEMANN-PICK DISEASE, TYPE C1; NPC1 atypical","NIEMANN-PICK DISEASE, TYPE C1; NPC1 atypical",104,0.466708,11,0.105769,0.458333,38.554007,0.015460,0.002498,4.483404e-15,1.433793e-11,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...","{HP:0001249, HP:0001332, HP:0001251, HP:000237...","{HP:0000280, HP:0010524, HP:0001251, HP:000074..."
OMIM:614381,OMIM:614381,"Leukodystrophy, hypomyelinating, 8, with or wi...",40,0.459853,11,0.275000,0.458333,40.228427,0.002806,0.002724,4.325631e-10,1.383337e-06,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...","{HP:0001249, HP:0001332, HP:0001251, HP:000725...","{HP:0001251, HP:0004322, HP:0000750, HP:000127..."
OMIM:619606,OMIM:619606,Developmental and epileptic encephalopathy 99,33,0.424179,4,0.121212,0.166667,12.269920,0.309015,0.001659,2.443793e-05,7.815249e-02,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...","{HP:0001249, HP:0001272, HP:0002307, HP:0002119}","{HP:0025097, HP:0003623, HP:0001272, HP:000735..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OMIM:143095,OMIM:143095,Spondyloepiphyseal dysplasia with congenital j...,77,0.041667,1,0.012987,0.041667,1.692809,0.000000,0.000653,1.298701e-02,4.153247e+01,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...",{HP:0001249},"{HP:0001659, HP:0003071, HP:0001371, HP:000432..."
OMIM:613610,OMIM:613610,Cranioectodermal dysplasia 2,80,0.041667,1,0.012500,0.041667,1.692809,0.000000,0.000653,1.250000e-02,3.997500e+01,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...",{HP:0001249},"{HP:0045025, HP:0004322, HP:0000293, HP:000135..."
OMIM:154400,OMIM:154400,"Acrofacial dysostosis 1, Nager type",83,0.041667,1,0.012048,0.041667,1.692809,0.000000,0.000653,1.204819e-02,3.853012e+01,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...",{HP:0001249},"{HP:0009601, HP:0004322, HP:0010034, HP:000040..."
OMIM:305600,OMIM:305600,Focal dermal hypoplasia,89,0.041667,1,0.011236,0.041667,1.692809,0.000000,0.000653,1.123596e-02,3.593258e+01,"{HP:0001251, HP:0001272, HP:0002518, HP:000201...",{HP:0001249},"{HP:0000455, HP:0000073, HP:0000307, HP:000432..."


In [None]:
# from ast import literal_eval

# # Step 2: Parse the hp.obo file to extract HPO code -> description mapping
# hpo_mapping = {}
# current_id = None
# current_name = None

# with open(base_path / "hp.obo", "r", encoding="utf-8") as f:
#     for line in f:
#         line = line.strip()
#         if line.startswith("[Term]"):  # New HPO entry starts
#             current_id = None
#             current_name = None
#         elif line.startswith("id: HP:"):
#             current_id = line.split(": ")[1]  # Extract HPO ID
#         elif line.startswith("name:") and current_id:
#             current_name = line.split(": ")[1]  # Extract HPO name
#             hpo_mapping[current_id] = current_name  # Store in dictionary

# remaining_terms = set()
# for i, row in cases_remaining.iterrows():
#     hpo_terms = literal_eval(row['patient_hpo_terms'])
#     for term in hpo_terms:
#         remaining_terms.add(hpo_mapping.get(term, "Unknown term"))

# sample_terms = set()
# for term in atypical_terms:
#     sample_terms.add(hpo_mapping.get(term, "Unknown term"))

# from matplotlib_venn import venn2
# import matplotlib.pyplot as plt

# # Create Venn diagram
# venn2([sample_terms, remaining_terms], set_labels=('Atypical profile terms', 'Remaining cases terms'))

# # Show plot
# plt.title(f"Intersection of Atypical profile terms and Remaining cases terms in {name}")
# plt.show()

In [None]:
# # Bigger figure size for more space
# plt.figure(figsize=(14, 14))

# v = venn2([sample_terms, remaining_terms], set_labels=('Atypical profile terms', 'Remaining cases terms'))

# # Function to set small text with wrapping
# def set_label(region_id, terms):
#     if v.get_label_by_id(region_id):  # region might not exist
#         label_text = "\n".join(sorted(terms))
#         v.get_label_by_id(region_id).set_text(label_text)
#         v.get_label_by_id(region_id).set_fontsize(6)  # smaller font

# # Replace labels with actual HPO terms
# set_label('10', sample_terms - remaining_terms)       # Only in A
# set_label('01', remaining_terms - sample_terms)       # Only in B
# set_label('11', sample_terms & remaining_terms)       # Intersection

# # Make circle labels bigger too
# for label in v.set_labels:
#     label.set_fontsize(14)

# plt.title(f"Intersection of Atypical profile terms and Remaining cases terms in {name}")
# plt.show()

In [None]:
# classical_terms = set()
# for term in classical_hpos:
#     classical_terms.add(hpo_mapping.get(term, "Unknown term"))

In [None]:
# # Bigger figure size for more space
# plt.figure(figsize=(14, 14))

# v = venn2([classical_terms, remaining_terms], set_labels=('Classical profile terms', 'Remaining cases terms'))

# # Function to set small text with wrapping
# def set_label(region_id, terms):
#     if v.get_label_by_id(region_id):  # region might not exist
#         label_text = "\n".join(sorted(terms))
#         v.get_label_by_id(region_id).set_text(label_text)
#         v.get_label_by_id(region_id).set_fontsize(6)  # smaller font

# # Replace labels with actual HPO terms
# set_label('10', classical_terms - remaining_terms)       # Only in A
# set_label('01', remaining_terms - classical_terms)       # Only in B
# set_label('11', classical_terms & remaining_terms)       # Intersection

# # Make circle labels bigger too
# for label in v.set_labels:
#     label.set_fontsize(14)

# plt.title(f"Intersection of Classical profile terms and Remaining cases terms in {name}")
# plt.show()

In [None]:
# from matplotlib import pyplot as plt
# from matplotlib.patches import Circle
# import random

# # ------------------------  PARAMETERS ------------------------
# random.seed(123)
# fontsize = 6            # Text size in points
# attempts_per_label = 400    # How hard we try before giving up

# # Example sets (replace with your own)
# A = sample_terms
# B = remaining_terms

# left_only  = list(A - B)
# right_only = list(B - A)
# both       = list(A & B)

# # ------------------------  GEOMETRY --------------------------
# r = 1.25
# centerA = (-0.6, 0)
# centerB = ( 0.6, 0)

# def in_left(x, y):
#     dA = (x - centerA[0])**2 + (y - centerA[1])**2
#     return dA <= r**2

# def in_right(x, y):
#     dB = (x - centerB[0])**2 + (y - centerB[1])**2
#     return dB <= r**2

# # Helper booleans for *regions*
# def inside_left_only(coords):
#     return in_left(*coords) and not in_right(*coords)

# def inside_right_only(coords):
#     return in_right(*coords) and not in_left(*coords)

# def inside_both(coords):
#     return in_left(*coords) and in_right(*coords)

# region_funcs = {
#     "left":  inside_left_only,
#     "both":  inside_both,
#     "right": inside_right_only,
# }

# # --------------------  FIGURE & AXES -------------------------
# fig, ax = plt.subplots(figsize=(14, 14))

# circleA = Circle(centerA, r, alpha=0.3)
# circleB = Circle(centerB, r, alpha=0.3)
# ax.add_patch(circleA)
# ax.add_patch(circleB)
# ax.set_xlim(-2, 2)
# ax.set_ylim(-1.6, 1.6)
# ax.set_aspect('equal')
# ax.axis('off')

# # Because we'll create many temporary text objects to measure size,
# # pre‑draw the canvas once so renderer is available
# fig.canvas.draw()
# renderer = fig.canvas.get_renderer()

# # -----------------  BOUNDING‑BOX UTILITIES ------------------
# def get_bbox_for(label, x, y):
#     """Return bbox (minx, miny, maxx, maxy) in data coords for text at (x,y)."""
#     txt = ax.text(x, y, label, ha='center', va='center', fontsize=fontsize, alpha=0)
#     fig.canvas.draw()          # need to draw to update position
#     bbox = txt.get_window_extent(renderer=renderer)
#     # transform to data coordinates
#     inv = ax.transData.inverted()
#     (x0, y0), (x1, y1) = inv.transform([[bbox.x0, bbox.y0], [bbox.x1, bbox.y1]])
#     txt.remove()               # tidy invisible text
#     return (x0, y0, x1, y1)

# def bboxes_overlap(bb1, bb2, pad=0.02):
#     """Check if two rectangles intersect (with small padding)."""
#     return (bb1[0]-pad) < (bb2[2]+pad) and (bb1[2]+pad) > (bb2[0]-pad) \
#        and (bb1[1]-pad) < (bb2[3]+pad) and (bb1[3]+pad) > (bb2[1]-pad)

# def bbox_inside_region(bbox, region_key):
#     """All 4 corners must satisfy region membership test."""
#     x0, y0, x1, y1 = bbox
#     xs = [x0, x1]
#     ys = [y0, y1]
#     func = region_funcs[region_key]
#     return all(func((x, y)) for x in xs for y in ys)

# # --------------------  PLACE THE LABELS ---------------------
# placed_bboxes = []

# def place_labels(labels, region_key):
#     for label in labels:
#         placed = False
#         for _ in range(attempts_per_label):
#             # Generate candidate inside a generous bounding square, then test
#             x = random.uniform(-2, 2)
#             y = random.uniform(-1.5, 1.5)
#             if not region_funcs[region_key]((x, y)):
#                 continue
#             bbox = get_bbox_for(label, x, y)
#             if not bbox_inside_region(bbox, region_key):
#                 continue
#             if any(bboxes_overlap(bbox, prev) for prev in placed_bboxes):
#                 continue
#             # Accept this position
#             ax.text(x, y, label, ha='center', va='center', fontsize=fontsize)
#             placed_bboxes.append(bbox)
#             placed = True
#             break
#         if not placed:
#             # Fallback: stack near region centroid
#             cx, cy = {
#                 "left":  centerA,
#                 "both":  ((centerA[0]+centerB[0])/2, 0),
#                 "right": centerB,
#             }[region_key]
#             delta_y = -0.25 * sum(l == label for l in labels)  # stagger a bit
#             ax.text(cx, cy + delta_y, label, ha='center', va='center', fontsize=fontsize)
#             # Bounding box calculation for fallback
#             bbox = get_bbox_for(label, cx, cy + delta_y)
#             placed_bboxes.append(bbox)

# # Place each category
# place_labels(left_only,  "left")
# place_labels(both,       "both")
# place_labels(right_only, "right")

# plt.title("Labels fully contained in each Venn region")
# plt.show()

In [None]:
# client = chromadb.PersistentClient(path=str(base_path / 'synthetic_patients'))   # e.g. "./hpo_synonym_db"

# profile_col = client.get_or_create_collection(
#     name="synthetic_disease_profiles_v1",
#     metadata={                      # ‑‑ optional but useful knobs
#         "hnsw:space":          "cosine",   # metric
#         "hnsw:M":              32,         # graph degree   (↑recall, ↑RAM)
#         "hnsw:construction_ef":200,        # insert breadth (↑quality, ↑CPU)
#         "hnsw:search_ef":      64          # query breadth  (↑recall, ↑latency)
#     }
# )

In [None]:
# from ast import literal_eval
# import math, numpy as np, json, itertools

# # ---------- 1. figure out where to start -----------------------------
# START_ID = profile_col.count()          # e.g. 12 345 existing vectors
# id_iter  = (str(i) for i in itertools.count(START_ID + 2))   # "12345", "12346", …

# # ---------- 2. unchanged helper --------------------------------------
# DIM = len(profile_col.peek()["embeddings"][0])

# def clean_embedding(cell):
#     if isinstance(cell, str):
#         cell = literal_eval(cell)
#     if isinstance(cell, np.ndarray):
#         cell = cell.astype(np.float32).tolist()
#     if len(cell) != DIM:
#         raise ValueError(f"dim={len(cell)} ≠ expected {DIM}")
#     if not all(math.isfinite(x) for x in cell):
#         raise ValueError("NaN or ±inf detected")
#     return [float(x) for x in cell]

# # ---------- 3. streaming ingest with fresh IDs -----------------------
# BATCH = 128
# cache = {"ids": [], "embs": [], "docs": [], "meta": []}

# for _, row in cases_sample.iterrows():
#     try:
#         vec = clean_embedding(row["Embeddings"])
#     except Exception as err:
#         print(f"Row skipped → {err}")
#         continue

#     cache["ids"].append(next(id_iter))             # ← NEW sequential ID
#     cache["embs"].append(vec)
#     cache["docs"].append(row["correct_diagnosis_name"])
#     cache["meta"].append({
#         "Disease":  row["correct_diagnosis_name"],
#         "HPO_IDs":  json.dumps(list(row["HPO_terms"])),
#     })

#     if len(cache["ids"]) == BATCH:
#         profile_col.add(
#             ids        = cache["ids"],
#             embeddings = cache["embs"],
#             documents  = cache["docs"],
#             metadatas  = cache["meta"],
#         )
#         cache = {k: [] for k in cache}

# if cache["ids"]:
#     profile_col.add(
#         ids        = cache["ids"],
#         embeddings = cache["embs"],
#         documents  = cache["docs"],
#         metadatas  = cache["meta"],
#     )

In [None]:
# profile_col.delete(ids=cache["ids"])            # ← removes only those
# print("Collection size:", profile_col.count())  # confirm it shrank

In [None]:
# print("Collection size:", profile_col.count())   # should be old + new

# # peek at the very last ID we just generated
# last_id = str(profile_col.count() - 1)
# print("Newest ID should be:", last_id)
# view = profile_col.get(ids=[last_id], include=["documents", "metadatas"])
# print(view["ids"][0], view["documents"][0])

In [None]:
# print(f"{name} cases:")
# for i in range(len(cases_remaining)):

#     patient_hpos     = cases_remaining.iloc[i]['HPO_terms']  # list of lists of HPO IDs
#     patient_idx      = cases_remaining.iloc[i]['file_id']
#     query_embedding  = aggregate_embeddings_average(patient_hpos, collection)

#     if isinstance(query_embedding, np.ndarray):
#         query_embedding = query_embedding.astype(float).tolist()   # or .astype(np.float32)

#     results = profile_col.query(
#         query_embeddings=[query_embedding],      # ← list[list[float]]
#         n_results=4000,
#         include=["documents", "metadatas", "distances"],
#     )

#     pred_df = predict_disease_from_hpo(patient_hpos,
#                              graph,
#                              ic_dict,
#                              OMIM_or_ORPHA='OMIM',
#                              semantic_similarity=True,
#                              pot_diseases=list(set(results["documents"][0][:4000])),
#                              weighted_score_active=True,
#                              neg_hpos=None, 
#                              extra_diseases=atypical
#                              )
    
#     row_num_atypical = pred_df.reset_index(drop=True).index[pred_df["ID"] == f"{name} atypical"]
#     row_num_classical = pred_df.reset_index(drop=True).index[pred_df["ID"] == omim]

#     if not row_num_atypical.empty:
#         if row_num_classical.empty:
#             print(f"Patient {patient_idx:<3} | classical → atypical rank : "
#                 f"{'Not Found':>10} → {row_num_atypical[0] + 1:<3}")
#         else:
#             print(f"Patient {patient_idx:<3} | classical → atypical rank : "
#                 f"{row_num_classical[0] + 1:<3} → {row_num_atypical[0] + 1:<3}")
#     else:
#         print(f"Patient {patient_idx:<3} | classical → atypical rank : "
#             f"{'Value not found':>20}")

#     # Targets to find
#     # target_classical = f"{name}"
#     target_classical = f"Galactosemia"
#     target_atypical  = f"{name} atypical"

#     # Init: Not found yet
#     idx_classical = idx_atypical = None

#     # Search through the ranked list
#     for j, disease in enumerate(results["documents"][0]):
#         if idx_classical is None and disease == target_classical:
#             idx_classical = j
#         if idx_atypical is None and disease == target_atypical:
#             idx_atypical = j

#         # Stop early if both found
#         if idx_classical is not None and idx_atypical is not None:
#             break

#     # Print nicely
#     if idx_classical is not None or idx_atypical is not None:
#         # print(f"Patient {patient_idx}:")
#         if idx_classical is not None:
#             dist_classical = results["distances"][0][idx_classical]
#             print(f"  • {target_classical:<40} index={idx_classical+1:<3} distance={dist_classical:.4f}")
#         else:
#             print(f"  • {target_classical:<40} Not Found")

#         if idx_atypical is not None:
#             dist_atypical = results["distances"][0][idx_atypical]
#             print(f"  • {target_atypical:<40} index={idx_atypical+1:<3} distance={dist_atypical:.4f}")
#         else:
#             print(f"  • {target_atypical:<40} Not Found")
    # else:
    #     print(f"Patient {patient_idx}: Neither disease found")

In [None]:
# Patient 375 | classical → atypical rank : 4   → 2  
# Patient 377 | classical → atypical rank : 9   → 1  
# Patient 379 | classical → atypical rank : 3   → 1  
# Patient 380 | classical → atypical rank : 383 → 342
# Patient 382 | classical → atypical rank : 4   → 1  
# Patient 383 | classical → atypical rank : 140 → 14 
# Patient 385 | classical → atypical rank : 128 → 1  
# Patient 387 | classical → atypical rank : 111 → 2  
# Patient 391 | classical → atypical rank : 14  → 1