We found hundreds of variants in the ClinVar database that potentially effect CTF. 
We want to see whether specific conditions are enriched for that group. 


Note: we use the openai API in this notebook. Because of enviroment incompatibilities it should be run in an enviroment compatible with openAI

In [84]:
import pickle
import pandas as pd
import numpy as np
import openai
from tqdm import tqdm
import json
from scipy.stats import hypergeom
import os


In [2]:
def merge_2dicts(dict1: dict, dict2: dict) -> dict:
    ''' 
    This function merges two dictionaries to a single one. Combine the values to a list for overlaping keys
    '''
    # Merge dictionaries with values combined into a list
    merged_dict = {}

    # Update values for keys in dict1
    for key, value in dict1.items():
        if not isinstance(value, list):
            value = [value]
        dict2_value = dict2.get(key, [])
        if not isinstance(dict2_value, list):
            dict2_value = [dict2_value]
        merged_dict[key] = value + dict2_value

    # Update values for keys in dict2 that are not in dict1
    for key, value in dict2.items():
        if key not in dict1:
            if not isinstance(value, list):
                value = [value]
            merged_dict[key] = value

    return merged_dict

In [4]:
''' Load the ClinVar df '''

clinvar_df = pd.read_pickle("../Data/ClinVar/ClinVar_processed_df.pickle")
clinvar_df = clinvar_df[~clinvar_df["Protein change"].isna()].copy() #keep only CDS variants
clinvar_df = clinvar_df[clinvar_df["Variant_Type"] == "SNP"].copy()

In [5]:
clinvar_df.shape

(89465, 30)

In [22]:
''' Get all unique conditions on ClinVar '''
conditions = clinvar_df["Condition(s)"].unique()


In [43]:
''' 
We have a long list of conditions. We want to cluster them to large categories such as cancer, heart disease, mental illnesses, developmental diseases, etc. 
We use the API of chatGPT to perform this clustering 
'''

classification_dict = {} #initilize results dict

# Split the list into batches of 500 names for chatGPT to handle. 
batch_size = 500
condition_batches = [conditions[i:i+batch_size] for i in range(0, len(conditions), batch_size)]

# Use chatgpt4 to classify the conditions to categories
key = os.environ["OPENAI_API_KEY"]
client = openai.OpenAI(api_key=key)

#first batch

completion = client.chat.completions.create(
  model="gpt-4o",
  response_format={ "type": "json_object" },
  messages=[
    {"role": "system", "content": "You are an experienced doctor."},
    {"role": "user", "content": f"I am going to send batches of medical conditions in several following messages. Classify the following medical conditions into appropriate categories. Add an 'others' category for the ones you are unsure of. Generate new categories if necessary based on the conditions listed. This is the first batch: {condition_batches[0]}. Return a json, the keys are the categories and the values are the conditions."}])

dict_current_batch = json.loads(completion.choices[0].message.content)
classification_dict = merge_2dicts(classification_dict, dict_current_batch)

# all other batches: 

previous_message = completion.choices[0].message.content #the previous answer of the chat needs to be fed to it
for num_batch, batch in tqdm(enumerate(condition_batches[1:])):
    
    try:
    
        completion = client.chat.completions.create(
          model="gpt-4o",
          response_format={ "type": "json_object" },
          messages=[
            {"role": "user", "content": f"Again, classify the following medical conditions into appropriate categories. Generate new categories if necessary based on the conditions listed. This is the next batch: {batch}. Return a json, the keys are the categories and the values are the conditions."}, 
            {"role": "assistant", "content": f"This is your previous answer: {previous_message}"}])

        #save the message content to avoid running again everytime there is an issue
        with open(f'../Results/classified_clinvar_conditions_batch{num_batch}.pickle', 'wb') as handle:
            pickle.dump(completion.choices[0].message.content, handle)

        dict_current_batch = json.loads(completion.choices[0].message.content)
        classification_dict = merge_2dicts(classification_dict, dict_current_batch)
    except: 
        print(f"There was a problem with batch {num_batch}")


29it [20:49, 43.08s/it]


In [46]:
with open('../Results/classified_clinvar_conditions.pickle', 'wb') as handle:
    pickle.dump(classification_dict, handle)


In [113]:
''' Because the chat can't see the categories it created for the previous batches, each round it might created different categories with similar meanings. 
Here we cluster them as well ''' 


def merge_categories(original_dict: dict, merge_map: dict) -> dict:
    merged_dict = {}

    # Create a reverse mapping from each original category to its new category
    reverse_merge_map = {}
    for new_cat, old_cats in merge_map.items():
        for old_cat in old_cats:
            reverse_merge_map[old_cat] = new_cat

    # Merge the original dictionary according to the reverse mapping
    for category, conditions in original_dict.items():
        new_category = reverse_merge_map.get(category, category)
        if new_category not in merged_dict:
            merged_dict[new_category] = []
        merged_dict[new_category].extend(conditions)

    # Remove duplicates in each category's condition list
    for category in merged_dict:
        merged_dict[category] = list(set(merged_dict[category]))

    return merged_dict


merge_map = {
    "Genetic Disorders": ["Epileptic Disorders", "Genomic Disorders", "Genetic Epilepsy", "Inborn Genetic Conditions", "Genetic Disorders", "Genetic Syndromes", "Genetic Polymorphisms", "Inborn Genetic Diseases", "Inborn Genetic Disorders and Syndromes", "Hereditary Disorders", "Other Genetic Disorders", "Genetic Eye Disorders", "Genetic Bone and Skeletal Disorders", "Genetic Conditions", "Genetic and Inherited Conditions", "Other Genetic Diseases", "Genetic Syndromes and Disorders", "Inherited Disorder"],
    "Neurological Disorders": ["Neurological Diseases", "Epilepsy", "Muscle Disorders and Neurological Disorders", "Genetic Epilepsy", "Leukodystrophies and Neurodegenerations", "Epilepsies and Seizure Disorders", "Neurological Disorders", "Neurodevelopmental Disorders", "Neurodegenerative Disorders", "Neuromuscular Disorders", "Neurological and Cerebral Disorders", "Neurological and Developmental Disorders", "Neurodevelopmental Disorders and Syndromes", "Neurological/Neurodegenerative Diseases", "Epileptic and Seizure Disorders", "Epilepsy and Seizure Disorders", "Epilepsy-related Disorders", "Peripheral Neuropathies and Spastic Paraplegias"],
    "Muscular Disorders": ["Muscle and Skeletal Disorders", "Muscle Disorders and Neurological Disorders", "Muscular Disorders", "Myopathies", "Muscle Diseases", "Myopathies and Muscular Disorders", "Neuromuscular Disorders"],
    "Immunological Disorders": ["Autoimmune Diseases", "Auto Immuno Disorders", "Infections and Immune Disorders", "Immunodeficiency Disorders", "Immunological Disorders", "Autoimmune Disorders", "Autoimmune and Immunodeficiency Conditions", "Inflammatory Disorders", "Inflammatory and Autoimmune Diseases", "Immune System Disorders", "Immune Disorders"],
    "Bone and Skeletal Disorders": ["Skeletal Abnormalities", "Connective Tissue Disorders and Syndromes", "Craniofacial and Skeletal Disorders", "Connective Tissue and Bone Disorders", "Bone Disorders", "Bone and Skeletal Disorders", "Bone and Joint Disorders", "Bone and Growth Disorders", "Skeletal Dysplasias", "Skeletal Dysplasia", "Skeletal Disorders", "Bone and Cartilage Disorders", "Bone and Skeletal Conditions"],
    "Cardiovascular Disorders": ["Heart-Related Disorders", "Cardiological Disorders", "Cardiovascular Disorders", "Cardiovascular Conditions", "Cardiovascular Diseases", "Vascular and Cardiac Conditions", "Cardiac Disorders", "Cardiomyopathies"],
    "Blood Disorders": ["Vascular and Blood Disorders", "Vascular Disorders", "Blood Disorders", "Hematological Disorders", "Hemoglobin Disorders", "Hematological Conditions", "Blood and Circulatory Disorders", "Blood and Coagulation Disorders", "Blood-related Disorders", "Bleeding and Blood Disorders", "Hemostatic Disorders"],
    "Metabolic Disorders": ["Inherited Metabolic and Endocrine Disorders", "Metabolic Conditions", "Metabolic Disorders", "Endocrine and Metabolic Disorders", "Metabolic and Endocrine Disorders", "Endocrine and Metabolic Conditions", "Diabetes and Metabolic Disorders", "Complex Metabolic Disorders", "Metabolic and Mitochondrial Disorders", "Diseases of Protein Metabolism"],
    "Ophthalmic Disorders": ["Ophthalmic Disorders", "Eye Disorders", "Vision Disorders", "Visual Disorders", "Visual Pathologies", "Corneal and Ophthalmic Disorders", "Vision and Eye Disorders", "Ocular Disorders", "Ocular Abnormalities"],
    "Oncological Disorders": ["Cancer and Predisposing Syndromes", "Cancer", "Oncology", "Oncological Disorders", "Cancer-Related Disorders", "Cancer and Neoplastic Disorders", "Cancer Predisposition Syndromes", "Cancer Predisposition Disorders", "Cancer and Tumor Disorders", "Cancers and Tumors", "Cancers and Pre-Cancerous Conditions", "Tumors and Cancer-Related Syndromes", "Hereditary Cancer-Prone Disorders", "Cancer Disorders", "Cancer and Tumor Predisposition", "Cancer-Predisposing Syndromes"],
    "Endocrinological Disorders": ["Inherited Metabolic and Endocrine Disorders", "Reproductive and Endocrine Disorders", "Endocrinological Disorders", "Endocrine Disorders", "Endocrine and Metabolic Disorders", "Endocrine and Hormonal Disorders", "Endocrine Disorders and Syndromes", "Endocrine Gland Disorders"],
    "Respiratory Disorders": ["Respiratory Diseases", "Respiratory Disorders", "Pulmonary Conditions", "Respiratory Conditions", "Pulmonary and Immune Disorders", "Lung Disorders"],
    "Developmental Disorders": ["Developmental Disorders", "Developmental Conditions", "Developmental and Intellectual Disabilities", "Developmental Delay"],
    "Liver Disorders": ["Liver Disorders", "Digestive and Liver Disorders", "Liver and Metabolic Conditions"],
    "Renal Disorders": ["Nephrological Disorders", "Renal Disorders", "Kidney Disorders", "Renal and Urinary Disorders", "Kidney and Urinary Disorders", "Kidney and Urinary Tract Disorders", "Kidney and Urological Disorders"],
    "Musculoskeletal Disorders": ["Musculoskeletal Disorders", "Joint and Muscle Disorders", "Musculoskeletal Conditions"],
    "Miscellaneous Conditions": ["Miscellaneous Conditions", "Miscellaneous Disorders", "Other Conditions", "Miscellaneous Genetic Disorders", "Miscellaneous Autism Spectrum Disorders", "Miscellaneous"],
    "Congenital Syndromes": ["Congenital Anomalies", "Congenital Disorders and Syndromes", "Congenital Disorders", "Congenital Defects"], "Other Disorders": ["Not Provided", "Unknown", "Unspecified Categories", "Undetermined Disorders"], 
    "Hearing Disorders": ["Deafness and Hearing Disorders", "Hearing and Balance Disorders", "Syndromic Hearing Loss", "Hearing Loss and Sensory Disorders", "Hearing Loss Disorders", "Hearing & Balance Disorders"],
    "Brain Disorders": ["Intellectual Disabilities and Developmental Disorders", "Intellectual Disability Disorders", "Microcephaly and Brain Structure Disorders", "Intellectual Disabilities and Brain Disorders"],
    "Skin Disorders":["Skin and Hair Conditions", "Dermatological Disorders", "Dermatological Conditions", "Skin Disorders"], "Diabetes Mellitus": ["Diabetes", "Hyperinsulinism and Hypoglycemia"]
}

merged_dict = merge_categories(classification_dict, merge_map)


In [115]:
with open('../Results/classified_clinvar_conditions_clustered.pickle', 'wb') as handle:
    pickle.dump(merged_dict, handle)


In [116]:
''' Check for enrichments of certain categories in the extreme df relative to the entire clinvar df. 
remember to correct for FDR '''


def check_category(row, category_conditions):
    '''
    Function to check if any condition from a category is in the row
    '''
    conditions = row['Condition(s)'].lower().split(',') #there could be more than a single condition the variant is related with
    return any(any(cond.lower() in condition for condition in conditions) for cond in category_conditions)

def count_category_occurrences(df, categories_dict):
    '''
    Function to count category occurrences
    '''
    category_counts = {}
    
    for category, conditions in categories_dict.items():
        category_counts[category] = df[df['Condition(s)'].apply(lambda x: check_category(pd.Series({'Condition(s)': x}), conditions))].shape[0]
    
    return category_counts

# Count occurrences in the entire clinvar ds
counts_clinvar = count_category_occurrences(clinvar_df, merged_dict)


In [117]:
# Count occurrences in the extreme ds

extreme_df = pd.read_csv("../Results/clinvar_extreme_mfe.txt")
counts_extreme = count_category_occurrences(extreme_df, merged_dict)


In [118]:
#save both
with open('../Results/counts_clinvar.pickle', 'wb') as handle:
    pickle.dump(counts_clinvar, handle)

with open('../Results/counts_extreme.pickle', 'wb') as handle:
    pickle.dump(counts_extreme, handle)


In [119]:
''' Check for enrichment '''

enrichment_df = pd.DataFrame(columns = ["Enrichment", "p-value"])

num_variants = clinvar_df.shape[0]
num_extreme_variants = extreme_df.shape[0]

for category in merged_dict.keys():
    try:
    
        num_variants_in_category = counts_clinvar[category]
        num_extreme_variants_in_category = counts_extreme[category]

        enrichment = np.round((num_extreme_variants_in_category / num_extreme_variants) / (num_variants_in_category / num_variants),2)
        p_value = hypergeom.sf(num_extreme_variants_in_category - 1, num_variants, num_variants_in_category, num_extreme_variants)

        enrichment_df.loc[category, "Enrichment"] = enrichment
        enrichment_df.loc[category, "p-value"] = p_value
    except:
        pass

In [125]:
enrichment_df.sort_values(by = "p-value")[:10]

Unnamed: 0,Enrichment,p-value
Syndromes with Global Spectrum,4.03,0.039254
Systemic Disorders,22.73,0.043121
Congenital Syndromes,1.51,0.057444
Connective Tissue Disorders,1.46,0.064802
Cardiovascular Disorders,1.43,0.069455
Ceroid Lipofuscinosis,12.99,0.074241
Immunological Disorders,1.67,0.081501
Diabetes Mellitus,2.63,0.107169
Oncological Disorders,1.26,0.112036
Developmental Disorders,1.36,0.128171
