In [1]:
from IPython.display import display, Markdown
import pandas as pd

### SNOMED methods example

In [2]:
display(Markdown("""
## Begin

Ensure the methods are on path
"""))

import os, sys
sys.path.insert(0,'/home/aliencat/samora/gloabl_files')
sys.path.insert(0,'/data/AS/Samora/gloabl_files')
sys.path.insert(0,'/home/jovyan/work/gloabl_files')
sys.path.insert(0, '/home/cogstack/samora/_data/gloabl_files')


## Begin

Ensure the methods are on path


In [3]:
from snomed_methods import snomed_methods_v1

display(Markdown("""

Import module

"""))



Import module



Ensure the rf2 snomed files are in the folder specified in snomed_methods_v1.py

Ensure medcat path is set if using medcat and your dev environment is set (defaults to dh-cap02)




In [4]:
snomed_relations_obj = snomed_methods_v1.snomed_relations(medcat=True)


display(Markdown("""

Initialise the snomed methods object

"""))

  from tqdm.autonotebook import tqdm, trange




Initialise the snomed methods object



In [5]:
display(Markdown("""

Define your starting point SNOMED cui code.
"""))

outcome_variable_cui_for_filter = '399187006'  # HFE

print(outcome_variable_cui_for_filter)




Define your starting point SNOMED cui code.


399187006


In [6]:
filter_root_cui = outcome_variable_cui_for_filter
print(filter_root_cui)

399187006


#### Spreading from starting SNOMED code we have found more related codes in the SNOMED tree:

In [7]:
retrieved_codes_snomed_tree, retrieved_names_snomed_tree = snomed_relations_obj.recursive_code_expansion(filter_root_cui, n_recursion = 10, debug=False)

display(Markdown("""

n_recursion is the number of cycles of searching for a codes parents and children, then appending them to a set and searching for each of these codes parent/children.
Higher recursion, more exploration, more codes, higher odds of unrelated concepts being returned.

"""))

Retrieving 399187006 with recursion 10


100%|██████████| 10/10 [00:00<00:00, 11.74it/s]




n_recursion is the number of cycles of searching for a codes parents and children, then appending them to a set and searching for each of these codes parent/children.
Higher recursion, more exploration, more codes, higher odds of unrelated concepts being returned.



In [8]:
retrieved_codes_snomed_tree[0:5], len(retrieved_codes_snomed_tree), len(retrieved_names_snomed_tree)

([66576001, 6160004, 401119001, 143101000119101, 399187006], 21, 20)

In [9]:
retrieved_names_snomed_tree[0:3]

display(Markdown("""

Lets examine some of the identified codes names. 
"""))



Lets examine some of the identified codes names. 


In [10]:
retrieved_codes_snomed_tree[0:3]

[66576001, 6160004, 401119001]

#### Lets try an additional method to find related codes.

Here we will attempt to get related codes from medcats' concept databases context similarity. In other words, what concepts occurred in a similar context in the training data for our CDB. **This method may not work if the concept did not receive training in the inital base model. This is because the concept does not have a context vector(s).

In [11]:
retrieved_codes_medcat_cdb, retrieved_names_medcat_cdb  = snomed_relations_obj.get_medcat_cdb_most_similar(filter_root_cui, context_type = 'xxxlong', type_id_filter=[], topn=50)

In [12]:
retrieved_names_medcat_cdb[0:5]

['Hemochromatosis (disorder)',
 'Hereditary hemochromatosis (disorder)',
 'Juvenile hemochromatosis (disorder)',
 'Hereditary spherocytosis (disorder)',
 'Spherocytosis (finding)']

## Produce outputs in batches:

In [13]:
ronnie_code_list = [
    700065003, 471885006, 890122001, 890119003, 871638006,
    890121008, 871649000, 890120009, 472316006, 45227007,
    195020003, 83978005, 63183009, 1204194004, 428163005,
    840303004, 840304005, 840305006, 880052005, 735686002,
    95281009, 26636000
]

# Convert the list to a comma-separated string
ronnie_code_str = ','.join(map(str, ronnie_code_list)).split(",")

# Create a new list with integers
ronnie_code_int = list(map(int, ronnie_code_list))

print("String representation:", ronnie_code_str)
print("Integer list representation:", ronnie_code_int)

target_code = ronnie_code_str[0]
target_code


String representation: ['700065003', '471885006', '890122001', '890119003', '871638006', '890121008', '871649000', '890120009', '472316006', '45227007', '195020003', '83978005', '63183009', '1204194004', '428163005', '840303004', '840304005', '840305006', '880052005', '735686002', '95281009', '26636000']
Integer list representation: [700065003, 471885006, 890122001, 890119003, 871638006, 890121008, 871649000, 890120009, 472316006, 45227007, 195020003, 83978005, 63183009, 1204194004, 428163005, 840303004, 840304005, 840305006, 880052005, 735686002, 95281009, 26636000]


'700065003'

In [14]:


def create_dataframe_snomed(input_codes):
    # Assuming snomed_relations_obj and other necessary objects are defined

    # Create an empty DataFrame
    df = pd.DataFrame(columns=['filter_root_cui', 'retrieved_codes', 'retrieved_names'])

    for i in range(len(input_codes)):
        filter_root_cui = input_codes[i]

        # Assuming snomed_relations_obj is defined
        retrieved_codes_snomed_tree, retrieved_names_snomed_tree = snomed_relations_obj.recursive_code_expansion(filter_root_cui, n_recursion=20, debug=False)

        # Append the results to the DataFrame
        df = df.append({'filter_root_cui': filter_root_cui,
                        'retrieved_codes': retrieved_codes_snomed_tree,
                        'retrieved_names': retrieved_names_snomed_tree},
                       ignore_index=True)
        
        df['retrieved_codes_count'] = df['retrieved_codes'].apply(lambda x: len(x))

        

    return df


result_df = create_dataframe_snomed(ronnie_code_str)
result_df.to_csv('result_snomed.csv')
result_df


Retrieving 700065003 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 52.00it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 471885006 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 111.04it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 890122001 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 242.61it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 890119003 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 260.40it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 871638006 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 265.21it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 890121008 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 263.52it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 871649000 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 264.34it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 890120009 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 265.55it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 472316006 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 71.33it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 45227007 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 55.85it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 195020003 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 118.84it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 83978005 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 117.42it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 63183009 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 119.03it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 1204194004 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 266.88it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 428163005 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 119.55it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 840303004 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 264.83it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 840304005 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 268.13it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 840305006 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 268.28it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 880052005 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 264.73it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 735686002 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 119.52it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 95281009 with recursion 20


100%|██████████| 20/20 [00:00<00:00, 56.94it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Retrieving 26636000 with recursion 20


100%|██████████| 20/20 [00:01<00:00, 13.12it/s]
  df = df.append({'filter_root_cui': filter_root_cui,


Unnamed: 0,filter_root_cui,retrieved_codes,retrieved_names,retrieved_codes_count
0,700065003,"[63183009, 700065003, 83978005, 700065003]",[Primary hypertrophic cardiomyopathy (disorder...,4
1,471885006,"[471885006, 471885006]",[Hypertrophic cardiomyopathy with genetic mark...,2
2,890122001,[890122001],[],1
3,890119003,[890119003],[],1
4,871638006,[871638006],[],1
5,890121008,[890121008],[],1
6,871649000,[871649000],[],1
7,890120009,[890120009],[],1
8,472316006,"[472318007, 472316006, 472316006]",[Hypertrophic mitochondrial cardiomyopathy (di...,3
9,45227007,"[63183009, 45227007, 440074009, 45227007]",[Hypertrophic obstructive cardiomyopathy (diso...,4


In [15]:


def create_dataframe_medcat(input_codes):
    # Assuming snomed_relations_obj and other necessary objects are defined

    # Create an empty DataFrame
    df = pd.DataFrame(columns=['filter_root_cui', 'retrieved_codes', 'retrieved_names'])

    for i in range(len(input_codes)):
        filter_root_cui = input_codes[i]

        # Assuming snomed_relations_obj is defined
        retrieved_codes_medcat_cdb, retrieved_names_medcat_cdb  = snomed_relations_obj.get_medcat_cdb_most_similar(filter_root_cui, context_type = 'xxxlong', type_id_filter=[], topn=50)

        # Append the results to the DataFrame
        df = df.append({'filter_root_cui': filter_root_cui,
                        'retrieved_codes': retrieved_codes_medcat_cdb,
                        'retrieved_names': retrieved_names_medcat_cdb},
                       ignore_index=True)
        
        df['retrieved_codes_count'] = df['retrieved_codes'].apply(lambda x: len(x))

        

    return df


result_df = create_dataframe_medcat(ronnie_code_str)
result_df.to_csv('result_medcat_50.csv')
result_df


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


'xxxlong'
'890122001'
'890119003'


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


'871638006'
'890121008'
'871649000'


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


'890120009'
'xxxlong'


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


'xxxlong'
'1204194004'


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


'840303004'
'840304005'
'840305006'


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


'880052005'


  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,
  df = df.append({'filter_root_cui': filter_root_cui,


Unnamed: 0,filter_root_cui,retrieved_codes,retrieved_names,retrieved_codes_count
0,700065003,"[700065003, 233873004, 85898001, 45227007, 894...",[Primary hypertrophic cardiomyopathy (disorder...,50
1,471885006,[],[],0
2,890122001,[],[],0
3,890119003,[],[],0
4,871638006,[],[],0
5,890121008,[],[],0
6,871649000,[],[],0
7,890120009,[],[],0
8,472316006,[],[],0
9,45227007,"[45227007, 195020003, 85898001, 233873004, 894...",[Hypertrophic obstructive cardiomyopathy (diso...,50


## An additional method 



In this method we will calculate an embedding for snomed terms with their name using a large language model (Gatortron OG) trained on clinical text. We will then calculate an embedding for our term of choice. /n
With these embedding vectors we can measure their cosine similarty and return a list of similar embeddings. 

In [16]:
import pickle



# Load the dictionary back from the file
with open('/home/cogstack/samora/_data/gloabl_files/gatortron/precomputed_sname_gatortron_base_embedding_dict.pkl', 'rb') as file:
    loaded_dict = pickle.load(file)

# Print the loaded dictionary
print(len(loaded_dict.keys()))


7311327


In [17]:
list(loaded_dict.keys())[0:3]

['neoplasm~of~anterior~surface~of~epiglottis~diagnosis',
 'neoplasm',
 'neoplasm~of']

In [18]:
loaded_dict.get('hemochromatosis')

array([[ 0.13535264,  0.05197329, -0.02210324, ...,  0.01287067,
        -0.5452818 , -0.14283289]], dtype=float32)

In [19]:
import random

display(Markdown("""

This takes a long time, randomly sample keys as an example. Approx 1h for full list. 

"""))


# Get a list of all keys in the dictionary
all_keys = list(loaded_dict.keys())

# Select 1000 random keys
selected_keys = random.sample(all_keys, 100000)

selected_keys = loaded_dict

# Create a new dictionary with only the selected keys
filtered_dict = {key: loaded_dict[key] for key in selected_keys}

# Now, filtered_dict contains only 1000 randomly selected key-value pairs from loaded_dict
#print(filtered_dict)




This takes a long time, randomly sample keys as an example. Approx 1h for full list. 



In [20]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def find_most_similar(target_vector, term_vectors, n=5):
    """
    Find the n most similar vectors to the target_vector from the given term_vectors.

    Parameters:
    - target_vector: The vector for which similarity is to be calculated.
    - term_vectors: A dictionary of term vectors.
    - n: The number of most similar vectors to retrieve (default is 5).

    Returns:
    - A list of tuples, each containing (term, similarity_score), sorted by similarity_score in descending order.
    """
    similarities = {}
    
    # Reshape target_vector to 2D array
    target_vector = target_vector.reshape(1, -1)
    
    for term, vector in tqdm(term_vectors.items()):
        # Reshape vector to 2D array
        vector = vector.reshape(1, -1)
        
        # Calculate cosine similarity
        similarity_score = cosine_similarity(target_vector, vector)[0, 0]
        similarities[term] = similarity_score
    
    # Sort terms by similarity in descending order
    sorted_terms = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    
    # Return the top n most similar vectors with their terms
    top_n_similarities = sorted_terms[:n]
    
    return top_n_similarities

# Example usage:
# Assuming loaded_dict is a dictionary of term vectors
# loaded_dict = {'term1': np.array([[0.1, 0.2, 0.3]]), 'term2': np.array([[0.4, 0.5, 0.6]])}

target_vector = loaded_dict.get('hemochromatosis')
result = find_most_similar(target_vector, filtered_dict, n=50)

# Print the result
for term, similarity_score in result:
    print(f'Term: {term}, Similarity Score: {similarity_score}')


100%|██████████| 7311327/7311327 [49:25<00:00, 2465.26it/s]


Term: hemochromatosis, Similarity Score: 1.000000238418579
Term: haemochromatosis, Similarity Score: 0.9774909615516663
Term: emochromatosis, Similarity Score: 0.9753208756446838
Term: hemochromatosis~due, Similarity Score: 0.9472475051879883
Term: hemochromatosis~secondary, Similarity Score: 0.9382572174072266
Term: latent~hemochromatosis, Similarity Score: 0.9345972537994385
Term: hemachromatosis, Similarity Score: 0.9298010468482971
Term: haemochromatosis~compound, Similarity Score: 0.9290552139282227
Term: hemochromatosis~juvenile, Similarity Score: 0.9279600381851196
Term: hemochromatosis~exogenous, Similarity Score: 0.9247486591339111
Term: hemochromatosis~following, Similarity Score: 0.9222375154495239
Term: hemochromatosis~neonatal, Similarity Score: 0.9211658835411072
Term: hemosideroses, Similarity Score: 0.9200485944747925
Term: secondary~hemochromatosis, Similarity Score: 0.9189373254776001
Term: hemosiderosis, Similarity Score: 0.917299211025238
Term: heterozygous~hemochro