In [1]:
# Libraries
import numpy as np
import pandas as pd
# Show up to 40 columns and allow very wide lines when DataFrames are rendered
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)
import networkx as nx

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one (cells below rely on this to show several len(...) results at once)
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# -----------------------------
# Import snomed files
# -----------------------------
# The three International release tables are read as tab-separated text.
# `path` is rebound before each read, so it ends up pointing at Concept.txt.

path = r'data/International/Relationship.txt'
international_relationship = pd.read_csv(filepath_or_buffer=path, sep="\t")

path = r'data/International/Description.txt'
international_description = pd.read_csv(filepath_or_buffer=path, sep="\t")

path = r'data/International/Concept.txt'
international_concept = pd.read_csv(filepath_or_buffer=path, sep="\t")

In [5]:
# -----------------------------
# Import files from icht
# -----------------------------
# Three comma-separated code tables: diagnoses, problems and risk factors.
path = r'data/snomed_diagnosis_codes_morethan5.csv'
snomed_diagnosis_codes = pd.read_csv(filepath_or_buffer=path)

path = r'data/snomed_problem_codes_morethan5.csv'
snomed_problem_codes = pd.read_csv(filepath_or_buffer=path)

path = r'data/snomed_riskfactor_codes_morethan5.csv'
snomed_riskfactor_codes = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the problem-code table loaded from data/snomed_problem_codes_morethan5.csv
snomed_problem_codes

Unnamed: 0,PROBLEM,PROBLEM_DESC,count
0,38341003,"Hypertensive disorder, systemic arterial (diso...",15978.0
1,38907003,Varicella (disorder),8417.0
2,195967001,Asthma (disorder),7612.0
3,13644009,Hypercholesterolemia (disorder),6231.0
4,44054006,Diabetes mellitus type 2 (disorder),5662.0
...,...,...,...
2356,81576005,Closed fracture of phalanx of foot (disorder),5.0
2357,68226007,Acute cystitis (disorder),5.0
2358,372138000,Carcinoma of esophagus (disorder),5.0
2359,204731006,Imperforate anus (disorder),5.0


In [6]:
# Pull the SNOMED code column out of each ICHT table as a plain Python list
diagnosis_code_list = snomed_diagnosis_codes['DIAGNOSIS_CODE_SNOMED'].tolist()
problem_code_list = snomed_problem_codes['PROBLEM'].tolist()
riskfactor_code_list = snomed_riskfactor_codes['SNOMED_CODE'].tolist()
# Union of the three sources, with duplicates removed
code_list = list(
    set(diagnosis_code_list) | set(problem_code_list) | set(riskfactor_code_list)
)

In [9]:
# Filter international snomed for disorders.
# The pattern is a raw string: "\(" inside a normal string literal is an invalid
# escape sequence. na=False makes rows with a missing term count as non-matching,
# which selects the same rows as comparing the result with True.
description2 = international_description[
    international_description['term'].str.contains(r"\(disorder\)", na=False)
]
# Get list of conceptIDs
conceptIDs = description2['conceptId'].tolist()
# Keep relationships that touch a disorder concept on either end ...
relationship2 = international_relationship[
    international_relationship['sourceId'].isin(conceptIDs)
    | international_relationship['destinationId'].isin(conceptIDs)
]
# ... and that are 'is a' relationships (typeId 116680003)
relationship2 = relationship2[relationship2['typeId'] == 116680003]
# Generate df for working out distances using codes: drop rows that repeat the
# same relationship, then keep only the active ones. Built without in-place
# mutation so re-running the cell always starts from `relationship2`.
relationship2_2 = (
    relationship2
    .drop_duplicates(subset=['active', 'moduleId', 'sourceId', 'destinationId',
                             'relationshipGroup', 'typeId', 'characteristicTypeId',
                             'modifierId'])
    .loc[lambda frame: frame['active'] == 1]
)
# Remove codes from list that will not be in the graph
sourceId_code_list = relationship2_2['sourceId'].tolist()
destinationId_code_list = relationship2_2['destinationId'].tolist()
Id_code_list = list(set(sourceId_code_list) | set(destinationId_code_list))
final_code_list = list(set(code_list) & set(Id_code_list))
# Sizes of: all ids in the relationships, all ICHT codes, ICHT codes present in the relationships
len(Id_code_list)
len(code_list)
len(final_code_list)

110571

2743

2694

In [11]:
# Graph with codes as nodes: one edge per row of relationship2_2, joining
# sourceId to destinationId.
# Only current active relationships are included - these paths can be verified online.
id_disorder_graph = nx.from_pandas_edgelist(
    relationship2_2,
    source='sourceId',
    target='destinationId',
)

In [109]:
# Work out path lengths and put in matrix
def path_lengths(graph, index_list):
    """Return a square matrix of unweighted shortest-path lengths.

    Parameters
    ----------
    graph : networkx graph
        Graph whose nodes include (some of) the entries of ``index_list``.
    index_list : sequence
        Node identifiers; entry ``i`` labels both row ``i`` and column ``i``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(index_list), len(index_list))``. Element
        ``[i, j]`` is the number of edges on a shortest path from
        ``index_list[i]`` to ``index_list[j]``. It is NaN when either node is
        not in ``graph`` or when no path connects them.
    """
    print('working...')
    n_codes = len(index_list)
    # Start from all-NaN so every pair that is never filled in reads as "no path".
    np_path_matrix = np.full((n_codes, n_codes), np.nan)
    for row, source in enumerate(index_list):
        # A source that is not a node has no distances at all: leave the row NaN.
        # This explicit check replaces the former bare `except:`.
        if source not in graph:
            continue
        # One breadth-first search per source yields the distance to every
        # reachable node, instead of one search per (source, target) pair.
        distances = dict(nx.single_source_shortest_path_length(graph, source))
        for col, target in enumerate(index_list):
            # Targets that are unreachable or absent are missing from `distances`.
            np_path_matrix[row, col] = distances.get(target, np.nan)
    print('Done!')
    return np_path_matrix

In [110]:
# Pairwise path lengths between every code that is present in the graph
id_disorder_np_path_matrix = path_lengths(
    graph=id_disorder_graph,
    index_list=final_code_list,
)

working...
Done!


In [None]:
# Wrap the path-length matrix in a DataFrame labelled by code on both axes
path_df = pd.DataFrame(
    id_disorder_np_path_matrix,
    index=final_code_list,
    columns=final_code_list,
)

In [None]:
# Take 1 / path length to get numbers between 0 and 1.
# A path length of 0 divides to +/-inf, which is mapped to 1.
path_df2 = (1 / path_df).replace([np.inf, -np.inf], 1)

In [73]:
# International
# Shape of the path-length matrix: one row and one column per entry of final_code_list
id_disorder_np_path_matrix.shape

(2694, 2694)

In [82]:
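# Save - International (disabled; uncomment to regenerate the CSV files).
# NOTE: these calls write to the working directory, whereas the "Adjustment"
# cell below reads data/international_snomed_distance.csv.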
#path_df.to_csv('international_snomed_distance.csv')
#path_df2.to_csv('international_reciprocal_snomed_distance.csv')

In [6]:
# Adjustment
# Reload the saved path-length table; its first CSV column holds the row labels.
path = r'data/international_snomed_distance.csv'
international_snomed_distance = pd.read_csv(filepath_or_buffer=path, index_col=0)

In [10]:
# Preview the distance table loaded from data/international_snomed_distance.csv
international_snomed_distance

Unnamed: 0,52011008,202752002,197632002,205824006,423125000,90325002,38101003,295125005,127189005,372138000,61653009,282026002,78250005,203178006,162218007,146801000119103,15188001,302932006,2089002,230654000,...,52781008,403202002,400130008,232407000,254935002,18391007,125501000119105,93143009,230572002,723116002,40108008,415105001,41345002,371073003,5505005,425558002,262955000,367403001,109355002,65323003
52011008,0.0,4.0,5.0,4.0,2.0,4.0,5.0,5.0,5.0,5.0,3.0,2.0,5.0,4.0,6.0,5.0,4.0,4.0,2.0,4.0,...,3.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,3.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0
202752002,4.0,0.0,6.0,4.0,4.0,4.0,5.0,6.0,4.0,5.0,5.0,3.0,4.0,4.0,6.0,4.0,5.0,4.0,3.0,4.0,...,3.0,5.0,4.0,4.0,4.0,5.0,5.0,4.0,3.0,3.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,5.0,4.0,4.0
197632002,5.0,6.0,0.0,6.0,7.0,6.0,6.0,8.0,7.0,7.0,6.0,5.0,6.0,6.0,8.0,6.0,5.0,7.0,5.0,6.0,...,6.0,7.0,6.0,6.0,6.0,7.0,7.0,7.0,6.0,7.0,6.0,6.0,6.0,7.0,5.0,7.0,6.0,6.0,6.0,6.0
205824006,4.0,4.0,6.0,0.0,4.0,4.0,4.0,5.0,5.0,3.0,4.0,3.0,4.0,4.0,6.0,4.0,3.0,5.0,3.0,4.0,...,3.0,5.0,4.0,3.0,4.0,6.0,5.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,4.0,5.0,4.0,4.0,4.0,4.0
423125000,2.0,4.0,7.0,4.0,0.0,5.0,5.0,5.0,5.0,4.0,3.0,3.0,5.0,4.0,7.0,5.0,4.0,4.0,3.0,4.0,...,3.0,5.0,4.0,4.0,4.0,5.0,6.0,4.0,4.0,5.0,6.0,4.0,3.0,6.0,5.0,5.0,3.0,5.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425558002,5.0,5.0,7.0,5.0,5.0,5.0,6.0,5.0,5.0,5.0,6.0,3.0,5.0,5.0,7.0,5.0,5.0,6.0,4.0,5.0,...,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,6.0,5.0,0.0,5.0,4.0,3.0,5.0
262955000,4.0,4.0,6.0,4.0,3.0,4.0,5.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,6.0,4.0,4.0,5.0,4.0,3.0,...,4.0,5.0,3.0,4.0,3.0,5.0,5.0,5.0,4.0,6.0,4.0,3.0,4.0,5.0,4.0,5.0,0.0,4.0,4.0,4.0
367403001,5.0,5.0,6.0,4.0,5.0,3.0,6.0,6.0,5.0,4.0,6.0,4.0,4.0,5.0,7.0,5.0,5.0,5.0,5.0,4.0,...,4.0,6.0,4.0,4.0,3.0,6.0,5.0,4.0,4.0,6.0,5.0,4.0,5.0,5.0,5.0,4.0,4.0,0.0,4.0,5.0
109355002,4.0,4.0,6.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,5.0,4.0,6.0,3.0,4.0,5.0,4.0,3.0,...,4.0,4.0,4.0,4.0,3.0,5.0,5.0,3.0,4.0,5.0,4.0,3.0,4.0,5.0,4.0,3.0,4.0,4.0,0.0,4.0


In [8]:
# Shift every distance up by one, so that a path step of 1 does not turn into
# a multiplication by one later
new_international_snomed_distance = international_snomed_distance.add(1)

In [9]:
# Preview the distances after adding one to every entry
new_international_snomed_distance

Unnamed: 0,52011008,202752002,197632002,205824006,423125000,90325002,38101003,295125005,127189005,372138000,61653009,282026002,78250005,203178006,162218007,146801000119103,15188001,302932006,2089002,230654000,...,52781008,403202002,400130008,232407000,254935002,18391007,125501000119105,93143009,230572002,723116002,40108008,415105001,41345002,371073003,5505005,425558002,262955000,367403001,109355002,65323003
52011008,1.0,5.0,6.0,5.0,3.0,5.0,6.0,6.0,6.0,6.0,4.0,3.0,6.0,5.0,7.0,6.0,5.0,5.0,3.0,5.0,...,4.0,5.0,5.0,5.0,5.0,6.0,6.0,6.0,5.0,6.0,5.0,5.0,4.0,6.0,6.0,6.0,5.0,6.0,5.0,5.0
202752002,5.0,1.0,7.0,5.0,5.0,5.0,6.0,7.0,5.0,6.0,6.0,4.0,5.0,5.0,7.0,5.0,6.0,5.0,4.0,5.0,...,4.0,6.0,5.0,5.0,5.0,6.0,6.0,5.0,4.0,4.0,5.0,5.0,5.0,6.0,5.0,6.0,5.0,6.0,5.0,5.0
197632002,6.0,7.0,1.0,7.0,8.0,7.0,7.0,9.0,8.0,8.0,7.0,6.0,7.0,7.0,9.0,7.0,6.0,8.0,6.0,7.0,...,7.0,8.0,7.0,7.0,7.0,8.0,8.0,8.0,7.0,8.0,7.0,7.0,7.0,8.0,6.0,8.0,7.0,7.0,7.0,7.0
205824006,5.0,5.0,7.0,1.0,5.0,5.0,5.0,6.0,6.0,4.0,5.0,4.0,5.0,5.0,7.0,5.0,4.0,6.0,4.0,5.0,...,4.0,6.0,5.0,4.0,5.0,7.0,6.0,5.0,5.0,6.0,4.0,5.0,4.0,4.0,5.0,6.0,5.0,5.0,5.0,5.0
423125000,3.0,5.0,8.0,5.0,1.0,6.0,6.0,6.0,6.0,5.0,4.0,4.0,6.0,5.0,8.0,6.0,5.0,5.0,4.0,5.0,...,4.0,6.0,5.0,5.0,5.0,6.0,7.0,5.0,5.0,6.0,7.0,5.0,4.0,7.0,6.0,6.0,4.0,6.0,5.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425558002,6.0,6.0,8.0,6.0,6.0,6.0,7.0,6.0,6.0,6.0,7.0,4.0,6.0,6.0,8.0,6.0,6.0,7.0,5.0,6.0,...,5.0,6.0,6.0,6.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,5.0,6.0,7.0,6.0,1.0,6.0,5.0,4.0,6.0
262955000,5.0,5.0,7.0,5.0,4.0,5.0,6.0,6.0,6.0,6.0,5.0,4.0,5.0,5.0,7.0,5.0,5.0,6.0,5.0,4.0,...,5.0,6.0,4.0,5.0,4.0,6.0,6.0,6.0,5.0,7.0,5.0,4.0,5.0,6.0,5.0,6.0,1.0,5.0,5.0,5.0
367403001,6.0,6.0,7.0,5.0,6.0,4.0,7.0,7.0,6.0,5.0,7.0,5.0,5.0,6.0,8.0,6.0,6.0,6.0,6.0,5.0,...,5.0,7.0,5.0,5.0,4.0,7.0,6.0,5.0,5.0,7.0,6.0,5.0,6.0,6.0,6.0,5.0,5.0,1.0,5.0,6.0
109355002,5.0,5.0,7.0,5.0,5.0,5.0,5.0,6.0,5.0,5.0,5.0,5.0,6.0,5.0,7.0,4.0,5.0,6.0,5.0,4.0,...,5.0,5.0,5.0,5.0,4.0,6.0,6.0,4.0,5.0,6.0,5.0,4.0,5.0,6.0,5.0,4.0,5.0,5.0,1.0,5.0


In [11]:
# Reciprocal of the shifted distances (1 / value), giving numbers between 0 and 1
new_international_reciprocal_snomed_distance = new_international_snomed_distance.rdiv(1)

In [12]:
# Save - International
#new_international_snomed_distance.to_csv('new_international_snomed_distance.csv')
#new_international_reciprocal_snomed_distance.to_csv('new_international_reciprocal_snomed_distance.csv')