In [1]:
# Libraries
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)
import networkx as nx

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pickle

Graph

In [2]:
# -----------------------------
# Import: read the four tab-separated International tables into DataFrames
# -----------------------------

# StatedRelationship table
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/International/StatedRelationship.txt'
international_statedrelationship = pd.read_csv(path, sep="\t")

# Relationship table (filtered to build the graph in later cells)
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/International/Relationship.txt'
international_relationship = pd.read_csv(path, sep="\t")

# Description table (its 'term' column is used to pick out disorder concepts)
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/International/Description.txt'
international_description = pd.read_csv(path, sep="\t")

# Concept table
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/International/Concept.txt'
international_concept = pd.read_csv(path, sep="\t")

In [3]:
# Keep only descriptions whose term contains the literal text "(disorder)".
# The pattern is a raw string so the backslash escapes reach the regex engine
# unchanged (the non-raw form is an invalid escape sequence and triggers a
# warning on recent Python versions). na=False treats rows with a missing
# term as non-matches, which is what the former `== True` comparison did.
description2 = international_description[
    international_description['term'].str.contains(r"\(disorder\)", na=False)
]

# Get list of conceptIDs
conceptIDs = description2.conceptId.values.tolist()

# Filter relationship by disorder conceptIDs.
# Both ends must be disorder concepts (& rather than |), so only
# diagnosis-to-diagnosis relationships remain.
relationship2 = international_relationship[
    international_relationship['sourceId'].isin(conceptIDs)
    & international_relationship['destinationId'].isin(conceptIDs)
]

# Filter relationship by 'is a' relationships (typeId 116680003)
relationship2 = relationship2[relationship2['typeId'] == 116680003]

In [4]:
# Generate df for working out distances using codes:
# de-duplicate on the listed columns, then keep only the rows flagged active.
# NOTE(review): duplicates are dropped before the active filter, so an active row
# survives even if a row with active != 1 exists for the same source/destination
# pair - TODO confirm that is intended.
relationship2_2 = relationship2.drop_duplicates(
    subset=[
        'active',
        'moduleId',
        'sourceId',
        'destinationId',
        'relationshipGroup',
        'typeId',
        'characteristicTypeId',
        'modifierId',
    ]
)
relationship2_2 = relationship2_2[relationship2_2['active'] == 1]

In [5]:
# Get iCARE codes: three CSV files (diagnosis, problem and risk-factor codes)

# Diagnosis codes
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/snomed_diagnosis_codes_morethan5.csv'
snomed_diagnosis_codes = pd.read_csv(filepath_or_buffer=path)

# Problem codes
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/snomed_problem_codes_morethan5.csv'
snomed_problem_codes = pd.read_csv(filepath_or_buffer=path)

# Risk-factor codes
path = r'/home/wb1115/VSCode_projects/multimorbid/csv/snomed_riskfactor_codes_morethan5.csv'
snomed_riskfactor_codes = pd.read_csv(filepath_or_buffer=path)

In [6]:
# Get list of codes in iCARE: one plain Python list per source table
diagnosis_code_list = snomed_diagnosis_codes['DIAGNOSIS_CODE_SNOMED'].values.tolist()
problem_code_list = snomed_problem_codes['PROBLEM'].values.tolist()
riskfactor_code_list = snomed_riskfactor_codes['SNOMED_CODE'].values.tolist()

# Combine: union of the three lists with duplicates removed
code_list = list(
    set(diagnosis_code_list)
    | set(problem_code_list)
    | set(riskfactor_code_list)
)

In [7]:
# Get final list of codes that are in iCARE AND appear in relationship2_2
# (as either end of a relationship row)
sourceId_code_list = relationship2_2['sourceId'].values.tolist()
destinationId_code_list = relationship2_2['destinationId'].values.tolist()

# Every code mentioned on either side of a relationship
Id_code_list = list(
    set(sourceId_code_list) | set(destinationId_code_list)
)

# Codes present in both the iCARE list and the relationship table
combined_code_list = list(
    set(code_list) & set(Id_code_list)
)

In [8]:
# Graph with codes as nodes.
# source/target are deliberately swapped: each edge runs destinationId -> sourceId,
# so graph.predecessors(code) yields the destinationId values of the rows in which
# `code` is the sourceId.
id_disorder_graph = nx.from_pandas_edgelist(
    relationship2_2,
    source='destinationId',
    target='sourceId',
    create_using=nx.DiGraph,
)

In [10]:
# One-line summary of the graph followed by a directedness check.
# nx.info() was deprecated in NetworkX 2.x and removed in 3.0; str(G) returns the
# same "DiGraph with N nodes and M edges" summary on NetworkX >= 2.6.
str(id_disorder_graph)
nx.is_directed(id_disorder_graph)

'DiGraph with 107733 nodes and 342784 edges'

True

In [12]:
def recursion_fun(graph, list, n, flag, exsplored_list, parent_list):
    """Collect the predecessors of every node reachable upwards from ``list``.

    Starting from the nodes in ``list``, repeatedly follows
    ``graph.predecessors(node)`` depth-first and records every predecessor
    encountered. Each node is expanded at most once, so graphs with shared
    ancestors or cycles terminate.

    Parameters
    ----------
    graph : object exposing ``predecessors(node)`` (e.g. a ``networkx.DiGraph``)
        The graph that is traversed. The traversal always uses this argument;
        it no longer falls back on a module-level graph.
    list : iterable
        Nodes to start from. (The name shadows the builtin but is kept so
        existing callers keep working.)
    n, flag : any
        Unused; retained for backward compatibility with existing call sites.
    exsplored_list : list
        Nodes already expanded. Nodes present on entry are skipped, and every
        node expanded by this call is appended to it (the caller's list is
        mutated, as before).
    parent_list : list
        Initial contents of the result. It is copied, not mutated.

    Returns
    -------
    list
        ``parent_list`` followed by the predecessors found, in the order they
        were encountered. Entries repeat (each expanded node contributes its
        predecessors before and after its subtree is walked); callers that
        need unique values should wrap the result in ``set()``.
    """
    # O(1) membership mirror of exsplored_list; the list itself is still
    # appended to so callers observe the same side effect as before.
    seen = set(exsplored_list)
    # Copy so the caller's parent_list is left untouched (the builtin list()
    # is shadowed by the parameter name, hence the unpacking form).
    collected = [*parent_list]

    def _walk(nodes):
        """Depth-first expansion of ``nodes``, extending ``collected`` in place."""
        for node in nodes:
            if node in seen:
                continue
            preds = [pred for pred in graph.predecessors(node)]
            seen.add(node)
            exsplored_list.append(node)
            collected.extend(preds)
            if preds:
                _walk(preds)
                # Repeated on purpose: keeps the returned sequence identical
                # to what this function has always produced.
                collected.extend(preds)

    _walk(list)
    return collected

In [12]:
# Raise the interpreter's recursion limit to 5000 for the recursive ancestor
# traversal (recursion_fun) that the next cell runs over the graph.
import sys
sys.setrecursionlimit(5000)

In [13]:
# Get all parent codes for final graph.
# For every iCARE code present in the graph, collect all of its ancestors
# (direct predecessors plus everything recursion_fun finds above them).
#   code_parents_dict : code -> list of its unique ancestor codes
#   final_code_list   : unique codes made up of the iCARE codes and all their ancestors
code_parents_dict = {}
# Accumulate into a set instead of re-concatenating an ever-growing list on
# every iteration (which recopied the whole list each time).
final_code_set = set(combined_code_list)
for x in combined_code_list:
    # Direct parents of this code
    predecessors = list(id_disorder_graph.predecessors(x))
    # Fresh traversal state per code
    exsplored_list = []
    parent_list = []
    specific_code_parents2 = recursion_fun(id_disorder_graph, predecessors, 0, True, exsplored_list, parent_list)
    # Direct parents + everything found above them, de-duplicated
    specific_code_parents = list(set(predecessors + specific_code_parents2))
    code_parents_dict[x] = specific_code_parents
    final_code_set.update(specific_code_parents)
final_code_list = list(final_code_set)

In [14]:
# Inspect the results of the previous cell. Every bare expression below is echoed
# (not just the last one) because the setup cell sets
# InteractiveShell.ast_node_interactivity = "all".
# specific_code_parents holds the value left by the final loop iteration only.
specific_code_parents
len(specific_code_parents)
# Full code -> ancestors mapping and its number of keys
code_parents_dict
len(code_parents_dict)
# All unique codes (iCARE codes plus their ancestors) and their count
final_code_list
len(final_code_list)

[928000,
 64572001,
 19660004,
 312225001,
 81573002,
 105969002,
 362965005,
 373673007,
 123946008]

9

{52011008: [928000,
  122549002,
  118933004,
  362965005,
  128597007,
  123397009,
  128618006,
  123946008,
  125599006,
  127278005,
  118947000,
  609411003,
  262515005,
  897603007,
  609336008,
  128605003,
  417746004,
  64572001,
  19660004,
  312225001,
  105969002,
  417163006],
 202752002: [928000,
  2304001,
  46176001,
  118928003,
  399269003,
  76069003,
  362965005,
  280133005,
  363173007,
  123946008,
  371082009,
  410730009,
  50927007,
  118948005,
  443700006,
  274137005,
  53417006,
  128121009,
  785875003,
  699699005,
  129139009,
  363171009,
  367539009,
  372109003,
  609618002,
  399986003,
  363170005,
  822988000,
  64572001,
  8316001,
  33308003,
  19660004,
  312225001,
  105969002,
  699302001,
  363169009,
  88230002,
  239718003,
  302934007,
  128139000,
  3723001,
  36427004,
  363179006],
 197632002: [36171008,
  20917003,
  362965005,
  123946008,
  118943001,
  197679002,
  362975008,
  90708001,
  118948005,
  42030000,
  128121009,
  128

2694

[735445000,
 92373002,
 22741003,
 78250005,
 283050005,
 399999000,
 199295003,
 240255006,
 230654000,
 312574001,
 115966001,
 238846003,
 785875003,
 208339003,
 247464001,
 63144007,
 126845000,
 397181002,
 118653003,
 724861006,
 422183001,
 363004000,
 2556008,
 449020009,
 443089001,
 713425003,
 111313001,
 283345006,
 699302001,
 723878005,
 31654005,
 248742008,
 128123007,
 281936005,
 724304006,
 288293001,
 22053006,
 208634001,
 7930004,
 1179386005,
 268239009,
 118948005,
 450724008,
 92537005,
 301433005,
 242254002,
 363299008,
 816120008,
 254968009,
 724599009,
 5964004,
 733217006,
 700449008,
 235766003,
 37323009,
 363168001,
 449184004,
 285344007,
 72352009,
 125600009,
 373621006,
 414581006,
 123765007,
 254837009,
 64586002,
 195658003,
 251167004,
 197919005,
 443679004,
 427295004,
 312607004,
 237044002,
 282100009,
 105972009,
 237896000,
 129139009,
 414024009,
 129565002,
 363037003,
 18973006,
 29426003,
 283378004,
 21234008,
 64455005,
 251036003,

7610

In [None]:
# Save (disabled): uncomment to write code_parents_dict to 'snomed_codes_parents_dict.pickle'
#with open('snomed_codes_parents_dict.pickle', 'wb') as handle:
#    pickle.dump(code_parents_dict, handle)

In [9]:
# Reload the code -> parents mapping from the pickle file and expose it as a
# pandas Series named 'parents', indexed by 'snomed_code'.
# NOTE: unpickling can execute arbitrary code - only load pickle files that you
# created yourself or otherwise trust.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open("snomed_codes_parents_dict.pickle", "rb") as handle:
    code_parents_dict = pickle.load(handle)
snomed_codes_parents_df = pd.Series(code_parents_dict, name='parents')
snomed_codes_parents_df.index.name = 'snomed_code'
snomed_codes_parents_df

In [10]:
# Save 
#snomed_codes_parents_df.to_csv('snomed_codes_parents.csv')