In [3]:
%matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from pyumls.umls_reader import read_umls
import pyumls.umls as umls
from collections import defaultdict, Counter
import itertools


Using matplotlib backend: TkAgg


In [4]:
# Locations of the UMLS 2019AA Metathesaurus RRF files used in this notebook.
# File and column reference: https://www.ncbi.nlm.nih.gov/books/NBK9685/
# The META directory is named once so switching release or machine is a
# single-line change instead of four literals that can drift apart.
umls_meta_dir = Path(r'/shared/hltdir1/disk1/home/max/data/ontologies/umls_2019/2019AA-full/2019AA/META')
umls_conso = umls_meta_dir / 'MRCONSO.RRF'
umls_def = umls_meta_dir / 'MRDEF.RRF'
umls_rels = umls_meta_dir / 'MRREL.RRF'
umls_hier = umls_meta_dir / 'MRHIER.RRF'

In [1]:
def umls_rel_filter(x):
  """Decide whether a relation row should be kept.

  Args:
    x: object exposing ``cui1``, ``cui2``, ``rel`` and ``rela`` attributes
      (``rela`` is only read when ``rel`` is ``'RO'``).

  Returns:
    ``False`` for rows that are dropped, ``True`` otherwise. Dropped rows are:
      * self-relations, where both CUIs are equal;
      * ``SIB`` - siblings, which the original note says can be inferred
        from ``CHD``;
      * ``PAR`` - the original note says ``CHD`` already covers it
        (presumably because PAR is CHD's inverse - TODO confirm);
      * ``QB`` - the original note pairs it with ``AQ`` in the same way;
      * ``RB`` - considered too vague;
      * ``RO`` with an empty ``rela`` - not descriptive enough.
  """
  # A concept related to itself carries no information.
  if x.cui2 == x.cui1:
    return False

  rel = x.rel
  # Relation types that are rejected whatever their rela is.
  if rel in ('SIB', 'PAR', 'QB', 'RB'):
    return False

  # An "other" relation is only useful when it names a specific rela.
  return not (rel == 'RO' and x.rela == '')


In [5]:
# Read MRREL.RRF through read_umls, passing umls_rel_filter as the row filter
# (presumably only rows for which the filter returns True are yielded - verify
# against pyumls).
rel_iter = read_umls(umls_rels, umls.UmlsRelation, umls_filter=umls_rel_filter)

# Tally how many relations fall under each (rel, rela) type in one pass.
rela_counter = Counter((rel.rel, rel.rela) for rel in rel_iter)
# Every yielded relation added exactly one to some tally, so the grand total
# is the sum of the per-type counts.
total_count = sum(rela_counter.values())

print(f'{total_count} rels match filter.')

20071511 rels match filter.


In [6]:
# Relation types seen fewer than this many times are set aside as too rare.
min_count = 100

kept_rels = []
skipped_rels = []
# Visit the (rel, rela) types from most to least frequent; only the kept ones
# are printed.
for (rel, rela), rela_count in sorted(rela_counter.items(), key=lambda x: -x[1]):
  entry = (rel, rela, rela_count)
  if rela_count >= min_count:
    kept_rels.append(entry)
    print(f'{rel}, {rela}, {rela_count}')
  else:
    skipped_rels.append(entry)

# Number of individual relations (not types) that ended up on each side.
kept_rel_count = sum(count for _, _, count in kept_rels)
skipped_rel_count = sum(count for _, _, count in skipped_rels)

CHD, , 2125393
CHD, isa, 1383066
RO, has_inactive_ingredient, 1253624
RO, inactive_ingredient_of, 1253624
AQ, , 602217
RO, has_finding_site, 355131
RO, finding_site_of, 355131
RN, mapped_to, 354410
RO, has_active_ingredient, 342188
RO, active_ingredient_of, 342188
RO, method_of, 337561
RO, has_method, 337561
RN, , 321690
RN, isa, 321396
RQ, classifies, 316858
RQ, classified_as, 316858
RO, has_active_moiety, 284521
RO, active_moiety_of, 284521
RO, associated_morphology_of, 253171
RO, has_associated_morphology, 253171
RO, has_component, 249916
RO, component_of, 249916
RO, has_ingredient, 238135
RO, ingredient_of, 238135
RO, subset_includes_concept, 199280
RO, concept_in_subset, 199280
RO, dose_form_of, 133035
RO, has_dose_form, 133035
RO, has_procedure_site, 129560
RO, procedure_site_of, 129560
RO, has_system, 127876
RO, system_of, 127876
RO, has_manifestation, 113166
RO, manifestation_of, 113166
RO, constitutes, 110875
RO, consists_of, 110875
RN, tradename_of, 109579
RO, has_class, 9843

In [7]:
# Summarise the frequency cut: number of relation types and of individual
# relations on each side of the min_count threshold.
print(
  f'Kept {len(kept_rels)} rel types with {kept_rel_count} rels.',
  f'Skipped {len(skipped_rels)} rel types with {skipped_rel_count} rels.',
  sep='\n',
)

Kept 629 rel types with 20066621 rels.
Skipped 175 rel types with 4890 rels.


In [15]:
# Interactive pass over kept_rels looking for relation types that are probably
# the two directions of one relationship. Candidates are ADJACENT entries of
# kept_rels with exactly the same count (kept_rels is ordered by descending
# count, so equal counts sit next to each other).
#
# For every candidate pair one side is recorded in removed_rels:
#   * a has_X / X_of pair is resolved automatically in favour of the has_ form;
#   * anything else is decided by the user at the prompt
#     (1 = keep left, 2 = keep right, 3 = keep both, 0 = stop the pass).
#
# NOTE(review): the prompt answers are not stored anywhere in the notebook, so
# re-running this cell needs the same answers typed again to reproduce
# removed_rels.
removed_rels = []
exit_early = False


def _mark_removed(entry):
  """Record ``entry`` (rel, rela, count) for removal, at most once.

  When three or more adjacent types share a count, the same type takes part
  in two consecutive comparisons and can lose both; without this guard it
  would be listed twice in removed_rels (and in removed_rels.txt).
  """
  if entry not in removed_rels:
    removed_rels.append(entry)


for left, right in zip(kept_rels[:-1], kept_rels[1:]):
  f_rel, f_rela, f_count = left
  s_rel, s_rela, s_count = right
  # Different counts: not treated as a candidate inverse pair.
  if f_count != s_count:
    continue

  print(f'({f_rel}:{f_rela}) ?= ({s_rel}:{s_rela}) ({f_count})')

  # Naming convention shortcut: prefer the has_* direction over the *_of one.
  if f_rela.startswith('has_') and s_rela.endswith('_of'):
    print('Auto resolve to keep left (1)')
    _mark_removed(right)
    continue
  if f_rela.endswith('_of') and s_rela.startswith('has_'):
    print('Auto resolve to keep right (2)')
    _mark_removed(left)
    continue

  # No convention applies: keep asking until a valid option is entered.
  while True:
    choice = input('Keep left (1), keep right (2), skip (3), exit (0):')
    if choice == '1':
      _mark_removed(right)
      break
    if choice == '2':
      _mark_removed(left)
      break
    if choice == '3':
      # Keep both sides.
      break
    if choice == '0':
      print('Exiting early.')
      exit_early = True
      break
    print(f'Invalid option: {choice}')

  if exit_early:
    break

(RO:has_inactive_ingredient) ?= (RO:inactive_ingredient_of) (1253624)
Auto resolve to keep left (1)
(RO:has_finding_site) ?= (RO:finding_site_of) (355131)
Auto resolve to keep left (1)
(RO:has_active_ingredient) ?= (RO:active_ingredient_of) (342188)
Auto resolve to keep left (1)
(RO:method_of) ?= (RO:has_method) (337561)
Auto resolve to keep right (2)
(RQ:classifies) ?= (RQ:classified_as) (316858)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:has_active_moiety) ?= (RO:active_moiety_of) (284521)
Auto resolve to keep left (1)
(RO:associated_morphology_of) ?= (RO:has_associated_morphology) (253171)
Auto resolve to keep right (2)
(RO:has_component) ?= (RO:component_of) (249916)
Auto resolve to keep left (1)
(RO:has_ingredient) ?= (RO:ingredient_of) (238135)
Auto resolve to keep left (1)
(RO:subset_includes_concept) ?= (RO:concept_in_subset) (199280)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:dose_form_of) ?= (RO:has_dose_form) (133035)
Auto resolve to keep right (2)


Keep left (1), keep right (2), skip (3), exit (0):1
(RO:may_be_molecular_abnormality_of_disease) ?= (RO:disease_may_have_molecular_abnormality) (8720)
Keep left (1), keep right (2), skip (3), exit (0):2
(RQ:used_for) ?= (RQ:use) (8213)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:uses) ?= (RO:used_by) (7663)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:occurs_after) ?= (RO:occurs_before) (7206)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:regulates) ?= (RO:regulated_by) (7180)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:therapeutic_class_of) ?= (RO:has_therapeutic_class) (6834)
Auto resolve to keep right (2)
(RO:uses_substance) ?= (RO:substance_used_by) (6759)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:has_basis_of_strength_substance) ?= (RO:basis_of_strength_substance_of) (6554)
Auto resolve to keep left (1)
(RO:has_focus) ?= (RO:focus_of) (6493)
Auto resolve to keep left (1)
(RO:has_measured_component) ?= (RO:measured_componen

Keep left (1), keep right (2), skip (3), exit (0):2
(RO:has_concentration_strength_numerator_value) ?= (RO:concentration_strength_denominator_value_of) (1859)
Auto resolve to keep left (1)
(RO:concentration_strength_denominator_value_of) ?= (RO:concentration_strength_numerator_unit_of) (1859)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:concentration_strength_numerator_unit_of) ?= (RO:concentration_strength_denominator_unit_of) (1859)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:concentration_strength_denominator_unit_of) ?= (RO:concentration_strength_numerator_value_of) (1859)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:concentration_strength_numerator_value_of) ?= (RO:has_concentration_strength_numerator_unit) (1859)
Auto resolve to keep right (2)
(RO:has_concentration_strength_numerator_unit) ?= (RO:has_concentration_strength_denominator_unit) (1859)
Keep left (1), keep right (2), skip (3), exit (0):1
(RQ:consider) ?= (RQ:consider_from) (1830)
Keep 

Keep left (1), keep right (2), skip (3), exit (0):2
(RO:excised_anatomy_has_procedure) ?= (RO:procedure_has_excised_anatomy) (476)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:superior_to) ?= (RO:inferior_to) (469)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:metabolic_site_of) ?= (RO:site_of_metabolism) (441)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:is_exam_for) ?= (RO:has_exam) (431)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:has_contraindicated_mechanism_of_action) ?= (RO:contraindicated_mechanism_of_action_of) (422)
Auto resolve to keep left (1)
(RO:gene_product_variant_of_gene_product) ?= (RO:gene_product_has_gene_product_variant) (415)
Keep left (1), keep right (2), skip (3), exit (0):1
(RO:scale_type_of) ?= (RO:has_scale_type) (398)
Auto resolve to keep right (2)
(RO:has_specimen_source_identity) ?= (RO:specimen_source_identity_of) (397)
Auto resolve to keep left (1)
(RN:mapped_from) ?= (RO:regimen_has_accepted_use_for_disease) (

(RO:receives_drainage_from) ?= (RO:drains_into) (107)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:special_category_includes_neoplasm) ?= (RO:neoplasm_has_special_category) (105)
Keep left (1), keep right (2), skip (3), exit (0):2
(RO:chromosomal_location_of_allele) ?= (RO:allele_in_chromosomal_location) (104)
Keep left (1), keep right (2), skip (3), exit (0):2


In [53]:
# Persist the relation types dropped by the inverse-pair pass, one
# "rel, rela, count" line per entry.
with open('removed_rels.txt', 'w') as removed_file:
  removed_file.writelines(
    f'{f_rel}, {f_rela}, {f_count}\n'
    for (f_rel, f_rela, f_count) in removed_rels
  )

In [21]:
# Relation types that survive both the frequency cut and the inverse-pair
# pass, most frequent first.
#
# kept_rels is filtered in place of sorting a set difference: iterating a set
# of tuples of strings has no stable order between kernel sessions (string
# hashing is randomised), so types with equal counts used to come out in a
# different order on each run. That order feeds count_mapping / merge_mapping
# and the files written from them.
removed_rel_set = set(removed_rels)
filtered_rels = [entry for entry in kept_rels if entry not in removed_rel_set]
# Stable sort: guarantees descending count while leaving ties in kept_rels order.
filtered_rels.sort(key=lambda x: -x[2])
# Resets the flag left behind by the interactive pass; it is not read in this cell.
exit_early = False
new_count = 0
for (f_rel, f_rela, f_count) in filtered_rels:
  print(f'{f_rel}, {f_rela}, {f_count}')
  new_count += f_count
print(f'filtered rel count: {new_count}')

CHD, , 2125393
CHD, isa, 1383066
RO, has_inactive_ingredient, 1253624
AQ, , 602217
RO, has_finding_site, 355131
RN, mapped_to, 354410
RO, has_active_ingredient, 342188
RO, has_method, 337561
RN, , 321690
RN, isa, 321396
RQ, classifies, 316858
RO, has_active_moiety, 284521
RO, has_associated_morphology, 253171
RO, has_component, 249916
RO, has_ingredient, 238135
RO, concept_in_subset, 199280
RO, has_dose_form, 133035
RO, has_procedure_site, 129560
RO, has_system, 127876
RO, has_manifestation, 113166
RO, constitutes, 110875
RN, tradename_of, 109579
RO, has_class, 98438
RO, has_fragments_for_synonyms, 95071
RO, has_time_aspect, 88953
RO, has_scale, 85824
RO, has_property, 85545
RO, has_causative_agent, 84622
RN, part_of, 84035
RO, has_direct_procedure_site, 82208
RO, may_treat, 81544
RO, interprets, 80351
RO, answer_to, 79378
RO, measures, 78947
RO, analyzes, 72041
RO, possibly_equivalent_to, 65964
RO, mapped_to, 63108
RQ, , 59292
RO, has_physiologic_effect, 55461
RO, has_contraindicated_

In [39]:
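# Merge helpers. They mutate the module-level count_mapping / merge_mapping dicts, which must be initialised (identity mapping) before any helper is called.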
def left_merge(x, y):
  """Fold relation type ``y`` into relation type ``x``.

  Operates on the module-level ``count_mapping`` (type -> count) and
  ``merge_mapping`` (original type -> type it was merged into) dicts.

  After the call ``x`` holds the combined count, ``y`` no longer has a count
  of its own, and ``y`` - together with every type previously merged into
  ``y`` - maps to ``x``.

  Args:
    x: key of the surviving type; must be present in ``count_mapping``.
    y: key of the absorbed type; must be present in ``count_mapping``.

  Raises:
    KeyError: if ``x`` or ``y`` is missing from ``count_mapping``; nothing
      has been modified when this happens.
  """
  # Merging a type into itself must be a no-op; otherwise its count would be
  # doubled and the entry then deleted.
  if x == y:
    return
  count_mapping[x] += count_mapping[y]
  del count_mapping[y]
  # Retarget everything that pointed at y so chained merges never leave a
  # mapping aimed at a type that no longer exists in count_mapping.
  # Only values of existing keys change here, so iterating is safe.
  for source, target in merge_mapping.items():
    if target == y:
      merge_mapping[source] = x
  merge_mapping[y] = x
def right_merge(x, y):
  """Fold relation type ``x`` into ``y``: ``left_merge`` with the roles swapped."""
  survivor, absorbed = y, x
  left_merge(survivor, absorbed)

def new_merge(x, y, z):
  """Combine relation types ``x`` and ``y`` under the key ``z``.

  ``z`` is registered in the module-level ``count_mapping`` with a count of
  zero and in ``merge_mapping`` as mapping to itself; ``x`` and then ``y``
  are folded into it via ``left_merge``.
  """
  # Register z first so left_merge has an entry to accumulate into.
  count_mapping[z] = 0
  merge_mapping[z] = z
  for absorbed in (x, y):
    left_merge(z, absorbed)
  
def left_multi_merge(x, y_list):
  """Fold every relation type in ``y_list`` into ``x``, in iteration order.

  Args:
    x: key of the surviving type.
    y_list: iterable of keys to absorb; each is passed to ``left_merge``.
  """
  for absorbed in y_list:
    left_merge(x, absorbed)

In [38]:
# Group the surviving relation types by rela to spot relas that appear under
# more than one rel; types with an empty rela are left out.
rela_mapping = defaultdict(list)
for (r, ra, rc) in filtered_rels:
  if ra == '':
    continue
  rela_mapping[ra].append(r)

# Report only the relas shared by several rels.
shared_relas = ((ra, r_list) for ra, r_list in rela_mapping.items() if len(r_list) > 1)
for ra, r_list in shared_relas:
  print(f'{ra}: {r_list}')


isa: ['CHD', 'RN']
mapped_to: ['RN', 'RO', 'RQ']
part_of: ['RN', 'CHD']
member_of: ['RN', 'CHD']
has_precise_ingredient: ['RN', 'RO']
replaces: ['RO', 'RQ']
contains: ['RN', 'RO']


In [42]:
# Start from the identity: every surviving (rel, rela) type maps to itself and
# carries its own count. Rebuilding both dicts here makes the cell safe to
# re-run.
count_mapping = {}
merge_mapping = {}
for (r, ra, rc) in filtered_rels:
  count_mapping[(r, ra)] = rc
  merge_mapping[(r, ra)] = (r, ra)

# Hand-picked merges as (surviving type, [types folded into it]), applied in
# this order.
merge_plan = [
  # (CHD, ''), (RN, ''), (RN, isa) and (CHD, has_parent) all become (CHD, isa)
  (('CHD', 'isa'), [('CHD', ''), ('RN', ''), ('RN', 'isa'), ('CHD', 'has_parent')]),
  # mapped_to appears under RN, RO and RQ -> single (RO, mapped_to)
  (('RO', 'mapped_to'), [('RN', 'mapped_to'), ('RQ', 'mapped_to')]),
  # part_of appears under RN and CHD
  (('RN', 'part_of'), [('CHD', 'part_of')]),
  # member_of appears under RN and CHD
  (('RN', 'member_of'), [('CHD', 'member_of')]),
  # has_precise_ingredient appears under RN and RO
  (('RO', 'has_precise_ingredient'), [('RN', 'has_precise_ingredient')]),
  # replaces appears under RO and RQ
  (('RO', 'replaces'), [('RQ', 'replaces')]),
  # contains appears under RN and RO
  (('RO', 'contains'), [('RN', 'contains')]),
]
for merge_target, merge_sources in merge_plan:
  left_multi_merge(merge_target, merge_sources)

In [54]:
# Save the merge table, one "rel:rela,merged_rel:merged_rela" line per
# original relation type.
with open('rel_merge_mapping.txt', 'w') as mapping_file:
  mapping_file.writelines(
    f'{r}:{ra},{mr}:{mra}\n'
    for (r, ra), (mr, mra) in merge_mapping.items()
  )

In [55]:
# Save the post-merge counts, one "rel:rela,count" line per remaining type.
with open('rel_count_mapping.txt', 'w') as counts_file:
  counts_file.writelines(
    f'{r}:{ra},{rc}\n'
    for (r, ra), rc in count_mapping.items()
  )

In [51]:
# Final report: merged relation types from most to least frequent, followed by
# the total number of relations and of distinct types.
ranked_counts = sorted(count_mapping.items(), key=lambda x: -x[1])
for (r, ra), rc in ranked_counts:
  print(f'{r}:{ra} - {rc}')
final_count = sum(type_count for _, type_count in ranked_counts)
print(f'final filtered rel count: {final_count}')
print(f'final filtered rel type count: {len(count_mapping)}')

CHD:isa - 4169658
RO:has_inactive_ingredient - 1253624
AQ: - 602217
RO:mapped_to - 427709
RO:has_finding_site - 355131
RO:has_active_ingredient - 342188
RO:has_method - 337561
RQ:classifies - 316858
RO:has_active_moiety - 284521
RO:has_associated_morphology - 253171
RO:has_component - 249916
RO:has_ingredient - 238135
RO:concept_in_subset - 199280
RO:has_dose_form - 133035
RO:has_procedure_site - 129560
RO:has_system - 127876
RO:has_manifestation - 113166
RO:constitutes - 110875
RN:tradename_of - 109579
RN:part_of - 103430
RO:has_class - 98438
RO:has_fragments_for_synonyms - 95071
RO:has_time_aspect - 88953
RO:has_scale - 85824
RO:has_property - 85545
RO:has_causative_agent - 84622
RO:has_direct_procedure_site - 82208
RO:may_treat - 81544
RO:interprets - 80351
RO:answer_to - 79378
RO:measures - 78947
RO:analyzes - 72041
RO:possibly_equivalent_to - 65964
RQ: - 59292
RO:has_physiologic_effect - 55461
RO:has_contraindicated_drug - 51755
RO:has_pathological_process - 51006
RO:is_abnormal_c

In [52]:
!ls

DataValidation.ipynb	 pyumls			removed_rels.txt
ExploreRels.ipynb	 rel_count_mapping.txt
ExploreRels-Stats.ipynb  rel_merge_mapping.txt
