In [9]:
%pwd
%matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from python.pyumls.umls_reader import read_umls
import pyumls.umls as umls
from collections import defaultdict, Counter
import itertools


Using matplotlib backend: TkAgg


ModuleNotFoundError: No module named 'python.pyumls'

In [None]:
# Locations of the UMLS 2019AA Metathesaurus RRF files used by this notebook.
# File format reference: https://www.ncbi.nlm.nih.gov/books/NBK9685/
# The release directory is spelled out once so that pointing the notebook at a
# different copy of the release is a one-line change.
umls_meta_dir = Path(r'/shared/hltdir1/disk1/home/max/data/ontologies/umls_2019/2019AA-full/2019AA/META')
umls_conso = umls_meta_dir / 'MRCONSO.RRF'  # read below as umls.UmlsAtom rows
umls_def = umls_meta_dir / 'MRDEF.RRF'      # read below as umls.UmlsDefinition rows
umls_rels = umls_meta_dir / 'MRREL.RRF'     # read below as umls.UmlsRelation rows
umls_hier = umls_meta_dir / 'MRHIER.RRF'

In [3]:
# Allowed values for the atom fields checked by umls_atom_filter. Abbreviations:
# https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html#STT
languages = {'ENG'}
# Empty on purpose: no atom is currently rejected because of its suppress flag.
# This has to be set(); a bare `{ }` literal creates a dict, not a set.
suppresses = set()
# suppresses = { 'O' }
tses = {'P'}
pfes = {'PF'}
isprefs = {'Y'}

def umls_atom_filter(x):
  """Return True when atom `x` should be kept, False otherwise.

  `x` must expose the attributes `lat`, `suppress`, `ts`, `stt` and `ispref`.
  The checks run in that order and stop at the first one that fails:

  - `lat` must be in `languages` (drops non-English atoms),
  - `suppress` must not be in `suppresses`,
  - `ts` must be in `tses`,
  - `stt` must be in `pfes`,
  - `ispref` must be in `isprefs`.
  """
  # TODO determine best way to filter atoms out
  return (
    x.lat in languages
    and x.suppress not in suppresses
    and x.ts in tses
    and x.stt in pfes
    and x.ispref in isprefs
  )

# Read every MRCONSO atom accepted by umls_atom_filter into memory.
umls_atoms = list(
  read_umls(
    umls_conso,
    umls.UmlsAtom,
    umls_filter=umls_atom_filter
  )
)
# A bare `len(umls_atoms)` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is echoed), so print the count instead.
print(f'atoms kept: {len(umls_atoms)}')

# Group the kept atoms by their CUI.
atoms_dict = defaultdict(list)
for atom in umls_atoms:
  atoms_dict[atom.cui].append(atom)

In [4]:
# Number of kept atoms per CUI: smallest, largest and mean group size,
# followed by the number of distinct CUIs.
counts = [len(cui_atoms) for cui_atoms in atoms_dict.values()]
print(f'min: {min(counts)}')
print(f'max: {max(counts)}')
print(f'avg: {np.mean(counts)}')
print(len(atoms_dict))

min: 1
max: 1
avg: 1.0
3285966


In [5]:
# Read every MRDEF row (no filter is passed) and group the definitions by CUI.
def_reader = read_umls(umls_def, umls.UmlsDefinition)
umls_defs = list(def_reader)
def_dict = defaultdict(list)
for d in umls_defs:
  def_dict[d.cui].append(d)

# Number of definitions per CUI: smallest, largest and mean group size,
# followed by the number of CUIs that have at least one definition.
counts = [len(cui_defs) for cui_defs in def_dict.values()]
print(f'min: {min(counts)}')
print(f'max: {max(counts)}')
print(f'avg: {np.mean(counts)}')
print(len(def_dict))

min: 1
max: 25
avg: 1.3455248460547042
209490


In [6]:
def wrap(s, w):
    """Cut `s` into consecutive pieces of at most `w` characters.

    The cut is purely positional (words may be split). Every piece has
    exactly `w` characters except possibly the last; an empty `s` yields
    an empty list.
    """
    pieces = []
    for start in range(0, len(s), w):
        pieces.append(s[start:start + w])
    return pieces
  
def display_cui(cui):
  """Print a one-line header for `cui` followed by its definitions.

  The header is `<string> (<cui>)`, taken from the first atom stored under
  `cui` in `atoms_dict`; when no atom is stored the header is
  `UNKNOWN (<cui>)`. Every definition found in `def_dict` for that CUI is
  then printed in 80-character pieces, each prefixed with the definition's
  `sab`, with a blank line after each definition.
  """
  # Use .get() instead of [] for both lookups: indexing a defaultdict with a
  # missing key inserts an empty list, which would silently grow atoms_dict /
  # def_dict (and change their len()) every time an unknown CUI is displayed.
  c_atoms = atoms_dict.get(cui, [])
  if c_atoms:
    c_string = c_atoms[0].string
    c_cui = c_atoms[0].cui
  else:
    c_string = 'UNKNOWN'
    c_cui = cui
  print(f'{c_string} ({c_cui})')
  for definition in def_dict.get(c_cui, []):
    for w_str in wrap(definition.definition, 80):
      print(f'  - {definition.sab}: {w_str}')
    print()
    
def display_rel(rel):
  """Print a relation as three parts separated by blank lines.

  First the concept `rel.cui2` (via display_cui), then a line of the form
  `rela (rel)(stype2->stype1)(sab)`, then the concept `rel.cui1`.
  """
  display_cui(rel.cui2)
  print()
  header = f'{rel.rela} ({rel.rel})({rel.stype2}->{rel.stype1})({rel.sab})'
  # end='\n\n' emits the header's newline plus the blank separator line.
  print(header, end='\n\n')
  display_cui(rel.cui1)
  

In [71]:
# keep_rels = {'RO'} # , 'RB', 'RN', 'RU'
# skip_relas = {} #{'exhibited_by', 'measured_by', 'measures', 'exhibits'}
# must_have_rela = True
# must_have_def = True # def_dict[d.cui]
# seen_relas = set()
# def umls_rel_filter(x):
#   # remove recursive relations
#   if x.cui2 == x.cui1:
#     return False
#   if x.rel not in keep_rels:
#     return False
#   if not x.rela and must_have_rela:
#     return False
#   if x.rela in skip_relas:
#     return False
#   # looking for unique relas 
#   if x.rela in seen_relas:
#     return False
#   if must_have_def and (x.cui2 not in def_dict or x.cui1 not in def_dict):
#     return False
#   #seen_relas.add(x.rela)
#   return True
#   if x.rela in skip_relas:
#     return False
#   if x.rel in interesting_rels:
#     if len(x.rela) > 0:
#       return True
#     else:
#       return True
#   if x.rela in interesting_relas:
#     return True
#   return False
def umls_rel_filter(x):
  """Keep a relation only if its `rel` is 'CHD', its `rela` is 'has_parent'
  and it links two different CUIs (`cui1 != cui2`)."""
  return x.rel == 'CHD' and x.cui1 != x.cui2 and x.rela == 'has_parent'

In [72]:
# Iterator over the MRREL relations accepted by umls_rel_filter; the cells
# below pull rows from it with next() / itertools.islice.
rel_iter = read_umls(umls_rels, umls.UmlsRelation, umls_filter=umls_rel_filter)


# rela_counter = Counter()
# rela_examples = defaultdict(list)
# example_count = 3
# total_count = 0
# for rel in rel_iter:
#   rela_counter[rel.rela] += 1
#   if len(rela_examples[rel.rela]) < 3:
#     rela_examples[rel.rela].append(rel)
#   total_count += 1

# print(f'{total_count} rels match filter.')

In [73]:
# Show the next relation from rel_iter. Passing a default to next() avoids a
# bare StopIteration traceback when the iterator has nothing left to give
# (nothing matches the filter, or this cell was re-run until it ran dry).
rel = next(rel_iter, None)
if rel is None:
  print('rel_iter is exhausted: no (more) relations match the filter.')
else:
  display_rel(rel)

Bovine Intestinal Adenosine Deaminase [EPC] (C2266954)

has_parent (CHD)(SCUI->SCUI)(MED-RT)

ADENOSINE DEAMINASE (C0001457)
  - MSH: An enzyme that catalyzes the hydrolysis of ADENOSINE to INOSINE with the elimina
  - MSH: tion of AMMONIA.

  - NCI: Adenosine deaminase (363 aa, ~41 kDa) is encoded by the human ADA gene. This pro
  - NCI: tein plays a role in purine metabolism.



In [None]:
top_n = 30          # report the top_n most frequent rela values ...
top_n_skip = 0      # ... after skipping this many from the top
nrof_examples = 1   # example relations printed per rela

# In the part of the notebook visible here, rela_counter / rela_examples are
# only assigned inside commented-out code, so this cell raised NameError on a
# fresh kernel. Build them here from a separate read_umls pass, which leaves
# the shared rel_iter untouched. NOTE: this walks every relation that passes
# umls_rel_filter, so it can take a while on the full MRREL file.
rela_counter = Counter()
rela_examples = defaultdict(list)
for counted_rel in read_umls(umls_rels, umls.UmlsRelation, umls_filter=umls_rel_filter):
  rela_counter[counted_rel.rela] += 1
  if len(rela_examples[counted_rel.rela]) < nrof_examples:
    rela_examples[counted_rel.rela].append(counted_rel)

# One section per rela: its frequency followed by up to nrof_examples examples.
for rela, rela_count in rela_counter.most_common(top_n)[top_n_skip:]:
  print('===================================')
  examples = rela_examples[rela]
  print(f'{rela}: {rela_count}')
  print('Examples:')
  for example in examples[:nrof_examples]:
    print('-----------------------------------')
    display_rel(example)
    print('-----------------------------------')
  print('===================================')
    

In [85]:
# Rebind rel_iter to a new read_umls(...) call over MRREL with umls_rel_filter,
# replacing whatever iterator the name pointed to before.
rel_iter = read_umls(umls_rels, umls.UmlsRelation, umls_filter=umls_rel_filter)

In [95]:
# Pull up to rel_count relations from rel_iter and print each one followed by
# a separator line. islice advances rel_iter, so running this cell again
# continues with the relations after the ones already taken.
rel_count = 10
rels = list(itertools.islice(rel_iter, rel_count))

for rel in rels:
  display_rel(rel)
  print('===================================')


Deprecated 17-Ketogenic steroids:Mass Concentration:Point in time:Whole blood:Quantitative (C2713054)

measures (RO)(SCUI->SCUI)(LNC)

17-Hydroxycorticosteroids (C0000163)
  - MSH: A group of hydroxycorticosteroids bearing a hydroxy group at the 17-position. Ur
  - MSH: inary excretion of these compounds is used as an index of adrenal function. They
  - MSH:  are used systemically in the free alcohol form, but with esterification of the 
  - MSH: hydroxy groups, topical effectiveness is increased.

17-Ketogenic steroids^pre dose dexamethasone:MCnc:24H:Urine:Qn (C2735891)

measures (RO)(SCUI->SCUI)(LNC)

17-Hydroxycorticosteroids (C0000163)
  - MSH: A group of hydroxycorticosteroids bearing a hydroxy group at the 17-position. Ur
  - MSH: inary excretion of these compounds is used as an index of adrenal function. They
  - MSH:  are used systemically in the free alcohol form, but with esterification of the 
  - MSH: hydroxy groups, topical effectiveness is increased.

17-Ketogenic steroid