In [62]:
import os
import numpy as np
from pathlib import Path
import json
from collections import defaultdict

In [112]:
# all files which end in .rel in this folder
label_dir='/home/max/data/artifacts/i2b2/2010/new_data/'

def load_files(folder):
  return [os.path.join(label_dir, folder, f) for f in os.listdir(os.path.join(label_dir, folder)) if os.path.isfile(os.path.join(label_dir, folder, f))]
train_files = load_files('train')
print(len(train_files))
dev_files = load_files('dev')
print(len(dev_files))
test_files = load_files('test')
print(len(test_files))
files = train_files + dev_files + test_files
print(len(files))

153
17
256
426


In [113]:
class Concept:
  def __init__(self, line):
    span_info, type_info, attr_info = line.strip().split('||')
    fields = span_info.split(' ')
    start, end = fields[-2], fields[-1]
    start = start.split(':')
    end = end.split(':')
    assert start[0] == end[0], print(span_info)
    self.sentence_idx = int(start[0]) - 1
    self.start = int(start[1])
    self.end = int(end[1])
    self.type = type_info.strip()[3:-1]
    self.assertion = attr_info.strip()[3:-1]

  def __eq__(self, other):
    return self.sentence_idx == other.sentence_idx and \
           self.start == other.start and \
           self.end == other.end and \
           self.type == other.type 

  def __hash__(self):
    return hash((self.sentence_idx, self.start, self.end, self.type))

  def __str__(self):
    return f"[sid: {self.sentence_idx}, ({self.start}, {self.end}), type={self.type}]"
  
class Relation:
  def __init__(self, line):
    arg1, type_info, arg2 = line.split('||')
    self.arg1 = Concept(arg1 + '||t="dummy"||a="dummy"')
    self.arg2 = Concept(arg2 + '||t="dummy"||a="dummy"')
    self.type = type_info[3:-1]
    assert self.arg1.sentence_idx == self.arg2.sentence_idx, print('Cross-sentence relation!: ' + line)
    self.sentence_idx = self.arg1.sentence_idx


In [115]:
relations = []
rel_types = set()
def get_concepts(txtfile):
  txtfile = Path(txtfile)
  name = txtfile.name.replace('.txt', '')
  confile = os.path.join(label_dir, 'all', (name + '.con'))
  astfile = os.path.join(label_dir, 'all', (name + '.ast'))
  relfile = os.path.join(label_dir, 'all', (name + '.rel'))
  if not os.path.exists(confile):
    return [], 0, 0
  problems = [Concept(line) for line in open(astfile)]
  other_concepts = [Concept(line.strip() + '||a="present"') for line in open(confile)]
  concepts = list(set(problems + other_concepts))
  sentences = [[w for w in line.strip().split(' ') if len(w.strip()) > 0]
                 for s, line in enumerate(open(str(txtfile)))]
  concept_text = []
  ss = defaultdict(int)
  for concept in concepts:
    txt = ' '.join(sentences[concept.sentence_idx][concept.start:concept.end+1])
    concept_text.append(txt)
    ss[concept.sentence_idx] += 1
  if os.path.exists(relfile):
    rels = [Relation(line) for line in open(str(relfile))]
    for r in rels:
      rel_types.add(r.type)
      arg1 = ' '.join(sentences[r.arg1.sentence_idx][r.arg1.start:r.arg1.end+1])
      arg2 = ' '.join(sentences[r.arg2.sentence_idx][r.arg2.start:r.arg2.end+1])
      
      relations.append(f'({arg1}) {r.type} ({arg2})')
  return (concept_text, len([(s_id, s_len) for (s_id, s_len) in ss.items() if s_len > 1]), len(sentences))
ctxt2id = {}
nrof_empty = 0
nrof_files = 0
s_count = 0
a_count = 0
for file in files:
  f_concepts, s_c, a_c = get_concepts(file)
  s_count += s_c
  nrof_files += 1
  a_count += a_c
  if len(f_concepts) < 2:
    nrof_empty += 1
  for ctxt in f_concepts:
    if ctxt not in ctxt2id:
      ctxt2id[ctxt] = len(ctxt2id)
print(len(ctxt2id))
print(nrof_files)
print(nrof_empty)
print(s_count)
print(a_count)

21638
426
1
10962
43941


In [116]:
for relation in relations:
  print(relation)

(Ampicillin) TrAP (sepsis risk factor)
(Gentamycin) TrAP (sepsis risk factor)
(Influenza immunization) TrAP (chronic lung disease)
(cesarean section) TrAP (progressive preterm labor)
(cesarean section) TrAP (rupture of membranes)
(immunization) TrAP (influenza)
(bilateral lower extremeties) TeRP (DVT)
(symptoms) PIP (trauma)
(CT Chest) TeRP (bilateral segmental , subsegmental PEs)
(ateletasis of lingula) PIP (pneumonitis)
(a d-dimer) TeRP (elevated)
(laxative abuse) PIP (Chronic constipation)
(CTA) TeRP (bilateral pulmonary embolisms)
(patient 's urinalysis) TeRP (UTI)
(ABG) TeRP (widened A-a gradient)
(coumadin) TrAP (clots)
(ASA) TeRP (tricyclic)
(antibiotics) TrAP (urinary tract infection)
(levofloxacin) TrAP (UTI)
(examination) TeRP (apparent distress)
(her kidney transplantation) TrCP (a well healed left lower quadrant incision)
(cadaveric pancreas transplantation) TrCP (complication)
(kidney transplantation) TrAP (End stage renal disease)
(Her diabetes mellitus) PIP (retinopathy)

(lasix) TrAP (shortness of breath)
(slight worsening) PIP (the still mild pulmonary edema)
(PERCOCET) TrAP (PAIN)
(The MRI of your knee) TeRP (a meniscal tear)
(CT chest) TeRP (saddle PE)
(EF) TeRP (hyperdynamic)
(Left ventricular systolic function) TeRP (hyperdynamic)
(acute management) TrAP (PE)
(an injury) PIP (meniscal tear)
(physical exam) TeRP (an injury)
(physical exam) TeRP (meniscal tear)
(EKG) TeRP (ischemia)
(hemodynamic monitoring) TeCP (saddle pulmonary embolus)
(anticoagulation) TrAP (saddle pulmonary embolus)
(lysis) TrAP (hemodynamic instability)
(P-pulmonale) PIP (clot burden)
(anticoagulation) TrAP (saddle PE)
(ischemia) PIP (any wall motion abnormalities)
(a small troponin leak) PIP (right heart strain)
(a small troponin leak) PIP (P-pulmonale)
(a small troponin leak) PIP (clot burden)
(right heart strain) PIP (P-pulmonale)
(right heart strain) PIP (clot burden)
(An ECHO) TeRP (any wall motion abnormalities)
(VS) TeRP (tachy)
(Bucket handle medial meniscal tear) PIP 

(a stress test) TeRP (positive)
(diuresis) TrAP (rapid atrial fibrillation)
(percutaneous transluminal coronary angioplasty) TrAP (Coronary artery disease)
(percutaneous transluminal coronary angioplasty) TrAP (myocardial infarction)
(angiography) TeRP (a transient increase)
(angiography) TeRP (diaphoretic)
(A repeat chest x-ray) TeRP (that pneumothorax)
(Acetic acid) TrAP (his left foot ulcer)
(coronary artery bypass graft) TrAP (Coronary artery disease)
(coronary artery bypass graft) TrAP (myocardial infarction)
(tendonitis) PIP (left biceps pain)
(wet-to-dry dressing changes) TrAP (his left foot ulcer)
(intravenous Lopressor) TrAP (rapid atrial fibrillation)
(insulin) TrAP (Diabetes mellitus type 2)
(His chest tube) TrAP (a high chest tube output)
(an insulin drip) TrIP (his diabetes mellitus)
(Lopressor) TrIP (his recurrent atrial fibrillation)
(his blood pressure) TeRP (a transient increase)
(debridement) TrAP (Left foot ulcer)
(His pedal pulses) TeRP (absent)
(continued diuresis)

(metoprolol) TrIP (atrial fibrillation)
(The left ventriculography) TeRP (anterior akinesis)
(The left ventriculography) TeRP (apical severe hypokinesis)
(The left ventriculography) TeRP (a moderately depressed ejection fraction)
(a moderately depressed ejection fraction) PIP (anterior akinesis)
(a moderately depressed ejection fraction) PIP (apical severe hypokinesis)
(penicillin) TrCP (allergic)
(penicillin) TrCP (a rash)
(a rash) PIP (allergic)
(The electrocardiogram) TeRP (pseudonormalization of T waves)
(The electrocardiogram) TeRP (Q waves in V1)
(fresh frozen plasma) TrAP (bleeding)
(anticoagulation) TrAP (thrombosis)
(fresh frozen plasma) TrAP (increasing chest tube drainage)
(3-4 millimeter ST segment elevation in V1) PIP (depression in II , III , and AVF)
(The rectal examination) TeRP (trace guaiac positive)
(repeat cardiac catheterization) TeRP (circumflex lesion after the obtuse marginal 1)
(repeat cardiac catheterization) TeRP (distal posterior descending artery lesion)
(a

In [107]:
with open(os.path.join(label_dir, 'ctxt2id.json'), 'w') as f:
  json.dump(ctxt2id, f)

In [108]:
print(os.path.join(label_dir, 'ctxt2id.json'))

/home/max/data/artifacts/i2b2/2010/new_data/ctxt2id.json


In [109]:
ctxt2id

{'a car seat positioning test': 0,
 'total': 1,
 'Her respirations': 2,
 'Gentamycin': 3,
 'cesarean section': 4,
 'betamethasone': 5,
 'Cesarean section': 6,
 'phototherapy': 7,
 'Influenza immunization': 8,
 'influenza': 9,
 'Prematurity': 10,
 'A cerclage': 11,
 'rupture of membranes': 12,
 'progressive preterm labor': 13,
 'Apgars': 14,
 'her length': 15,
 'Ferinsol': 16,
 'Synagis RSV prophylaxis': 17,
 'full volume feedings': 18,
 'direct': 19,
 'State Screens': 20,
 'antibody': 21,
 'bradycardia': 22,
 'the birth head circumference': 23,
 'Sepsis': 24,
 'her head circumference': 25,
 'the hepatitis B vaccine': 26,
 'The antibiotics': 27,
 'respiratory distress syndrome': 28,
 'elemental iron': 29,
 'hepatitis B surface antigen': 30,
 'blood type': 31,
 'some occasional grunting': 32,
 'the last bilirubin': 33,
 'chronic lung disease': 34,
 'sepsis risk factor': 35,
 'Group B Streptococcus': 36,
 'moderate subcostal retractions': 37,
 'blood products': 38,
 'calorie enhanced brea

In [117]:
rel_types

{'PIP', 'TeCP', 'TeRP', 'TrAP', 'TrCP', 'TrIP', 'TrNAP', 'TrWP'}

In [129]:

def filter_inv_rels(relation2id):
  ignore_inv_relation2id = defaultdict(int)
  ignore_rel_id = 1
  for rel_name, rel_id in relation2id.items():
    if 'INV$' not in rel_name:
      if rel_name == 'NONE':
        ignore_inv_relation2id[rel_name] = 0
      else:
        ignore_inv_relation2id[rel_name] = ignore_rel_id
        ignore_rel_id += 1
  return ignore_inv_relation2id

with open(os.path.join(label_dir, 'info', 'relation2id.json')) as f:
  rels = json.load(f)['name2id']
  rels = filter_inv_rels(rels)

In [130]:
rels

defaultdict(int,
            {'NONE': 0,
             'PIP': 1,
             'TrAP': 2,
             'TeRP': 3,
             'TrCP': 4,
             'TrIP': 5,
             'TeCP': 6,
             'TrNAP': 7,
             'TrWP': 8})

In [131]:
# TODO find better or test out fully custom language model options here.
relid2txt = {
  0: 'not related', # no relation
  1: 'cause of',
  2: 'may treat', # treatment administered
  3: 'may diagnose', # test reveals
  4: 'cause of', # treatment causes
  5: 'may treat', # treatment improve
  6: 'may diagnose', # test investigates
  7: 'may treat', # treatment not administered
  8: 'may treat', # treatment worsen
}
with open(os.path.join(label_dir, 'relid2txt_basic.json'), 'w') as f:
  json.dump(relid2txt, f)
print(os.path.join(label_dir, 'relid2txt_basic.json'))

/home/max/data/artifacts/i2b2/2010/new_data/relid2txt_basic.json


In [132]:
relid2txt_custom = {
  0: 'not related', # no relation
  1: 'cause of', # med causes med
  2: 'administered for', # treatment administered
  3: 'reveals', # test reveals
  4: 'cause of', # treatment causes
  5: 'improves', # treatment improve
  6: 'investigates', # test investigates
  7: 'not administered because of', # treatment not administered
  8: 'worsens', # treatment worsen
}
with open(os.path.join(label_dir, 'relid2txt_custom.json'), 'w') as f:
  json.dump(relid2txt, f)
print(os.path.join(label_dir, 'relid2txt_custom.json'))

/home/max/data/artifacts/i2b2/2010/new_data/relid2txt_custom.json


In [136]:
relid2txt_def = {
  0: 'not related to', # no relation
  1: 'indicates', # med indicates med
  2: 'administered for', # treatment administered
  3: 'reveals', # test reveals
  4: 'causes', # treatment causes
  5: 'improves', # treatment improve
  6: 'investigates', # test investigates
  7: 'not administered because of', # treatment not administered
  8: 'worsens', # treatment worsen
}
with open(os.path.join(label_dir, 'relid2txt_def.json'), 'w') as f:
  json.dump(relid2txt, f)
print(os.path.join(label_dir, 'relid2txt_def.json'))

/home/max/data/artifacts/i2b2/2010/new_data/relid2txt_def.json


In [137]:
relid2txt_def_inv = {
  0: 'not related to', # no relation
  1: 'indicated by', # med indicates med
  2: 'treated by', # treatment administered
  3: 'revealed by', # test reveals
  4: 'caused by', # treatment causes
  5: 'improved by', # treatment improve
  6: 'investigated by', # test investigates
  7: 'not treated by', # treatment not administered
  8: 'worsened by', # treatment worsen
}
with open(os.path.join(label_dir, 'relid2txt_def_inv.json'), 'w') as f:
  json.dump(relid2txt, f)
print(os.path.join(label_dir, 'relid2txt_def_inv.json'))

/home/max/data/artifacts/i2b2/2010/new_data/relid2txt_def_inv.json
