In [73]:
import sys
from os import listdir

from xml.dom.minidom import parse

from util.deptree import *

datadir = './data/train/'

# One list per DDI type plus two aggregate lists. Every entry is the dependency
# analysis of a sentence, appended once per matching <pair>; a sentence holding
# several pairs is therefore stored several times (possibly in several lists).
advise = []
effect = []
mechanism = []
interaction = []
with_tag = []
without_tag = []

# Routes the "type" attribute of an interacting pair to the list collecting it.
lists_by_type = {
    "int": interaction,
    "advise": advise,
    "mechanism": mechanism,
    "effect": effect,
}

# process each file in directory
# (sorted: listdir() returns names in arbitrary order, which would make the
# order of the collected lists differ from one run to the next)
for f in sorted(listdir(datadir)):

    # parse XML file, obtaining a DOM tree
    tree = parse(datadir + "/" + f)

    # process each sentence in the file
    for s in tree.getElementsByTagName("sentence"):
        stext = s.attributes["text"].value  # get sentence text

        # load sentence entities
        entities = {}
        for e in s.getElementsByTagName("entity"):
            # named `eid` rather than `id` so the builtin id() is not
            # overwritten for the rest of the notebook session
            eid = e.attributes["id"].value
            offs = e.attributes["charOffset"].value.split("-")
            entities[eid] = {'start': int(offs[0]), 'end': int(offs[-1])}

        # there are no entity pairs, skip sentence
        if len(entities) <= 1:
            continue

        # analyze sentence (once per sentence; the same object is shared by
        # every list it is appended to below)
        analysis = deptree(stext)

        # for each pair in the sentence, file the analysis under its ground truth
        for p in s.getElementsByTagName("pair"):
            if p.attributes["ddi"].value == "true":
                dditype = p.attributes["type"].value
                # a type outside the four known ones is only counted in with_tag
                if dditype in lists_by_type:
                    lists_by_type[dditype].append(analysis)
                with_tag.append(analysis)
            else:
                without_tag.append(analysis)

In [43]:
from collections import Counter

In [100]:
def collect_verbs(trees):
    """Gather the lemma of every verb token found in the given analyses.

    A node counts as a verb when its 'tag' value contains "VB". Lemmas are
    returned in traversal order (tree by tree, node by node), with repeats.

    Args:
        trees: iterable of objects exposing ``get_nodes()`` and a
            ``tree.nodes`` mapping from node id to an attribute dict with
            'tag' and 'lemma' entries.

    Returns:
        list of lemmas, one per verb token.
    """
    return [
        analysis.tree.nodes[node_id]['lemma']
        for analysis in trees
        for node_id in analysis.get_nodes()
        if "VB" in analysis.tree.nodes[node_id]['tag']
    ]

In [156]:
# Exploratory lookup of the Counter API; nothing below depends on this cell,
# so it can be dropped once the notebook is finalised.
help(Counter)

Help on class Counter in module collections:

class Counter(builtins.dict)
 |  Counter(iterable=None, /, **kwds)
 |  
 |  Dict subclass for counting hashable items.  Sometimes called a bag
 |  or multiset.  Elements are stored as dictionary keys and their counts
 |  are stored as dictionary values.
 |  
 |  >>> c = Counter('abcdeabcdabcaba')  # count elements from a string
 |  
 |  >>> c.most_common(3)                # three most common elements
 |  [('a', 5), ('b', 4), ('c', 3)]
 |  >>> sorted(c)                       # list all unique elements
 |  ['a', 'b', 'c', 'd', 'e']
 |  >>> ''.join(sorted(c.elements()))   # list elements with repetitions
 |  'aaaaabbbbcccdde'
 |  >>> sum(c.values())                 # total of all counts
 |  15
 |  
 |  >>> c['a']                          # count of letter 'a'
 |  5
 |  >>> for elem in 'shazam':           # update counts from an iterable
 |  ...     c[elem] += 1                # by adding 1 to each element's count
 |  >>> c['a']                

In [168]:
def find_counts(verbs):
    """Tally a list of verbs.

    Args:
        verbs: sized collection of hashable items (it is both counted and
            measured with ``len``).

    Returns:
        A pair ``(pairs, total)`` where ``pairs`` is a list of
        ``(item, occurrences)`` tuples in first-seen order and ``total`` is
        ``len(verbs)``.
    """
    tally = Counter(verbs)
    pairs = [(item, occurrences) for item, occurrences in tally.items()]
    total = len(verbs)
    return pairs, total

In [164]:
# Per-lemma verb counts and the total number of verb tokens over `with_tag`.
# `with_tag` holds one entry per interacting pair, so a sentence with several
# interacting pairs contributes its verbs several times.
find_counts(collect_verbs(with_tag))

([('result', 254),
  ('be', 3041),
  ('observe', 75),
  ('combine', 37),
  ('have', 827),
  ('inhibit', 161),
  ('add', 23),
  ('find', 91),
  ('block', 64),
  ('induce', 111),
  ('depress', 17),
  ('enhance', 204),
  ('attenuate', 10),
  ('administer', 388),
  ('receive', 244),
  ('interact', 169),
  ('decrease', 324),
  ('produce', 125),
  ('increase', 717),
  ('report', 252),
  ('cause', 192),
  ('alert', 2),
  ('prevent', 15),
  ('provide', 5),
  ('suggest', 75),
  ('mediate', 8),
  ('indicate', 47),
  ('exhibit', 27),
  ('interfere', 75),
  ('give', 131),
  ('conclude', 6),
  ('act', 14),
  ('expect', 123),
  ('prolong', 167),
  ('know', 85),
  ('ingest', 4),
  ('appear', 22),
  ('extend', 5),
  ('tolerate', 4),
  ('discontinue', 31),
  ('contraindicate', 34),
  ('show', 84),
  ('eliminate', 19),
  ('see', 32),
  ('intoxicate', 1),
  ('treat', 76),
  ('derive', 3),
  ('convert', 19),
  ('catalyze', 1),
  ('affect', 107),
  ('restrict', 1),
  ('reverse', 8),
  ('metabolize', 87),
 

In [146]:
# Verbs from sentences that contain an interacting pair.
wt_verbs = collect_verbs(with_tag)
# NOTE(review): `find_most_common` is not defined in any cell visible here;
# on a fresh kernel this line presumably raises NameError unless the function
# is defined elsewhere in the notebook. TODO confirm or restore its definition.
wt_most_common = find_most_common(wt_verbs)

In [147]:
# Verbs from sentences that contain a non-interacting pair.
wtt_verbs = collect_verbs(without_tag)
# NOTE(review): `find_most_common` is not defined in any cell visible here;
# verify it exists elsewhere in the notebook before relying on this cell.
wtt_most_common = find_most_common(wtt_verbs)

In [234]:
def find_cluess(verbs_counts, num_verbs, wtt_counts, wtt_num_verbs, threshold=0.9):
    """Pick the words that are far more frequent in one count set than in another.

    Counts are first turned into relative frequencies by dividing by the
    matching total. A word is reported when

        1 - (wtt_frequency / frequency) > threshold

    i.e. with the default threshold, when its relative frequency in the
    second set is below roughly a tenth of its frequency in the first.

    Only words present in BOTH sets are considered: a word that never occurs
    in ``wtt_counts`` is skipped, however frequent it is in ``verbs_counts``.

    Args:
        verbs_counts: iterable of ``(word, count)`` pairs for the set of interest.
        num_verbs: total used to normalise ``verbs_counts``.
        wtt_counts: iterable of ``(word, count)`` pairs for the reference set.
        wtt_num_verbs: total used to normalise ``wtt_counts``.
        threshold: cutoff for the score above; defaults to 0.9.

    Returns:
        list of qualifying words, in the order they appear in ``verbs_counts``.

    Raises:
        ZeroDivisionError: if a total is 0 while its count list is non-empty,
            or if a shared word has a count of 0 in ``verbs_counts``.
    """
    freqs = {word: count / num_verbs for word, count in verbs_counts}
    wtt_freqs = {word: count / wtt_num_verbs for word, count in wtt_counts}

    return [
        word
        for word, freq in freqs.items()
        if word in wtt_freqs and 1 - (wtt_freqs[word] / freq) > threshold
    ]

In [235]:
def find_clues(wt, wtt):
    """Return the clue verbs of ``wt`` measured against ``wtt``.

    For each collection of analyses, the verbs are collected and tallied; the
    two (counts, total) results are then handed to ``find_cluess`` in the
    order: counts and total for ``wt``, counts and total for ``wtt``.
    """
    tallies = []
    for analyses in (wt, wtt):
        # find_counts yields (counts, total); flatten both into one argument list
        tallies.extend(find_counts(collect_verbs(analyses)))
    return find_cluess(*tallies)

In [236]:
# Verbs present in both groups whose relative frequency among `without_tag`
# sentences is under a tenth of their relative frequency among `with_tag`
# sentences (any interaction type).
find_clues(with_tag, without_tag)

['ingest',
 'present',
 'stimulate',
 'augment',
 'anaesthetise',
 'switch',
 'exaggerate',
 'wait',
 'warn',
 'alkalinize',
 'fold',
 'vasoconstrict',
 'lengthen',
 'progress',
 'depolarise',
 'propose',
 'isrecommend',
 'postmarket']

In [238]:
# Same comparison, restricted to sentences holding a pair of type "advise".
find_clues(advise, without_tag)

['present',
 'advise',
 'undertake',
 'switch',
 'seem',
 'dictate',
 'select',
 'bear',
 'hear',
 'accord',
 'wait',
 'threaten',
 'warn',
 'deplete',
 'exert',
 'continue',
 'exceed',
 'conflict',
 'outweigh',
 'anaesthetise',
 'precipitate',
 'watch',
 'function',
 'beadminister',
 'isrecommend',
 'interrupt']

In [239]:
# Same comparison, restricted to sentences holding a pair of type "mechanism".
find_clues(mechanism, without_tag)

['ingest',
 'phosphorylate',
 'correspond',
 'react',
 'match',
 'present',
 'share',
 'promote',
 'empty',
 'evidence',
 'anticipate',
 'alkalinize',
 'market',
 'fold',
 'hydroxylate',
 'fall',
 'propose',
 'threaten',
 'average',
 'represent',
 'desire',
 'leave',
 'denote',
 'last']

In [240]:
# Same comparison, restricted to sentences holding a pair of type "effect".
find_clues(effect, without_tag)

['present',
 'regulate',
 'counteract',
 'antagonize',
 'stimulate',
 'protect',
 'augment',
 'cecectomize',
 'blunt',
 'exaggerate',
 'necessitate',
 'expose',
 'term',
 'halogenate',
 'sensitize',
 'vasoconstrict',
 'lengthen',
 'shorten',
 'describe',
 'recognize',
 'progress',
 'depolarise',
 'weaken',
 'propose',
 'postmarket',
 'stand']

In [241]:
# Same comparison, restricted to sentences holding a pair of type "int".
find_clues(interaction, without_tag)

['anaesthetise',
 'exist',
 'pose',
 'depend',
 'threaten',
 'nondepolarize',
 'kill']

['antagonize',
 'advise',
 'retard',
 'anticipate',
 'develop',
 'depress',
 'achieve',
 'experience',
 'prevent',
 'acidify',
 'act',
 'exceed',
 'displace',
 'start',
 'elevate',
 'augment',
 'establish',
 'expose']

In [58]:
# NOTE(review): the saved output below lists inflected forms ('are', 'been'),
# whereas collect_verbs as defined in this notebook gathers lemmas; the output
# looks like it predates that definition. Re-run to confirm.
wtt_most_common

[('are', 5123),
 ('have', 4335),
 ('following', 3904),
 ('been', 3440),
 ('be', 3109),
 ('prolonged', 2880),
 ('is', 2824),
 ('increased', 2738),
 ('include', 2728),
 ('increase', 2722),
 ('interact', 1921),
 ('decreased', 1733),
 ('were', 1551),
 ('expected', 1472),
 ('administered', 1469),
 ('was', 1459),
 ('found', 1447),
 ('affecting', 1433),
 ('used', 1282),
 ('including', 1190),
 ('has', 1133),
 ('classified', 949),
 ('containing', 947),
 ('reported', 929),
 ('given', 721),
 ('taking', 721),
 ('receiving', 684),
 ('result', 631),
 ('did', 620),
 ('affect', 589),
 ('decrease', 574),
 ('metabolized', 571),
 ('reduce', 508),
 ('inhibit', 493),
 ('had', 477),
 ('coadministered', 465),
 ('known', 441),
 ('cause', 434),
 ('enhance', 430),
 ('demonstrated', 414),
 ('evaluating', 397),
 ('blocking', 396),
 ('interfere', 361),
 ('observed', 349),
 ('shown', 348),
 ('reduced', 347),
 ('recommended', 332),
 ('Based', 326),
 ('potentiated', 322),
 ('occur', 320),
 ('using', 310),
 ('needed',

In [59]:
# NOTE(review): the saved output below lists inflected forms ('is', 'been'),
# whereas collect_verbs as defined in this notebook gathers lemmas; the output
# looks like it predates that definition. Re-run to confirm.
wt_most_common

[('be', 956),
 ('is', 680),
 ('are', 562),
 ('been', 537),
 ('have', 501),
 ('administered', 354),
 ('increased', 344),
 ('increase', 303),
 ('has', 280),
 ('reported', 249),
 ('including', 233),
 ('receiving', 222),
 ('used', 217),
 ('following', 217),
 ('include', 197),
 ('decreased', 195),
 ('was', 175),
 ('interact', 169),
 ('containing', 163),
 ('enhance', 157),
 ('reduce', 141),
 ('result', 134),
 ('cause', 123),
 ('taking', 122),
 ('prolonged', 120),
 ('expected', 111),
 ('given', 110),
 ('inhibit', 109),
 ('decrease', 109),
 ('were', 106),
 ('recommended', 101),
 ('coadministered', 93),
 ('found', 91),
 ('potentiated', 87),
 ('known', 85),
 ('metabolized', 81),
 ('reduced', 81),
 ('nondepolarizing', 79),
 ('shown', 77),
 ('associated', 77),
 ('observed', 75),
 ('treated', 73),
 ('potentiate', 69),
 ('produce', 67),
 ('taken', 64),
 ('interfere', 63),
 ('induced', 62),
 ('monitored', 60),
 ('occur', 59),
 ('resulted', 58),
 ('affecting', 56),
 ('demonstrated', 53),
 ('suggest', 

In [30]:
# Inspect the attribute dict of node 1 of one analysis.
# NOTE(review): `t` is not assigned at top level in any cell visible here
# (only as a loop variable inside collect_verbs); presumably left over from an
# earlier session. TODO define it explicitly, e.g. from one of the lists above.
t.tree.nodes[1]

{'address': 1,
 'word': 'The',
 'lemma': 'the',
 'ctag': 'DT',
 'tag': 'DT',
 'feats': '_',
 'head': 3,
 'deps': defaultdict(list, {}),
 'rel': 'det',
 'start': 0,
 'end': 2}

In [27]:
# List the node ids of the analysis `t`.
# NOTE(review): `t` is not assigned in any visible cell; verify before re-running.
print(t.get_nodes())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]


In [18]:
# Exploratory lookup of the deptree API.
# NOTE(review): `t` is not assigned in any visible cell; verify before re-running.
help(t)

Help on deptree in module util.deptree object:

class deptree(builtins.object)
 |  deptree(txt)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, txt)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  get_LCS(self, n1, n2)
 |      ## --------------------------------------------------------------
 |      ## return the Lowest Common Subsumer of two nodes
 |  
 |  get_ancestors(self, n)
 |      ## --------------------------------------------------------------
 |      ## return the list of ancestors of a node
 |  
 |  get_children(self, n)
 |      ## --------------------------------------------------------------
 |      ## return the children of a node
 |  
 |  get_down_path(self, n1, n2)
 |      ## --------------------------------------------------------------
 |      ## get downwards path from n1 to n2 (return list of node ids, downwards, excluding n1)
 |  
 |  get_fragment_head(self, start, end)
 |      ## ----------------------------------------------