In [1]:
import sys
import os

from xml.dom.minidom import parse
from nltk.tokenize import word_tokenize

In [2]:
# Directory whose files are parsed as XML further down in the notebook.
# The literal below is the original machine-specific location and is kept as
# the default; set the AHLT_DATA_PATH environment variable to point at another
# copy of the data without editing the notebook.
DATA_PATH = os.environ.get(
    'AHLT_DATA_PATH',
    '/home/bscuser/Documentos/second_term/AHLT/ahlt/data/Train/DrugBank',
)

In [8]:
# -------- classify_token ----------
# -- check if a token is a drug, and of which type

# Word endings that mark a token as a "drug". Entries may have any length;
# classify_token matches them with str.endswith.
suffixes = ['azole', 'amine', 'idine', 'mycin']


# cat datapath.xml | xmllint -format - | grep entity | awk '{print $(NF -1),$NF}'

def classify_token(txt):
    """Classify a single token with the baseline rules.

    Parameters:
        txt: the token text (a str).

    Returns:
        A pair (is_drug, type):
        - (True, "brand") when txt.isupper() is true,
        - (True, "drug") when txt ends with one of the entries in `suffixes`,
        - (False, "") otherwise.
    """
    # Baseline approach
    # Todo: Add more rules
    if txt.isupper():
        return True, "brand"
    # endswith accepts a tuple of candidates, so the check no longer depends
    # on every suffix being exactly five characters long.
    if txt.endswith(tuple(suffixes)):
        return True, "drug"
    return False, ""


# --------- tokenize sentence -----------
# -- Tokenize sentence, returning tokens and span offsets

# Tokens that NLTK's word_tokenize emits in place of a plain double quote (").
_QUOTE_TOKENS = ('``', "''")


def _locate_token(txt, token, offset):
    """Find where `token` occurs in `txt` at or after `offset`.

    For quote tokens the plain '"' character is tried as an alternative
    surface form, and the earliest match wins.

    Returns:
        (start, surface) where `surface` is the text actually present at
        `start`, or None when nothing matches.
    """
    candidates = (token, '"') if token in _QUOTE_TOKENS else (token,)
    best = None
    for cand in candidates:
        pos = txt.find(cand, offset)
        if pos != -1 and (best is None or pos < best[0]):
            best = (pos, cand)
    return best


def tokenize(txt):
    """Tokenize a sentence, returning tokens with their character spans.

    Parameters:
        txt: the sentence text (a str).

    Returns:
        A list of triples (word, start, end) where `start` and `end` are
        0-based offsets into `txt` and `end` is inclusive, so that
        txt[start:end + 1] == word.
    """
    offset = 0
    tks = []
    # word_tokenize splits words, taking into account punctuations, numbers, etc.
    for t in word_tokenize(txt):
        # keep track of the position where each token should appear, and
        # store that information with the token
        hit = _locate_token(txt, t, offset)
        if hit is None:
            # The token does not occur literally in the remaining text.
            # Skip it: storing find()'s -1 would reset the search position
            # and corrupt the spans of every following token.
            continue
        start, surface = hit
        tks.append((surface, start, start + len(surface) - 1))
        offset = start + len(surface)

    # tks is a list of triples (word,start,end)
    return tks


# --------- Entity extractor ----------- 
# -- Extract drug entities from given text and return them as
# -- a list of dictionaries with keys "offset", "text", and "type"

def extract_entities(stext):
    """Extract drug entities from a sentence.

    Parameters:
        stext: the sentence text (a str).

    Returns:
        A list of dicts, one per token that classify_token flags as a drug,
        with keys "offset" ("start-end", both inclusive), "text" (the
        characters of stext covered by that span) and "type".
    """
    # convert the sentence to a list of tokens
    tokens = tokenize(stext)

    # for each token, check whether it is a drug name or not
    result = []
    for tokenTxt, drug_start, drug_end in tokens:
        is_drug, drug_type = classify_token(tokenTxt)

        # Todo: Change this part to accumulate drugs that appear together
        if is_drug:
            result.append({
                "offset": str(drug_start) + "-" + str(drug_end),
                # drug_end is the index of the last character (inclusive),
                # so the slice has to run to drug_end + 1.
                "text": stext[drug_start:drug_end + 1],
                "type": drug_type,
            })

    return result

def parse_sentences(file_path):
    """Tokenize every <sentence> element of an XML document.

    Parameters:
        file_path: passed straight to xml.dom.minidom.parse.

    Returns:
        A dict mapping each sentence's "id" attribute to the result of
        tokenize() applied to its "text" attribute.
    """
    document = parse(file_path)
    tokens_by_id = {}
    for node in document.getElementsByTagName("sentence"):
        attrs = node.attributes
        sentence_id = attrs["id"].value
        sentence_text = attrs["text"].value
        tokens_by_id[sentence_id] = tokenize(sentence_text)
    return tokens_by_id


# process each file in directory
# os.listdir returns entries in arbitrary order, so sort them to make the
# printed output reproducible between runs; anything that is not a regular
# file (e.g. a sub-directory) is skipped because parse_sentences hands the
# path to the XML parser.
for f in sorted(os.listdir(DATA_PATH)):
    if not os.path.isfile(os.path.join(DATA_PATH, f)):
        continue
    s = parse_sentences(os.path.join(DATA_PATH, f))
    print(s)
    print()

{'DDI-DrugBank.d85.s0': [('The', 0, 2), ('use', 4, 6), ('of', 8, 9), ('NSAIDs', 11, 16), ('in', 18, 19), ('patients', 21, 28), ('who', 30, 32), ('are', 34, 36), ('receiving', 38, 46), ('ACE', 48, 50), ('inhibitors', 52, 61), ('may', 63, 65), ('potentiate', 67, 76), ('renal', 78, 82), ('disease', 84, 90), ('states', 92, 97), ('.', 98, 98)], 'DDI-DrugBank.d85.s1': [('In', 0, 1), ('vitro', 3, 7), ('studies', 9, 15), ('have', 17, 20), ('shown', 22, 26), ('that', 28, 31), ('naproxen', 33, 40), ('anion', 42, 46), (',', 47, 47), ('because', 49, 55), ('of', 57, 58), ('its', 60, 62), ('affinity', 64, 71), ('for', 73, 75), ('protein', 77, 83), (',', 84, 84), ('may', 86, 88), ('displace', 90, 97), ('from', 99, 102), ('their', 104, 108), ('binding', 110, 116), ('sites', 118, 122), ('other', 124, 128), ('drugs', 130, 134), ('which', 136, 140), ('are', 142, 144), ('also', 146, 149), ('albumin-bound', 151, 163), ('.', 165, 165)], 'DDI-DrugBank.d85.s2': [('Theoretically', 0, 12), (',', 13, 13), ('the'

{'DDI-DrugBank.d186.s0': [('An', 0, 1), ('encephalopathic', 3, 17), ('syndrome', 19, 26), ('(', 28, 28), ('characterized', 29, 41), ('by', 43, 44), ('weakness', 46, 53), (',', 54, 54), ('lethargy', 56, 63), (',', 64, 64), ('fever', 66, 70), (',', 71, 71), ('tremulousness', 73, 85), ('and', 87, 89), ('confusion', 91, 99), (',', 100, 100), ('extrapyramidal', 102, 115), ('symptoms', 117, 124), (',', 125, 125), ('leukocytosis', 127, 138), (',', 139, 139), ('elevated', 141, 148), ('serum', 150, 154), ('enzymes', 156, 162), (',', 163, 163), ('BUN', 165, 167), (',', 168, 168), ('and', 170, 172), ('FBS', 174, 176), (')', 177, 177), ('followed', 179, 186), ('by', 188, 189), ('irreversible', 191, 202), ('brain', 204, 208), ('damage', 210, 215), ('has', 217, 219), ('occurred', 221, 228), ('in', 230, 231), ('a', 233, 233), ('few', 235, 237), ('patients', 239, 246), ('treated', 248, 254), ('with', 256, 259), ('lithium', 261, 267), ('plus', 269, 272), ('HALDOL', 274, 279), ('.', 280, 280)], 'DDI-Dru

{'DDI-DrugBank.d29.s0': [('Esomeprazole', 0, 11), ('is', 13, 14), ('extensively', 16, 26), ('metabolized', 28, 38), ('in', 40, 41), ('the', 43, 45), ('liver', 47, 51), ('by', 53, 54), ('CYP2C19', 56, 62), ('and', 64, 66), ('CYP3A4', 68, 73), ('.', 74, 74)], 'DDI-DrugBank.d29.s1': [('In', 0, 1), ('vitro', 3, 7), ('and', 9, 11), ('in', 13, 14), ('vivo', 16, 19), ('studies', 21, 27), ('have', 29, 32), ('shown', 34, 38), ('that', 40, 43), ('esomeprazole', 45, 56), ('is', 58, 59), ('not', 61, 63), ('likely', 65, 70), ('to', 72, 73), ('inhibit', 75, 81), ('CYPs', 83, 86), ('1A2', 88, 90), (',', 91, 91), ('2A6', 93, 95), (',', 96, 96), ('2C9', 98, 100), (',', 101, 101), ('2D6', 103, 105), (',', 106, 106), ('2E1', 108, 110), ('and', 112, 114), ('3A4', 116, 118), ('.', 119, 119)], 'DDI-DrugBank.d29.s2': [('No', 0, 1), ('clinically', 3, 12), ('relevant', 14, 21), ('interactions', 23, 34), ('with', 36, 39), ('drugs', 41, 45), ('metabolized', 47, 57), ('by', 59, 60), ('these', 62, 66), ('CYP', 68,

{'DDI-DrugBank.d270.s0': [('Nevirapine', 0, 9), ('is', 11, 12), ('principally', 14, 24), ('metabolized', 26, 36), ('by', 38, 39), ('the', 41, 43), ('liver', 45, 49), ('via', 51, 53), ('the', 55, 57), ('cytochrome', 59, 68), ('P450', 70, 73), ('isoenzymes', 75, 84), (',', 85, 85), ('3A4', 87, 89), ('and', 91, 93), ('2B6', 95, 97), ('.', 98, 98)], 'DDI-DrugBank.d270.s1': [('Nevirapine', 0, 9), ('is', 11, 12), ('known', 14, 18), ('to', 20, 21), ('be', 23, 24), ('an', 26, 27), ('inducer', 29, 35), ('of', 37, 38), ('these', 40, 44), ('enzymes', 46, 52), ('.', 53, 53)], 'DDI-DrugBank.d270.s2': [('As', 0, 1), ('a', 3, 3), ('result', 5, 10), (',', 11, 11), ('drugs', 13, 17), ('that', 19, 22), ('are', 24, 26), ('metabolized', 28, 38), ('by', 40, 41), ('these', 43, 47), ('enzyme', 49, 54), ('systems', 56, 62), ('may', 64, 66), ('have', 68, 71), ('lower', 73, 77), ('than', 79, 82), ('expected', 84, 91), ('plasma', 93, 98), ('levels', 100, 105), ('when', 107, 110), ('coadministered', 112, 125), ('

{'DDI-DrugBank.d94.s0': [('Clinically', 0, 9), ('meaningful', 11, 20), ('drug', 22, 25), ('interactions', 27, 38), ('have', 40, 43), ('occurred', 45, 52), ('with', 54, 57), ('concomitant', 59, 69), ('medications', 71, 81), ('and', 83, 85), ('include', 87, 93), (',', 94, 94), ('but', 96, 98), ('are', 100, 102), ('not', 104, 106), ('limited', 108, 114), ('to', 116, 117), ('the', 119, 121), ('following', 123, 131), (':', 132, 132), ('Agents', 134, 139), ('Highly', 141, 146), ('Bound', 148, 152), ('to', 154, 155), ('Plasma', 157, 162), ('Protein', 164, 170), ('Carbamazepine', 172, 184), ('is', 186, 187), ('not', 189, 191), ('highly', 193, 198), ('bound', 200, 204), ('to', 206, 207), ('plasma', 209, 214), ('proteins', 216, 223), (';', 224, 224)], 'DDI-DrugBank.d94.s1': [('therefore', 0, 8), (',', 9, 9), ('administration', 11, 24), ('of', 26, 27), ('EQUETROTM', 29, 37), ('to', 39, 40), ('a', 42, 42), ('patient', 44, 50), ('taking', 52, 57), ('another', 59, 65), ('drug', 67, 70), ('that', 72,

{'DDI-DrugBank.d506.s0': [('If', 0, 1), ('phenytoin', 3, 11), ('or', 13, 14), ('other', 16, 20), ('hepatic', 22, 28), ('enzyme', 30, 35), ('inducers', 37, 44), ('are', 46, 48), ('taken', 50, 54), ('concurrently', 56, 67), ('with', 69, 72), ('Norpace', 74, 80), ('or', 82, 83), ('Norpace', 85, 91), ('CR', 93, 94), (',', 95, 95), ('lower', 97, 101), ('plasma', 103, 108), ('levels', 110, 115), ('of', 117, 118), ('disopyramide', 120, 131), ('may', 133, 135), ('occur', 137, 141), ('.', 142, 142)], 'DDI-DrugBank.d506.s1': [('Monitoring', 0, 9), ('of', 11, 12), ('disopyramide', 14, 25), ('plasma', 27, 32), ('levels', 34, 39), ('is', 41, 42), ('recommended', 44, 54), ('in', 56, 57), ('such', 59, 62), ('concurrent', 64, 73), ('use', 75, 77), ('to', 79, 80), ('avoid', 82, 86), ('ineffective', 88, 98), ('therapy', 100, 106), ('.', 107, 107)], 'DDI-DrugBank.d506.s2': [('Other', 0, 4), ('antiarrhythmic', 6, 19), ('drugs', 21, 25), ('(', 27, 27), ('eg', 28, 29), (',', 30, 30), ('quinidine', 32, 40), 

{'DDI-DrugBank.d437.s0': [('Amprenavir', 0, 9), ('is', 11, 12), ('metabolized', 14, 24), ('in', 26, 27), ('the', 29, 31), ('liver', 33, 37), ('by', 39, 40), ('the', 42, 44), ('cytochrome', 46, 55), ('P450', 57, 60), ('enzyme', 62, 67), ('system', 69, 74), ('.', 75, 75)], 'DDI-DrugBank.d437.s1': [('Amprenavir', 0, 9), ('inhibits', 11, 18), ('CYP3A4', 20, 25), ('.', 26, 26)], 'DDI-DrugBank.d437.s2': [('Caution', 0, 6), ('should', 8, 13), ('be', 15, 16), ('used', 18, 21), ('when', 23, 26), ('coadministering', 28, 42), ('medications', 44, 54), ('that', 56, 59), ('are', 61, 63), ('substrates', 65, 74), (',', 75, 75), ('inhibitors', 77, 86), (',', 87, 87), ('or', 89, 90), ('inducers', 92, 99), ('of', 101, 102), ('CYP3A4', 104, 109), (',', 110, 110), ('or', 112, 113), ('potentially', 115, 125), ('toxic', 127, 131), ('medications', 133, 143), ('that', 145, 148), ('are', 150, 152), ('metabolized', 154, 164), ('by', 166, 167), ('CYP3A4', 169, 174), ('.', 175, 175)], 'DDI-DrugBank.d437.s3': [('Am

{'DDI-DrugBank.d456.s0': [('Co-treatment', 0, 11), ('with', 13, 16), ('the', 18, 20), ('potent', 22, 27), ('CYP3A4', 29, 34), ('inhibitor', 36, 44), ('ketoconazole', 46, 57), ('increases', 59, 67), ('erlotinib', 69, 77), ('AUC', 79, 81), ('by', 83, 84), ('2/3', 86, 88), ('.', 89, 89)], 'DDI-DrugBank.d456.s1': [('Caution', 0, 6), ('should', 8, 13), ('be', 15, 16), ('used', 18, 21), ('when', 23, 26), ('administering', 28, 40), ('or', 42, 43), ('taking', 45, 50), ('TARCEVA', 52, 58), ('with', 60, 63), ('ketoconazole', 65, 76), ('and', 78, 80), ('other', 82, 86), ('strong', 88, 93), ('CYP3A4', 95, 100), ('inhibitors', 102, 111), ('such', 113, 116), ('as', 118, 119), (',', 120, 120), ('but', 122, 124), ('not', 126, 128), ('limited', 130, 136), ('to', 138, 139), (',', 140, 140), ('atazanavir', 142, 151), (',', 152, 152), ('clarithromycin', 154, 167), (',', 168, 168), ('indinavir', 170, 178), (',', 179, 179), ('itraconazole', 181, 192), (',', 193, 193), ('nefazodone', 195, 204), (',', 205, 20

{'DDI-DrugBank.d373.s0': [('Beta-adrenergic', 0, 14), ('Blocking', 16, 23), ('Agents', 25, 30), (':', 31, 31), ('Experience', 33, 42), ('in', 44, 45), ('over', 47, 50), ('1400', 52, 55), ('patients', 57, 64), ('in', 66, 67), ('a', 69, 69), ('non-comparative', 71, 85), ('clinical', 87, 94), ('trial', 96, 100), ('has', 102, 104), ('shown', 106, 110), ('that', 112, 115), ('concomitant', 117, 127), ('administration', 129, 142), ('of', 144, 145), ('nifedipine', 147, 156), ('and', 158, 160), ('beta-blocking', 162, 174), ('agents', 176, 181), ('is', 183, 184), ('usually', 186, 192), ('well', 194, 197), ('tolerated', 199, 207), (',', 208, 208), ('but', 210, 212), ('there', 214, 218), ('have', 220, 223), ('been', 225, 228), ('occasional', 230, 239), ('literature', 241, 250), ('reports', 252, 258), ('suggesting', 260, 269), ('that', 271, 274), ('the', 276, 278), ('combination', 280, 290), ('may', 292, 294), ('increase', 296, 303), ('the', 305, 307), ('likelihood', 309, 318), ('of', 320, 321), ('

{'DDI-DrugBank.d558.s0': [('Drug/Laboratory', 0, 14), ('Test', 16, 19), ('Interactions', 21, 32), ('None', 34, 37), ('known', 39, 43), ('.', 44, 44)], 'DDI-DrugBank.d558.s1': [('Drug-Drug', 0, 8), ('Interactions', 10, 21), ('Cimetidine', 23, 32), (':', 33, 33), ('Concomitant', 35, 45), ('use', 47, 49), ('of', 51, 52), ('cimetidine', 54, 63), ('is', 65, 66), ('contraindicated', 68, 82), ('.', 83, 83)], 'DDI-DrugBank.d558.s2': [('Cimetidine', 0, 9), ('at', 11, 12), ('400', 14, 16), ('mg', 18, 19), ('BID', 21, 23), ('(', 25, 25), ('the', 26, 28), ('usual', 30, 34), ('prescription', 36, 47), ('dose', 49, 52), (')', 53, 53), ('co-administered', 55, 69), ('with', 71, 74), ('TIKOSYN', 76, 82), ('(', 84, 84), ('500', 85, 87), ('mcg', 89, 91), ('BID', 93, 95), (')', 96, 96), ('for', 98, 100), ('7', 102, 102), ('days', 104, 107), ('has', 109, 111), ('been', 113, 116), ('shown', 118, 122), ('to', 124, 125), ('increase', 127, 134), ('dofetilide', 136, 145), ('plasma', 147, 152), ('levels', 154, 15

{'DDI-DrugBank.d296.s0': [('Diphenhydramine', 0, 14), ('hydrochloride', 16, 28), ('has', 30, 32), ('additive', 34, 41), ('effects', 43, 49), ('with', 51, 54), ('alcohol', 56, 62), ('and', 64, 66), ('other', 68, 72), ('CNS', 74, 76), ('depressants', 78, 88), ('(', 90, 90), ('hypnotics', 91, 99), (',', 100, 100), ('sedatives', 102, 110), (',', 111, 111), ('tranquilizers', 113, 125), (',', 126, 126), ('etc', 128, 130), (')', 131, 131), ('.', 132, 132)], 'DDI-DrugBank.d296.s1': [('MAO', 0, 2), ('inhibitors', 4, 13), ('prolong', 15, 21), ('and', 23, 25), ('intensify', 27, 35), ('the', 37, 39), ('anticholinergic', 41, 55), ('(', 57, 57), ('drying', 58, 63), (')', 64, 64), ('effects', 66, 72), ('of', 74, 75), ('antihistamines', 77, 90), ('.', 91, 91)]}

{'DDI-DrugBank.d110.s0': [('The', 0, 2), ('administration', 4, 17), ('of', 19, 20), ('local', 22, 26), ('anesthetic', 28, 37), ('solutions', 39, 47), ('containing', 49, 58), ('epinephrine', 60, 70), ('or', 72, 73), ('norepinephrine', 75, 88), 

{'DDI-DrugBank.d314.s0': [('Aminoglutethimide', 0, 16), (':', 17, 17), ('Aminoglutethimide', 19, 35), ('may', 37, 39), ('diminish', 41, 48), ('adrenal', 50, 56), ('suppression', 58, 68), ('by', 70, 71), ('corticosteroids', 73, 87), ('.', 88, 88)], 'DDI-DrugBank.d314.s1': [('Amphotericin', 0, 11), ('B', 13, 13), ('injection', 15, 23), ('and', 25, 27), ('potassium-depleting', 29, 47), ('agents', 49, 54), (':', 55, 55), ('When', 57, 60), ('corticosteroids', 62, 76), ('are', 78, 80), ('administered', 82, 93), ('concomitantly', 95, 107), ('with', 109, 112), ('potassium-depleting', 114, 132), ('agents', 134, 139), ('(', 141, 141), ('e.g.', 142, 145), (',', 146, 146), ('amphotericin', 148, 159), ('B', 161, 161), (',', 162, 162), ('diuretics', 164, 172), (')', 173, 173), (',', 174, 174), ('patients', 176, 183), ('should', 185, 190), ('be', 192, 193), ('observed', 195, 202), ('closely', 204, 210), ('for', 212, 214), ('development', 216, 226), ('of', 228, 229), ('hypokalemia', 231, 241), ('.', 2

{'DDI-DrugBank.d233.s0': [('Use', 0, 2), ('with', 4, 7), ('Allopurinol', 9, 19), (':', 20, 20), ('The', 22, 24), ('principal', 26, 34), ('pathway', 36, 42), ('for', 44, 46), ('detoxification', 48, 61), ('of', 63, 64), ('azathioprine', 66, 77), ('is', 79, 80), ('inhibited', 82, 90), ('by', 92, 93), ('allopurinol', 95, 105), ('.', 106, 106)], 'DDI-DrugBank.d233.s1': [('Patients', 0, 7), ('receiving', 9, 17), ('azathioprine', 19, 30), ('and', 32, 34), ('allopurinol', 36, 46), ('concomitantly', 48, 60), ('should', 62, 67), ('have', 69, 72), ('a', 74, 74), ('dose', 76, 79), ('reduction', 81, 89), ('of', 91, 92), ('azathioprine', 94, 105), (',', 106, 106), ('to', 108, 109), ('approximately', 111, 123), ('1/3', 125, 127), ('to', 129, 130), ('1/4', 132, 134), ('the', 136, 138), ('usual', 140, 144), ('dose', 146, 149), ('.', 150, 150)], 'DDI-DrugBank.d233.s2': [('Use', 0, 2), ('with', 4, 7), ('Other', 9, 13), ('Agents', 15, 20), ('Affecting', 22, 30), ('Myelopoesis', 32, 42), (':', 43, 43), ('D

{'DDI-DrugBank.d92.s0': [('No', 0, 1), ('well-known', 3, 12), ('drug', 14, 17), ('interactions', 19, 30), ('with', 32, 35), ('glutamic', 37, 44), ('acid', 46, 49)]}

{'DDI-DrugBank.d398.s0': [('The', 0, 2), ('use', 4, 6), ('of', 8, 9), ('codeine', 11, 17), ('may', 19, 21), ('result', 23, 28), ('in', 30, 31), ('additive', 33, 40), ('CNS', 42, 44), ('depressant', 46, 55), ('effects', 57, 63), ('when', 65, 68), ('coadministered', 70, 83), ('with', 85, 88), ('alcohol', 90, 96), (',', 97, 97), ('antihistamines', 99, 112), (',', 113, 113), ('psychotropics', 115, 127), ('or', 129, 130), ('other', 132, 136), ('drugs', 138, 142), ('that', 144, 147), ('produce', 149, 155), ('CNS', 157, 159), ('depression', 161, 170), ('.', 171, 171)], 'DDI-DrugBank.d398.s1': [('Serious', 0, 6), ('toxicity', 8, 15), ('may', 17, 19), ('result', 21, 26), ('if', 28, 29), ('dextromethorphan', 31, 46), ('is', 48, 49), ('coadministered', 51, 64), ('with', 66, 69), ('monoamine', 71, 79), ('oxidase', 81, 87), ('inhibitor

{'DDI-DrugBank.d521.s0': [('Many', 0, 3), ('other', 5, 9), ('medicines', 11, 19), ('may', 21, 23), ('increase', 25, 32), ('or', 34, 35), ('decrease', 37, 44), ('the', 46, 48), ('effects', 50, 56), ('of', 58, 59), ('glimepiride', 61, 71), ('or', 73, 74), ('affect', 76, 81), ('your', 83, 86), ('condition', 88, 96), ('.', 97, 97)], 'DDI-DrugBank.d521.s1': [('Before', 0, 5), ('taking', 7, 12), ('glimepiride', 14, 24), (',', 25, 25), ('tell', 27, 30), ('your', 32, 35), ('doctor', 37, 42), ('if', 44, 45), ('you', 47, 49), ('are', 51, 53), ('taking', 55, 60), ('any', 62, 64), ('of', 66, 67), ('the', 69, 71), ('following', 73, 81), ('medicines', 83, 91), (':', 92, 92), ('-', 94, 94), ('aspirin', 96, 102), ('or', 104, 105), ('another', 107, 113), ('salicylate', 115, 124), ('such', 126, 129), ('as', 131, 132), ('magnesium/choline', 134, 150), ('salicylate', 152, 161), ('(', 163, 163), ('Trilisate', 164, 172), (')', 173, 173), (',', 174, 174), ('salsalate', 176, 184), ('(', 186, 186), ('Disalcid'