# Task 1: Baseline NERC

In [1]:
## imports 
import pandas as pd

In [160]:
## check if the name contains special symbols 
def check_chemical_name(some_text:str) -> str:
    """Function which checks whether a text contains numbers, special characters 
    which are similar to chemical names."""
    ## naming binary molecules 
    greek_prefix = ["hemi","mono",
                    "di","tri",
                    "tetra","penta",
                    "hexa","hepta",
                    "octa","nona",
                    "deca"]
    ## Metallic Names 
    metal_names = ["cuprous","cupric",
                   "ferrous","ferric",
                   "mercurous","mercuric",
                   "stannous","stannic"]
    ## Non-metal suffixes 
    non_metal_names = ["ide"]
    ## Others
    other_names = ['ite','ate']
    ## Bonds 
    bonds = ["ene",'yne']
    ## functional groups 
    func_groups = [
                   "carboxy","carbamoyl",
                   "chloroformyl","hydroxy",
                   "formyl","oxo","alkyl",
                   "alkoxy","epoxy","halo",
                   "amine","cyano","nitro","nitroso",
                   "azo","sulpho","alkyl thio","mercapto"
                   ]
    ## functional group suffixes 
    func_g_suffix = [
                    "oic","amide","oyl","chloride","acid",
                    "ol","al","one","carboxylate","amine","nitrile",
                    "sulphonic","thiol"
                    ]
    ## other names 
    other_names2 = ["meth","eth","methyl","ethyl","yl"]
    ## bring them all together 
    tmp_list = [greek_prefix,
                metal_names,non_metal_names,
                other_names,
                bonds,
                func_groups,
                func_g_suffix,other_names2]
    ## merging & making into a set
    full_list = set(sum(tmp_list, []))
    ## now we can check if it starts with a number ? 
    starts_with_digit = some_text[:1].isdigit()
    ## contains numbers?
    contains_number = any(char.isdigit() for char in some_text)
    ## does it contain hyphens?
    contain_hyphen = ("-" in some_text)
    ## contains parenthesis
    contain_parenthesis = ("(" in some_text) and (")" in some_text)
    ## does it contain a comma?
    contain_comma = ("," in some_text)
    #return (full_list, starts_with_digit, contain_hyphen, contain_parenthesis,contain_comma)
    ## iterate over our set 
    cnt = sum([1 for x in full_list if x in some_text])
    #print(cnt)
    ## check if the
    val = (starts_with_digit+contains_number+contain_hyphen+contain_parenthesis+contain_comma)
    #print(f"Total Rules out of 5 which apply: {val}")
    ## contains number + hyphen + parenthesis? --> definetly a drug!
    if contains_number and contain_hyphen and contain_parenthesis:
        #return "drug"
        print("drug")
    ## number and comma? also a drug 
    if contains_number and contain_comma:
        #return "drug"
        print("drug")
    ## if the value of our rules is greater than 3, and there is at least 
    ## one occurence of a "chemical suffix", return drug
    if val > 3 and cnt>1: 
        #return "drug"
        print("drug")

In [161]:
## testing with a random drug name 
txt = "1,2-betahydroxy-2,3-oleic acid"
check_chemical_name(txt)
l = list(map(check_chemical_name, drug_name)) ## kinda sucks 

drug
drug
drug
drug
drug
drug


In [167]:
from xml.dom.minidom import parse
import glob 
import numba 
from numba import njit

## Vars
DATA_PATH = "./data"
file_list = glob.glob(DATA_PATH+"/*/*.xml")

#@njit(nopython=False) --> Numba doesnt work with XML parser :/ 
## get more info from all the DRUGS
def get_all_drug_names(list_of_files:list) -> dict:
    ## parse all the trees 
    parsed_trees = list(map(parse, list_of_files))
    ## get the elements 
    elements = list(map(lambda x: x.getElementsByTagName("entity"),parsed_trees))
    ## get their values 
    drug_type = [x[0].attributes['type'].value for x in elements if len(x)>1]
    drug_name = [x[0].attributes['text'].value for x in elements if len(x)>1]
    ## zip together 
    d = dict(zip(drug_name, drug_type))
    return d

d = get_all_drug_names(file_list)

In [169]:
import pandas as pd
df = pd.DataFrame.from_records(d,index=[0]).T.reset_index()
df.columns = ['comp',"type"]
#df.to_csv("./data/drugs.csv")
df.type.value_counts()

drug      363
brand     168
group     130
drug_n     33
Name: type, dtype: int64

In [176]:
print(df.shape)
df = df.drop_duplicates()

(694, 2)


In [179]:
df.sort_values("comp",ascending=True)

Unnamed: 0,comp,type
0,"1-methyl-4-phenyl-1,2,5,6-tetrahydropyridine",drug_n
1,"16,16-dimethylprostaglandin E2",drug_n
2,18-Methoxycoronaridine,drug_n
3,5HT3 Antagonists,group
4,6MNA,drug_n
...,...,...
689,warfarin-type anticoagulant,group
690,xanthine bronchodilators,group
691,xanthine derivatives,group
692,zidovudine,drug


# Task 2: ML NERC

In [98]:
import sys
import re
from os import listdir

from xml.dom.minidom import parse

In [99]:
!pip install -qq nltk

In [100]:
from nltk.tokenize import word_tokenize

In [96]:
def tokenize(txt):
    offset = 0
    tks = []
    ## word_tokenize splits words, taking into account punctuations, numbers, etc.
    for t in word_tokenize(txt):
        ## keep track of the position where each token should appear, and
        ## store that information with the token
        offset = txt.find(t, offset)
        tks.append((t, offset, offset+len(t)-1))
        offset += len(t)

    ## tks is a list of triples (word,start,end)
    return tks

In [136]:
def get_tag(token, spans) :
   (_,start,end) = token
   for (spanS,spanE,spanT) in spans :
      if start==spanS and end<=spanE : return "B-"+spanT
      elif start>=spanS and end<=spanE : return "I-"+spanT

   return "O"

In [182]:
def extract_features(tokens:str) -> list:
    """Function to extract features from the """
    # for each token, generate list of features and add it to the result
    result = []
    for k in range(0,len(tokens)):
        tokenFeatures = []
        t = tokens[k][0]
        ## so we can count capitals here
        count_caps = str(any(char.isupper() for char in t))
        all_caps = str(t.isupper())
        t = t.lower()
        ## Append the actual word 
        tokenFeatures.append("form="+t)
        ## the suffixes
        tokenFeatures.append("suf5="+t[-5:])
        tokenFeatures.append("suf4="+t[-4:])
        tokenFeatures.append("suf3="+t[-3:])
        tokenFeatures.append("suf2="+t[-2:])
        ## get the prefixes
        tokenFeatures.append("pref5="+t[:5])
        tokenFeatures.append("pref4="+t[:4])
        tokenFeatures.append("pref3="+t[:3])
        tokenFeatures.append("pref2="+t[:2])
        ## Are there any numbers 
        tokenFeatures.append("CountNum="+str(any(char.isdigit() for char in t)))
        ## Any capital 
        tokenFeatures.append("CountCaps="+count_caps)
        tokenFeatures.append("AllCaps="+all_caps)
        ## any hyphens? 
        tokenFeatures.append("Hyphen="+str(("-" in t)))
        ## parenthesis
        tokenFeatures.append("Parenth="+str(("(" and ")" in t)))
        ## previous word - Suffix & Prefix 
        if k>0 :
            tPrev = tokens[k-1][0]
            tokenFeatures.append("Prevsuf5="+tPrev[-5:])
            tokenFeatures.append("Prevsuf4="+tPrev[-4:])
            tokenFeatures.append("Prevsuf3="+tPrev[-3:])
            tokenFeatures.append("Prevsuf2="+tPrev[-2:])
            ## get the prefixes
            tokenFeatures.append("Prevpref5="+tPrev[:5])
            tokenFeatures.append("Prevpref4="+tPrev[:4])
            tokenFeatures.append("Prevpref3="+tPrev[:3])
            tokenFeatures.append("Prevpref2="+tPrev[:2])
        else :
            tokenFeatures.append("BoS")

        ## The next word - Suffix + Prefix
        if k<len(tokens)-1 :
            tNext = tokens[k+1][0]
            tokenFeatures.append("Nextsuf5="+tNext[-5:])
            tokenFeatures.append("Nextsuf4="+tNext[-4:])
            tokenFeatures.append("Nextsuf3="+tNext[-3:])
            tokenFeatures.append("Nextsuf2="+tNext[-2:])
            ## get the prefixes
            tokenFeatures.append("Nextpref5="+tNext[:5])
            tokenFeatures.append("Nextpref4="+tNext[:4])
            tokenFeatures.append("Nextpref3="+tNext[:3])
            tokenFeatures.append("Nextpref2="+tNext[:2])
        else:
            tokenFeatures.append("EoS")
        ## Next Next Word (Plus 2)
        if k<len(tokens)-2 :
            tNextNext = tokens[k+2][0]
            tokenFeatures.append("NxtNextsuf5="+tNextNext[-5:])
            tokenFeatures.append("NxtNextsuf4="+tNextNext[-4:])
            tokenFeatures.append("NxtNextsuf3="+tNextNext[-3:])
            tokenFeatures.append("NxtNextsuf2="+tNextNext[-2:])
            ## get the prefixes
            tokenFeatures.append("NxtNextpref5="+tNextNext[:5])
            tokenFeatures.append("NxtNextpref4="+tNextNext[:4])
            tokenFeatures.append("NxtNextpref3="+tNextNext[:3])
            tokenFeatures.append("NxtNextpref2="+tNextNext[:2])
        ## Previous Previous (Minus 2)
        if k>1:
            tPrevPrev = tokens[k-2][0]
            tokenFeatures.append("PrePrevsuf5="+tPrevPrev[-5:])
            tokenFeatures.append("PrePrevsuf4="+tPrevPrev[-4:])
            tokenFeatures.append("PrePrevsuf3="+tPrevPrev[-3:])
            tokenFeatures.append("PrePrevsuf2="+tPrevPrev[-2:])
            ## get the prefixes
            tokenFeatures.append("PrevPrevpref5="+tPrevPrev[:5])
            tokenFeatures.append("PrevPrevpref4="+tPrevPrev[:4])
            tokenFeatures.append("PrevPrevpref3="+tPrevPrev[:3])
            tokenFeatures.append("PrevPrevpref2="+tPrevPrev[:2])
        
        result.append(tokenFeatures)
        
    return result

In [183]:
datadir = "/Users/Eric/Documents/Uni/Msc/Courses/Sem2/AHLT/LAB/task1/labAHLT/data/train"

In [184]:
## "MAIN"
# process each file in directory
for f in listdir(datadir)[:1] :
   
   # parse XML file, obtaining a DOM tree
   tree = parse(datadir+"/"+f)
   
   # process each sentence in the file
   sentences = tree.getElementsByTagName("sentence")
   for s in sentences :
      sid = s.attributes["id"].value   # get sentence id
      spans = []
      stext = s.attributes["text"].value   # get sentence text
      entities = s.getElementsByTagName("entity")
      for e in entities :
         # for discontinuous entities, we only get the first span
         # (will not work, but there are few of them)
         (start,end) = e.attributes["charOffset"].value.split(";")[0].split("-")
         typ =  e.attributes["type"].value
         spans.append((int(start),int(end),typ))
         

      # convert the sentence to a list of tokens
      tokens = tokenize(stext)
      # extract sentence features
      features = extract_features(tokens)
      print(stext)
      [print(x) for x in features]
      #[print(len(x)) for x in features]

      # print features in format expected by crfsuite trainer
      for i in range (0,len(tokens)) :
         # see if the token is part of an entity
         tag = get_tag(tokens[i], spans) 
         #print (sid, tokens[i][0], tokens[i][1], tokens[i][2], tag, "\t".join(features[i]), sep='\t')

      # blank line to separate sentences
      print()

Milk, milk products, and calcium-rich foods or drugs may impair the absorption of EMCYT.
['form=milk', 'suf5=milk', 'suf4=milk', 'suf3=ilk', 'suf2=lk', 'pref5=milk', 'pref4=milk', 'pref3=mil', 'pref2=mi', 'CountNum=False', 'CountCaps=True', 'AllCaps=False', 'Hyphen=False', 'Parenth=False', 'BoS', 'Nextsuf5=,', 'Nextsuf4=,', 'Nextsuf3=,', 'Nextsuf2=,', 'Nextpref5=,', 'Nextpref4=,', 'Nextpref3=,', 'Nextpref2=,', 'NxtNextsuf5=milk', 'NxtNextsuf4=milk', 'NxtNextsuf3=ilk', 'NxtNextsuf2=lk', 'NxtNextpref5=milk', 'NxtNextpref4=milk', 'NxtNextpref3=mil', 'NxtNextpref2=mi']
['form=,', 'suf5=,', 'suf4=,', 'suf3=,', 'suf2=,', 'pref5=,', 'pref4=,', 'pref3=,', 'pref2=,', 'CountNum=False', 'CountCaps=False', 'AllCaps=False', 'Hyphen=False', 'Parenth=False', 'Prevsuf5=Milk', 'Prevsuf4=Milk', 'Prevsuf3=ilk', 'Prevsuf2=lk', 'Prevpref5=Milk', 'Prevpref4=Milk', 'Prevpref3=Mil', 'Prevpref2=Mi', 'Nextsuf5=milk', 'Nextsuf4=milk', 'Nextsuf3=ilk', 'Nextsuf2=lk', 'Nextpref5=milk', 'Nextpref4=milk', 'Nextpref3=