# Building Gazetteer List

In [1]:
from collections import defaultdict
from nltk.corpus import stopwords
from konlpy.tag import Mecab
from lxml import etree as ET
from lxml.builder import E
import pandas as pd
import os
import re


In [2]:
tagger = Mecab()

In [3]:
# 국립국어원 가장 많이 쓰이는 한국어 단어 300만 어절
st = open("./korean_stopwords.txt", "r")
lines = st.readlines()
ko_stopwords=[]
for line in lines:
    l = line.split()
    ko_stopwords.append(l[0])

In [4]:
additional = ["후", "전", "의한"]


ko_stopwords = ko_stopwords + additional

In [5]:
# TEST: Do this word exist as a standard Korean word?
for sw in ko_stopwords:
    if sw == "얼굴":
        print("exist")

In [6]:
xmls = os.listdir("./xml/")
trees = defaultdict()
for xml in xmls:
    trees[xml[:-4]] = ET.parse("./xml/"+xml)
        

In [7]:
roots= []
for name, tree  in trees.items():
    roots.append(tree.getroot())
    print(tree.getroot())

<Element seoulAsan at 0x7f6c68204f00>
<Element diseases at 0x7f6c68204fa0>
<Element interventions at 0x7f6c682070a0>
<Element medicines at 0x7f6c68207140>
<Element bodies at 0x7f6c682071e0>


In [8]:
ko_exclusion = ["기타"]
en_exclusion = []

In [9]:
# input: key & value (key is one string having one or more puntuations)
# output: subdivided keys and values (divided keys have the same value like the original)
def split_by_comma(k, v): # returns list of dict
    subdivision=[] # subdivide words with puntuations
    commas = indices_finder(", ", k)
    slashs = indices_finder("\/", k)
    punts=[]
    for comma in commas:
        punts.append(comma)
    for slash in slashs:
        punts.append(slash)
    punts = sorted(punts, key=getKey) # sorting tuples of list by 0st index
    if len(punts) >= 1:
        if len(punts) >= 3:
            for punt in punts:
                if punt == punts[0]: # if it's the first puntuation
                    prev_punt = punt
                    subdivision.append({k[:punt[0]]: v})
                elif punt == punts[-1]: # it it's the last puntuation
                    subdivision.append({k[prev_punt[1]:punt[0]]:v})
                    subdivision.append({k[punt[1]:]:v})
                else: # if it's a puntuation somewhere in the middle
                    subdivision.append({k[prev_punt[1]:punt[0]]: v})
                    prev_punt = punt
        elif len(punts) == 2:
            for punt in punts:
                if punt == punts[0]: # if it's the first puntuation
                    subdivision.append({k[:punt[0]]: v})
                elif punt == punts[-1]: # it it's the last puntuation
                    subdivision.append({k[punts[-2][1]:punt[0]]: v}) 
                    subdivision.append({k[punt[1]:]:v})
        else: # if the number of puntuation is one
            subdivision.append({k[:punts[0][0]]:v})
            subdivision.append({k[punts[0][1]:]:v})
    
    return subdivision

def getKey(item):
    return item[0]

def is_subword(word, target): # check if "target" is in "word" as a subword
    if len(word) >= len(target):
        s = 0
        e = len(target)
        while e <= len(word):
            if word[s:e] == target:
                return True
            s += 1
            e += 1
        return False
    return False

        
def korean_entities_builder(korean, two_names):
    kolist = korean.split(" ")
    name = two_names[0]
    candidates = []
    
    for ko in kolist: # investigating subword
        if ko not in ko_exclusion and ko not in ko_stopwords and not is_subword(ko, name):
            candidates.append(ko)
    return candidates
    
def english_entities_builder(english, two_names):
    en_stopwords = stopwords.words("english")
    enlist = english.split(" ")
    name = two_names[1]
    candidates = []
    
    for en in enlist:
        pas = False
        if en not in en_stopwords and not is_subword(en, name):
            candidates.append(en)
    
    return candidates
    
    
def pull_all_gazetteers(string_with_p, two_names):
    ko_candidates = []
    en_candidates = []
    prior = ""
    later = ""
    result = []
    
    list1 = ko_p_en_p(string_with_p)
    added= []
    l = []
    # spliting brackets in formats like
    # 외상에 의한 고막 천공(Traumatic rupture of ear drum [Traumatic tympanic membrane perforation])
    for ls in list1:   
        list2 = en_b_en_b(ls)
        if type(list2) == str:
            added.append(list2)
        elif type(list2) == list:
            while len(list2) >= 2:
                for ls2 in list2:
                    added.append(ls2)
                for add in added:
                    if add in list2:
                        list2.remove(add)
    for add in added:
        l.append(add)
    
    if len(l) >= 3:
        for j in l:
            if hangul(j) == 1:
                ko_candidates = ko_candidates + korean_entities_builder(j, two_names)
            else:
                en_candidates = en_candidates + english_entities_builder(j.lower(), two_names)
        result = en_candidates + ko_candidates + l
                
    elif len(l) == 2:
        prior = l[0]
        later = l[1]
        prior_s = indices_finder(" ", prior)
        later_s = indices_finder(" ", prior)

        if hangul(prior) == 1:
            ko_candidates = korean_entities_builder(prior, two_names)
        else:
            en_candidates = english_entities_builder(prior.lower(), two_names)
            
        if hangul(later) == 1:
            ko_candidates = korean_entities_builder(later, two_names)
        else:
            en_candidates = english_entities_builder(later.lower(), two_names)
            
        result = en_candidates + ko_candidates + [prior] + [later]
        
    else:
        if hangul(string_with_p) == 1:
            ko_candidates = korean_entities_builder(string_with_p, two_names)
        else:
            en_candidates = english_entities_builder(string_with_p.lower(), two_names)
            
        result = en_candidates + ko_candidates + [prior] + [later]
    
    result = list(filter(lambda x: x != "", result)) # removing empty elements
    return result
            
def ko_p_en_p(text): # processing text format like "korean(english)"
    if len(text) >= 3 and len(indices_finder("\(", text)) >= 1 and len(indices_finder("\)", text)) >= 1:
        p_start = re.search("\(", text).start()
        p_end = re.search("\)", text).start()
        return [text[:p_start].lower(), text[p_start+1:p_end].lower()]
    else: # If this text doesn't have parentheses
        return text.lower()
    
def en_b_en_b(text): # processing text format like "english[english]"
    if len(text) >= 3 and len(indices_finder("\[", text)) >= 1 and len(indices_finder("\]", text)) >= 1:
        p_start = re.search("\[", text).start()
        p_end = re.search("\]", text).start()
        return [text[:p_start].lower(), text[p_start+1:p_end].lower()]
    else: # If this text doesn't have parentheses
        return text.lower()
    
def indices_finder(pattern, string): # Using this to find indices of spaces
    r = []
    for i in re.finditer(pattern, string):
        r.append((i.start(), i.end()))
    return r

def hangul(string): # if string contains Hangul returns 1 / English returns 0 / mixed returns -1
    ko = re.compile('[^ a-z | A-Z]')
    en = re.compile('[^ ㄱ-ㅎ|ㅏ-ㅣ|가-힣]')
    kos = re.findall(ko, string)
    ens = re.findall(en, string)
    if len(kos) >= 1 and len(ens) == 0: # if the content is consist of only Korean characters
        return 1
    elif len(ens) >= 1 and len(kos) == 0: # if the content is consist of only English characters
        return 0
    else: # if the content is consist of only Korean & English characters mixed
        return -1

In [10]:
def build_from_bodies(bodies, gazetteer):
    l = len(bodies)
    for body in bodies: # to add body_part first not to check body parts as disease or intervention
        d=defaultdict()
        body_name = body[0].text
        two_names = ko_p_en_p(body_name)
        
        if two_names[0] not in ko_exclusion and ko_stopwords:
            d[two_names[0]] = "BODY_PART" # e.g. adding "간" 
        if two_names[1] not in en_exclusion and ko_stopwords:
            d[two_names[1].lower()] = "BODY_PART" # e.g. adding "liver"
        
        body_part = body[1].text
        if body_part not in ko_exclusion and ko_stopwords:
            d[body_part] = "BODY_PART"
        
        body_disease = body[2].text
        if body_disease not in ko_exclusion and ko_stopwords:
            d[body_disease] = "DISEASE"
            
        body_medDep = body[3].text
        if body_medDep not in ko_exclusion and ko_stopwords:
            d[body_medDep] = "MEDICAL_DEPARTMENT"
        
        # Checking if new keys are already declared in existing dict
        for k in d.keys():
            if k in gazetteer: # if the new key is already declared in the dict
                if d[k] == gazetteer[k]: # and their types are also the same
                    continue
                else: # yet, if their types are different (which have different semantics)
                    print(k, gazetteer[k], d[k]) # THIS SHOULD NOT HAPPEN
                    gazetteer[k].append(d[k]) # adding additional value
            else:
                gazetteer[k] = d[k]
    
    for body in bodies:
        rd = body[4] # must use child value body_related_disease!
        for r in rd:
            body_related_diseases = pull_all_gazetteers(r.text, two_names)
            for body_related_disease in body_related_diseases:
                if body_related_disease in gazetteer.keys():
                    continue
                d[body_related_disease] = "DISEASE"
                
        
        ri = body[5] # must use child value body_related_disease!
        for r in ri:
            body_related_interventions = pull_all_gazetteers(r.text, two_names)    
            for body_related_intervention in body_related_interventions:
                if body_related_intervention in gazetteer.keys():
                    continue
                d[body_related_intervention] = "INTERVENTION"
                
        # Checking if new keys are already declared in existing dict
        for k in d.keys():
            if k in gazetteer: # if the new key is already declared in the dict
                if d[k] == gazetteer[k]: # and their types are also the same
                    continue
                else: # yet, if their types are different (which have different semantics)
                    print(k, gazetteer[k], d[k]) # THIS SHOULD NOT HAPPEN
                    gazetteer[k].append(d[k]) # adding additional value
            else:
                gazetteer[k] = d[k]    
        
        divided=[]
        remove=[]
        for k, v in gazetteer.items(): # subdivide words with commas
            if k == None:
                continue
            div = split_by_comma(k, v)
            if len(div) >= 2:
                for d in div:
                    divided.append(d)
                remove.append(k) # add abnormal keys to remove
        
        for r in remove:
            gazetteer.pop(r)

        for d in divided: # adding subdivided dicts to gazetteer dict
            for k, v in d.items(): 
                if k not in gazetteer: # after check if it's exist in gazetteer
                    gazetteer[k] = v
                    
    i=1
    for k, v in gazetteer.items():
        i += 1 
        print("{}\t\t\t\t\t{}".format(k, v))
    print(i,"개")

def build_from_diseases(diseases, gazetteer):
    l = len(diseases)
    for disease in diseases:
def build_from_interventions(interventions, gazetteer):
    l = len(interventions)
    for intervention in interventions:
def build_from_medicines(medicines, gazetteer):
    l = len(medicines)
    for medicine in medicines:
    

In [11]:
for root in roots:
    print(root.tag)

seoulAsan
diseases
interventions
medicines
bodies


In [12]:
gazetteer= defaultdict()
for root in roots:
    if root.tag == "bodies":
        build_from_bodies(root, gazetteer)
    #elif root.tag == "diseases":
    #    build_from_diseases(root, gazetteer)
    #elif root.tag == "interventions":
    #    build_from_interventions(root, gazetteer)
    #elif root.tag == "medicines":
    #    build_from_medicines(root, gazetteer)

간					BODY_PART
liver					BODY_PART
배					BODY_PART
소화기질환					DISEASE
소화기내과					MEDICAL_DEPARTMENT
갈비뼈					BODY_PART
rib					BODY_PART
가슴					BODY_PART
근골격질환					DISEASE
None					MEDICAL_DEPARTMENT
갑상선					BODY_PART
thyroid					BODY_PART
목					BODY_PART
유방내분비질환					DISEASE
겨드랑이					BODY_PART
axilla					BODY_PART
종양혈액질환					DISEASE
견갑골					BODY_PART
scapula					BODY_PART
팔					BODY_PART
고환					BODY_PART
testis					BODY_PART
골반					BODY_PART
신장비뇨기질환					DISEASE
비뇨의학과					MEDICAL_DEPARTMENT
pelvis					BODY_PART
산부인과					MEDICAL_DEPARTMENT
골수					BODY_PART
bone marrow					BODY_PART
전신					BODY_PART
관절					BODY_PART
joint					BODY_PART
괄약근					BODY_PART
sphincter					BODY_PART
귀					BODY_PART
ear					BODY_PART
귀코목질환					DISEASE
이비인후과					MEDICAL_DEPARTMENT
근육					BODY_PART
muscle					BODY_PART
정형외과					MEDICAL_DEPARTMENT
기관지					BODY_PART
bronchus					BODY_PART
호흡기질환					DISEASE
난소					BODY_PART
ovary					BODY_PART
여성질환					DISEASE
뇌					BODY_PART
brain					BODY_PART
머리					BODY_PART
뇌신경정신질

In [13]:
gazetteer.pop(None)
i=1
max_len = max(len(k) for k in gazetteer if k != None) 

for k, v in gazetteer.items():
    i += 1 
    print("{}{}".format(k.ljust(max_len + 1), v))
print(i,"개")

간                                                                                          BODY_PART
liver                                                                                      BODY_PART
배                                                                                          BODY_PART
소화기질환                                                                                      DISEASE
소화기내과                                                                                      MEDICAL_DEPARTMENT
갈비뼈                                                                                        BODY_PART
rib                                                                                        BODY_PART
가슴                                                                                         BODY_PART
근골격질환                                                                                      DISEASE
갑상선                                                                                   

하대정맥                                                                                       BODY_PART
inferior vena cava                                                                         BODY_PART
흉골                                                                                         BODY_PART
sternum                                                                                    BODY_PART
흉막                                                                                         BODY_PART
pleura                                                                                     BODY_PART
흉막강                                                                                        BODY_PART
pleural cavity                                                                             BODY_PART
흉강                                                                                         BODY_PART
thoracic cavity                                                                            

In [14]:
import pandas as pd

In [15]:
gazetteer_dict = dict(gazetteer)

In [16]:
print(gazetteer)

defaultdict(None, {'간': 'BODY_PART', 'liver': 'BODY_PART', '배': 'BODY_PART', '소화기질환': 'DISEASE', '소화기내과': 'MEDICAL_DEPARTMENT', '갈비뼈': 'BODY_PART', 'rib': 'BODY_PART', '가슴': 'BODY_PART', '근골격질환': 'DISEASE', '갑상선': 'BODY_PART', 'thyroid': 'BODY_PART', '목': 'BODY_PART', '유방내분비질환': 'DISEASE', '겨드랑이': 'BODY_PART', 'axilla': 'BODY_PART', '종양혈액질환': 'DISEASE', '견갑골': 'BODY_PART', 'scapula': 'BODY_PART', '팔': 'BODY_PART', '고환': 'BODY_PART', 'testis': 'BODY_PART', '골반': 'BODY_PART', '신장비뇨기질환': 'DISEASE', '비뇨의학과': 'MEDICAL_DEPARTMENT', 'pelvis': 'BODY_PART', '산부인과': 'MEDICAL_DEPARTMENT', '골수': 'BODY_PART', 'bone marrow': 'BODY_PART', '전신': 'BODY_PART', '관절': 'BODY_PART', 'joint': 'BODY_PART', '괄약근': 'BODY_PART', 'sphincter': 'BODY_PART', '귀': 'BODY_PART', 'ear': 'BODY_PART', '귀코목질환': 'DISEASE', '이비인후과': 'MEDICAL_DEPARTMENT', '근육': 'BODY_PART', 'muscle': 'BODY_PART', '정형외과': 'MEDICAL_DEPARTMENT', '기관지': 'BODY_PART', 'bronchus': 'BODY_PART', '호흡기질환': 'DISEASE', '난소': 'BODY_PART', 'ovary': 'BODY_PA

In [20]:
gazetteer_entities = pd.DataFrame(list(gazetteer.keys()), columns=["ENTITIES"])
gazetteer_types = pd.DataFrame(list(gazetteer.values()), columns=["TYPES"])
gazetteer_df = pd.concat([gazetteer_entities, gazetteer_types], axis=1)

In [21]:
gazetteer_df

Unnamed: 0,ENTITIES,TYPES
0,간,BODY_PART
1,liver,BODY_PART
2,배,BODY_PART
3,소화기질환,DISEASE
4,소화기내과,MEDICAL_DEPARTMENT
...,...,...
3324,glossopharyngeal neuralgia,DISEASE
3325,glossodynia,DISEASE
3326,설통,DISEASE
3327,흉선암,DISEASE


In [23]:
with pd.ExcelWriter("./bodies_Gazetteer.xlsx") as wrter:
    gazetteer_df.to_excel(wrter, sheet_name="bodies")