In [None]:
"""
This notebook converts NIO.owl into the .xml dictionary format that ConceptMapper requires as input.

## input format:
    <AnnotationAssertion>
        <AnnotationProperty abbreviatedIRI="metadata:prefixIRI"/>
        <AbbreviatedIRI>obo:NBO_0000304</AbbreviatedIRI>
        <Literal datatypeIRI="http://www.w3.org/2001/XMLSchema#string">NBO:0000304</Literal>
    </AnnotationAssertion>
    <AnnotationAssertion>
        <AnnotationProperty abbreviatedIRI="rdfs:label"/>
        <AbbreviatedIRI>obo:NBO_0000304</AbbreviatedIRI>
        <Literal datatypeIRI="http://www.w3.org/2001/XMLSchema#string">memory loss behavior</Literal>
    </AnnotationAssertion>
    <AnnotationAssertion>
        <AnnotationProperty abbreviatedIRI="obo:synonym"/>
        <AbbreviatedIRI>obo:NBO_0000304</AbbreviatedIRI>
        <Literal datatypeIRI="http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral">forgetting</Literal>
    </AnnotationAssertion>

## expected format:
<?xml version="1.0" ?>
<synonym>
    <token id="ncbi xxxxx" canonical="colon, nos">
         <variant base="colon, nos"/>
         <variant base="large intestine"/>
         <variant base="large bowel"/>
    </token>
</synonym>

"""

In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
from xml.dom import minidom
from nltk.stem import WordNetLemmatizer

In [1]:
# Location of the NIO ontology file that the rest of the notebook converts.
nio_path = "/Users/yidesdo21/Projects/inputs/ontologies/NIO1.1.owl"

# Read with an explicit encoding so the parsed text does not depend on the
# platform's default locale encoding.
with open(nio_path, encoding="utf-8") as f:
    contents = f.read()

# 'html.parser' is required by the lookups in the following cells: they address
# tags and attributes by lower-case names ('annotationassertion',
# 'abbreviatediri', ...), although the file itself spells them in CamelCase.
soup = BeautifulSoup(contents, 'html.parser')

In [6]:
# Pass 1: annotation assertions whose subject is written as <AbbreviatedIRI>.
#
# Results:
#   token_ids        sorted list of ids that carry at least one rdfs:label
#   id_canonicals    (token_id, label) pairs, one per rdfs:label assertion
#   id_variants      (token_id, synonym) pairs, one per synonym assertion
#   token_ids_meta   ids seen with a metadata:prefixIRI assertion
#   token_ids_label  ids seen with an rdfs:label assertion (duplicates kept)
token_ids = set()
id_canonicals = list()
id_variants = list()
syns = ["obo:synonym", "oboInOwl:hasExactSynonym", "oboInOwl:hasNarrowSynonym", 
        "NDDUO:Synonym", "oboInOwl:hasBroadSynonym", "oboInOwl:hasRelatedSynonym"]
token_ids_meta, token_ids_label = list(), list()

for anno_assert in soup.find_all('annotationassertion'):
    property_tag = anno_assert.annotationproperty

    # iri_tag is the <AbbreviatedIRI> element, e.g. <AbbreviatedIRI>obo:OBI_0002193</AbbreviatedIRI>
    iri_tag = anno_assert.abbreviatediri

    # Only the <AbbreviatedIRI> structure is handled here; assertions that use <IRI>
    # (or that are missing the property tag / have an empty id) are skipped.
    if property_tag is None or iri_tag is None or not iri_tag.contents:
        continue

    # property_attri is e.g. "rdfs:label" in <AnnotationProperty abbreviatedIRI="rdfs:label"/>.
    # .get() yields None (matching no branch below) when the attribute is absent,
    # instead of raising KeyError.
    property_attri = property_tag.get("abbreviatediri")
    token_id = iri_tag.contents[0]

    if property_attri == "metadata:prefixIRI":
        # Points to the token id. The original author notes that fewer ids carry
        # this property than carry a label, so token_ids is built from labels instead.
        token_ids_meta.append(token_id)
        continue

    is_label = property_attri == "rdfs:label"
    if not is_label and property_attri not in syns:
        continue

    # Labels and synonyms both need a <Literal> value; skip assertions whose
    # value is something else rather than crashing on None.get_text().
    literal_tag = anno_assert.literal
    if literal_tag is None:
        continue

    if is_label:
        # rdfs:label -> canonical name. A token id may have several labels;
        # all of them are recorded here in document order.
        canonical = literal_tag.get_text()
        token_ids.add(token_id)
        id_canonicals.append((token_id, canonical))
        token_ids_label.append(token_id)
    else:
        # any of the synonym properties -> variant base
        variant = literal_tag.get_text()
        id_variants.append((token_id, variant))

# From here on token_ids is a sorted list, no longer a set.
token_ids = sorted(token_ids)


In [18]:
# Group the (token_id, label) pairs by token id, keeping the labels in the order
# they were collected. A defaultdict is used so that unseen ids start with [].
dict_id_canonicals = defaultdict(list)
for token_id, cano in id_canonicals:
    dict_id_canonicals[token_id] += [cano]

In [21]:
# Group the (token_id, synonym) pairs by token id, keeping the synonyms in the
# order they were collected. A defaultdict is used so that unseen ids start with [].
dict_id_variants = defaultdict(list)
for token_id, variant in id_variants:
    dict_id_variants[token_id] += [variant]

In [24]:
## only for <IRI>
# Pass 2: annotation assertions whose subject is written as a full <IRI> url.
#
# Results:
#   ids_iri      ids seen with metadata:prefixIRI. Incomplete: do not use it in
#                later processing, take the ids from id_cano_iri instead.
#   id_cano_iri  (token_id, label) pairs from rdfs:label assertions
#   id_var_iri   (token_id, synonym) pairs from NDDUO:Synonym assertions
ids_iri = list()
id_cano_iri = list()
id_var_iri = list()
onto_matches = ["AlzheimerOntology", "OntoDT", "obo", "bfo", "NDDO"]


def _iri_to_token_id(url, known_ontologies):
    """Derive a token id from a full IRI url, or return None if no rule applies.

    The url is split on "/" and the rule is chosen by the number of parts.
    Example: "http://scai.fraunhofer.de/AlzheimerOntology#brainstem" has four
    parts and becomes "AlzheimerOntology:brainstem".

    Parameters:
        url: text content of an <IRI> element.
        known_ontologies: names of which at least one must occur in the fourth
            "/"-separated part of the url; other urls are dropped.

    Returns:
        The token id string, or None when the url is dropped (unknown ontology,
        a DMtypes entry, or a shape that none of the rules covers).
    """
    parts = url.split("/")
    if len(parts) <= 3 or not any(name in parts[3] for name in known_ontologies):
        return None   # dumped the unclear tokens

    rev_fir, rev_sec, third = parts[-1], parts[-2], parts[3]
    n_parts = len(parts)

    if n_parts == 4:   # for ADO and OntoDT
        return rev_fir.replace("#", ":")

    if n_parts == 5:
        if rev_sec == "AlzheimerOntology#NINCDS":   # for ADO: AlzheimerOntology#NINCDS/ADRDA_criteria
            return (rev_sec + "_" + rev_fir).replace("#", ":")
        if rev_sec == "OntoDT" or rev_sec == "NDDO":   # for OntoDT and NDDO
            return rev_sec + ":" + rev_fir
        if rev_sec == "obo":   # for obo-fma, obo-npt
            return rev_sec + ":" + rev_fir.replace("#", "_").replace(".owl", "")
        return None

    if n_parts == 6:   # for obo-behavior, bfo, NDDO
        if rev_sec == "DMtypes":   # exclude DMtypes, not in the OntoDT.owl
            return None
        # NOTE(review): "ofo" contains none of the names in onto_matches, so with
        # that filter this branch can never be reached. The comment above mentions
        # obo -- confirm whether "obo" was intended before changing it.
        if third == "ofo":
            return third + ":" + rev_fir
        if third == "bfo":
            return third + ":" + rev_fir.replace("#", "_")
        if third == "NDDO":
            return third + ":" + rev_sec + "_" + rev_fir
        return None

    if n_parts == 8:   # for OntoDT#OntoDT_184436
        return rev_fir.replace("#", ":")

    return None


for anno_assert in soup.find_all('annotationassertion'):
    property_tag = anno_assert.annotationproperty

    # iri_url is the <IRI> element; its text is a url whose tail becomes the token id
    iri_url = anno_assert.iri

    # Only the <IRI> structure is handled here; assertions without an <IRI>
    # (or missing the property tag / with an empty <IRI>) are skipped.
    if property_tag is None or iri_url is None or not iri_url.contents:
        continue

    # property_attri is e.g. "rdfs:label" in <AnnotationProperty abbreviatedIRI="rdfs:label"/>.
    # .get() yields None when the attribute is absent instead of raising KeyError.
    property_attri = property_tag.get("abbreviatediri")

    # "NDDUO:Synonym" is the only synonym property found on <IRI> subjects
    # (checked by the original author).
    if property_attri not in ("metadata:prefixIRI", "NDDUO:Synonym", "rdfs:label"):
        continue

    # Compute the id afresh for every assertion. When no rule applies the
    # assertion is skipped, so a value left over from an earlier iteration is
    # never attached to an unrelated label or synonym.
    token_id = _iri_to_token_id(iri_url.contents[0], onto_matches)
    if token_id is None:
        continue

    if property_attri == "metadata:prefixIRI":   # using this attribute to extract token ids is incomplete
        ids_iri.append(token_id)
        continue

    # Labels and synonyms both need a <Literal> value.
    literal_tag = anno_assert.literal
    if literal_tag is None:
        continue

    if property_attri == "rdfs:label":   # point to the canonical
        canonical = literal_tag.get_text()
        if token_id.split(":")[0] == "OntoDT":
            # For OntoDT ids, a label containing ":" is reduced to the text
            # between the first and second ":" (left-stripped).
            cano_onto = canonical.split(":")
            if len(cano_onto) > 1:
                canonical = cano_onto[1].lstrip()
        id_cano_iri.append((token_id, canonical))
    else:   # "NDDUO:Synonym": point to the variant base
        variant = literal_tag.get_text()
        if variant == "http://www.ebi.ac.uk/efo/efo_0000493":   # a url, not a usable synonym
            continue
        id_var_iri.append((token_id, variant))
    


In [30]:
## Fold the pairs collected from the <IRI> structure into the existing
## dict_id_canonicals / dict_id_variants dictionaries.
# token_id_iri keeps every id that has a canonical from the <IRI> pass
# (one entry per pair, so duplicates are possible).
token_id_iri = [pair[0] for pair in id_cano_iri]

for token_id, cano in id_cano_iri:
    dict_id_canonicals[token_id] += [cano]

for token_id, variant in id_var_iri:
    dict_id_variants[token_id] += [variant]

In [37]:
# Build the dictionary document: a <synonym> root holding one <token> per token
# id, each with one <variant> child per surface form.
xmldoc = minidom.Document()

# create root element
root_element = xmldoc.createElement('synonym')
xmldoc.appendChild(root_element)

# Combine token ids from both <AbbreviatedIRI> and <IRI> structures.
# Don't use ids_iri: not every canonical has a "metadata:prefixIRI" attribute, so
# the ids come from the canonical pairs instead. Only ids that are not already in
# token_ids are appended, in first-seen order, so that re-running this cell does
# not grow the list and no id yields two <token> elements.
known_ids = set(token_ids)
for iri_token_id in token_id_iri:
    if iri_token_id not in known_ids:
        known_ids.add(iri_token_id)
        token_ids.append(iri_token_id)

for token_id in token_ids:
    canonical = dict_id_canonicals.get(token_id)
    if not canonical:
        # no label recorded for this id -> nothing to emit
        continue

    # take the first canonical if one token_id has multiple canonicals
    preferred = canonical[0]

    productChild = xmldoc.createElement('token')
    productChild.setAttribute('id', token_id)  # attribute, value
    productChild.setAttribute('canonical', preferred)
    root_element.appendChild(productChild)

    # The remaining canonicals and all synonyms become variants. .get() is used so
    # that ids without synonyms do not add empty entries to dict_id_variants.
    other_forms = set(canonical[1:])
    other_forms.update(dict_id_variants.get(token_id, []))
    other_forms.discard(preferred)

    # The canonical has to be among the variants: it goes first, followed by the
    # other forms in sorted order so the written file is identical on every run.
    variant_list = [preferred] + sorted(other_forms)

    for variant_base in variant_list:
        product_grandChild = xmldoc.createElement('variant')
        product_grandChild.setAttribute('base', variant_base)
        productChild.appendChild(product_grandChild)

xml_str = xmldoc.toprettyxml(indent="\t")

# save file
dict_path = "/Users/yidesdo21/Projects/inputs/dictionary/"
save_path_file = "nio_case.xml"   # have to be very careful for this, this is not the final xml dictionary 

# The XML declaration written by toprettyxml() names no encoding, which XML
# readers take to mean UTF-8, so the file is written as UTF-8 explicitly.
with open(dict_path+save_path_file, "w", encoding="utf-8") as f:
    f.write(xml_str)


In [38]:
# Number of token ids collected from the <AbbreviatedIRI> and <IRI> passes combined.
len(token_ids)

3755