In [None]:
"""
This notebook 1) reads the outputs of the PathogenExperimenter (NIO annotator) and of the PTC annotator, 2) links each mention id to its canonical name, and 3) visualizes the annotations with spaCy's displaCy.

"""

In [1]:
import spacy
from spacy import displacy
import xmltodict
from pathlib import Path
import json
from collections import defaultdict
import jsbeautifier

In [22]:
## step 1: load the span report written by the PathogenExperimenter of the NIO annotator
output_path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/txt/"
output_file = "nio_ten_recent_spans_nested_case.txt"

# Only the lines from the 15th up to (not including) the last two are kept.
# NOTE(review): presumably the skipped lines are a report header/footer -- confirm
# against the annotator's output before changing the slice.
report_lines = Path(output_path + output_file).read_text().splitlines()
output = report_lines[14:-2]

In [23]:
## step 2: link mention id to the canonical
## input format:
# <?xml version="1.0" ?>
# <synonym>
#     <token id="NDDO:NDDO_00000021" canonical="visit">
#         <variant base="visit"/>
#     </token>
#     <token id="NDDO:NDDO_00000022" canonical="diagnostic data">
#         <variant base="diagnostic data"/>
#     </token>
# </synonym>

## expected format:
# {"NDDO:NDDO_00000021": "visit",
#  "NDDO:NDDO_00000022": "diagnostic data"}

xml_path = "/Users/yidesdo21/Projects/inputs/dictionary/nio_ado_case.xml"

with open(xml_path) as f:
    nio_xml = f.read()

# force_list makes xmltodict return a list for <token> even when the dictionary
# holds a single entry; without it a lone <token> is parsed as a plain dict and
# iterating over it would yield its attribute keys instead of tokens.
nio_dict = xmltodict.parse(nio_xml, force_list=("token",))

# An empty <synonym/> element is parsed as None and then has no "token" key.
synonym = nio_dict["synonym"] or {}
canos = synonym.get("token", [])

# {"token_id": "canonical"}; if an id occurs more than once the last entry wins.
token_cano = {cano["@id"]: cano["@canonical"] for cano in canos}

In [30]:
## step 3: pull out the mentioned entities by slicing the article with starting and ending indices
##  3.1: collect, per article, the mention ids and the starting and ending indices,
##       so that the same position i addresses one article in every list below.
def _parse_bracket_list(line):
    """Return the items of the ``[a, b, c]`` list embedded in ``line``.

    The text between the first ``[`` and the last ``]`` is split on ``", "``.
    An empty list (``[]``) yields ``[]`` instead of ``[""]``, so an article
    without any mention no longer breaks the ``int`` conversion of its indices.
    """
    inner = line[line.index("[") + 1:line.rindex("]")]
    return inner.split(", ") if inner else []


article_ids = list() # article_ids[i] to access the article id
mention_ids = list() # mention_ids[i] to access the list of mention ids for the [i]th article
start_ind = list()   # start_ind[i] to access the list of start indices for the [i]th article
end_ind = list()     # end_ind[i] to access the list of end indices for the [i]th article
docs = list()        # list of dictionaries used as the input for the spaCy visualizer;
                     # this has to be outside of the articles iteration
docs_text = list()   # not for displacy, but for showing the mentioned entities with the canonical

# Every article occupies five consecutive lines of `output`: article id, mention ids,
# start indices, end indices, and a fifth line that is not used here.
for k, line in enumerate(output):
    field = k % 5
    if field == 0:
        article_ids.append(line.split(":")[1])
    elif field == 1:
        mention_ids.append(_parse_bracket_list(line))
    elif field == 2:
        start_ind.append([int(x) for x in _parse_bracket_list(line)])
    elif field == 3:
        end_ind.append([int(x) for x in _parse_bracket_list(line)])

# Fail here, with a clear message, rather than with an IndexError in a later cell.
assert len(article_ids) == len(mention_ids) == len(start_ind) == len(end_ind), \
    "incomplete article record in the NIO output"
        


In [31]:
##  3.2: iterate over the articles and build their annotations
article_path = "/Users/yidesdo21/Projects/inputs/articles/04_ptc_ten_recent/articles-txt-format/"
# article_path = "/Users/yidesdo21/Projects/inputs/articles/03_tests_for_dictionary/"
article_num = len(article_ids)
titles = dict()  # second "_"-separated part of an article id -> full article id

# (Re)create the result lists here so that re-running this cell does not append
# every article a second time.
docs = list()       # displaCy input: [{"title", "text", "ents": [{"start", "end", "label"}]}]
docs_text = list()  # JSON export: same articles, entities with mention text and grounding

for i in range(article_num):  # iterate over the i[th] article
    with open(article_path+str(article_ids[i])+".txt") as f:  # use article ids here
        # Only the first line of the file is used; the slicing below applies the
        # NIO start/end indices to that line.
        article_text = f.readlines()[0]

    # for the i[th] article, get the lists of mention ids, starting and ending indices
    mention_ids_for_i = mention_ids[i]
    start_ind_for_i = start_ind[i]
    end_ind_for_i = end_ind[i]

    ## step 4: visualize the text span with the annotations by using spaCy visualizer
    # displaCy's manual mode expects a list of dictionaries such as
    # ex = [{"text": "But Google is starting from behind.",
    #        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    #        "title": "Test"}]
    # html = displacy.render(ex, style="ent", manual=True, options={"colors":{"ORG":"yellow"}})

    title_mapping = article_ids[i].split("_")
    if len(title_mapping) > 1:   ## this is for the database_identifier format only
        # lets the PTC cell map its identifiers back to the database_identifier title
        titles[title_mapping[1]] = article_ids[i]

    ents_list = list()       # for docs (displaCy)
    ents_list_text = list()  # for docs_text (JSON)

    ## the annotator reports duplicates, e.g. the same line twice:
    # NDDUO:Study_type 549 554 study
    start_end = set()     # spans already handed to displaCy
    seen_records = set()  # (start, end, mention id) triples already exported to JSON

    for anno_cnt, mentioned_id in enumerate(mention_ids_for_i):  # every annotation of the article
        start_index = start_ind_for_i[anno_cnt]
        end_index = end_ind_for_i[anno_cnt]
        mentioned_entity = article_text[start_index:end_index]
        canonical = token_cano[mentioned_id]

        if (start_index, end_index) not in start_end:
            start_end.add((start_index, end_index))
            # The label is left empty for illustration reasons; use
            # mentioned_id+"|"+canonical to display the grounding instead.
            ents_list.append({"start": start_index, "end": end_index, "label": ""})

        # Mention text and concept are fully determined by the span and the id, and
        # type/annotator are constant, so this triple identifies the whole record.
        record_key = (start_index, end_index, mentioned_id)
        if record_key in seen_records:
            continue
        seen_records.add(record_key)
        ents_list_text.append({
            "span": (start_index, end_index),
            "mention": mentioned_entity,
            "identifier": mentioned_id,   # the mention after grounding: ontological identifier
            "concept": canonical,         # the mention after grounding: ontological concept
            "type": "",
            "annotator": "NIO",
        })

    docs.append({"title": article_ids[i], "text": article_text, "ents": ents_list})
    docs_text.append({"title": article_ids[i], "text": article_text, "ents": ents_list_text})
    
    

{'span': (0, 3), 'mention': 'Tau', 'identifier': 'AlzheimerOntology:t_tau', 'concept': 't-tau', 'type': '', 'annotator': 'NIO'}
--------
{'span': (20, 29), 'mention': 'filaments', 'identifier': 'AlzheimerOntology:Filaments', 'concept': 'Filament', 'type': '', 'annotator': 'NIO'}
--------
{'span': (58, 62), 'mention': 'core', 'identifier': 'obo:IAO_0000224', 'concept': 'core', 'type': '', 'annotator': 'NIO'}
--------
{'span': (81, 90), 'mention': 'filaments', 'identifier': 'AlzheimerOntology:Filaments', 'concept': 'Filament', 'type': '', 'annotator': 'NIO'}
--------
{'span': (94, 113), 'mention': "Alzheimer's disease", 'identifier': 'AlzheimerOntology:Subtypes', 'concept': 'Alzheimer disease', 'type': '', 'annotator': 'NIO'}
--------
{'span': (94, 105), 'mention': "Alzheimer's", 'identifier': 'AlzheimerOntology:Subtypes', 'concept': 'Alzheimer disease', 'type': '', 'annotator': 'NIO'}
--------
{'span': (94, 103), 'mention': 'Alzheimer', 'identifier': 'AlzheimerOntology:Subtypes', 'conce

In [33]:
def write_to_json(path, doc, span_raw):
    """Write annotated documents to a pretty-printed JSON file.

    The documents are sorted by their ``"title"`` and written to
    ``<path>/<stem>.json``, where ``<stem>`` is ``span_raw`` up to its first ``"."``.

    inputs:
        path: the directory that receives the json file (a trailing separator is optional)
        doc: list of dictionaries to be written; must be JSON serialisable
        span_raw: the file name of the raw text spans; only the part before the
                  first "." is used for the output name
    output: a json file written in the output path (the function returns None)
    """
    options = jsbeautifier.default_options()
    options.indent_size = 8

    # A title can be missing or None; None cannot be compared with str, so such
    # documents are ordered as if their title were the empty string.
    sorted_doc = sorted(doc, key=lambda d: d.get('title') or '')
    json_object = jsbeautifier.beautify(json.dumps(sorted_doc), options)

    # Join with Path instead of string concatenation so that a directory given
    # without a trailing slash does not end up as a prefix of the file name.
    target = Path(path) / (span_raw.split(".")[0] + '.json')
    with open(target, 'w') as fp:
        fp.write(json_object)

In [34]:
# Directory that receives every JSON export of this notebook.
result_path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/json/"

# Export the NIO annotations; the file is named after the NIO span report.
write_to_json(path=result_path, doc=docs_text, span_raw=output_file)

In [7]:
# Show the NIO annotations; no label-specific colours are configured.
nio_render_options = {"colors": {}}
displacy.render(docs, style="ent", manual=True, options=nio_render_options)

In [36]:
# Location of the PubTator file produced by the PTC annotator.
ptc_path = "/Users/yidesdo21/Projects/ptc/Code.Python/output/"
ptc_file = "ten_recent.PubTator"

# All lines of the file, without their line terminators.
ptc_output = Path(ptc_path + ptc_file).read_text().splitlines()

In [38]:
ptc_outputs = list()  # one list of lines per abstract, each list terminated by ""
ptc_abs = list()      # lines of the abstract currently being collected

for line in ptc_output:
    if line:
        ptc_abs.append(line)
        continue

    # A blank line closes the current abstract. Leading, consecutive or trailing
    # blank lines do not create empty groups, so nothing has to be trimmed from
    # the result afterwards (and no real abstract can be trimmed by accident).
    if ptc_abs:
        ptc_abs.append(line)   # keep the terminating "" so every group has the same shape as before
        ptc_outputs.append(ptc_abs)
        ptc_abs = list()

# The file did not end with a blank line: flush the last abstract as well.
if ptc_abs:
    ptc_abs.append("")
    ptc_outputs.append(ptc_abs)
    ptc_abs = list()

In [39]:
ptc_docs = list()       # displaCy input, one dictionary per abstract
ptc_docs_text = list()  # for the json
ptc_cnt = len(ptc_outputs)

for one_abs in ptc_outputs:   # iterate over each abstract
    title = None
    text = ""
    ents = list()
    ents_text = list()  # for the json

    for line in one_abs:   # iterate over each line in the abstract
        span_split = line.split("\t")

        if len(span_split) > 1:  # tab-separated: the text span lines
            # Expected fields: id, start, end, mention, type and optionally an identifier.
            # Tab-separated lines that do not have that shape are skipped instead of
            # raising IndexError / ValueError.
            if len(span_split) < 5 or not (span_split[1].isdigit() and span_split[2].isdigit()):
                continue

            start, end = int(span_split[1]), int(span_split[2])
            mention, ent_type = span_split[3], span_split[4]

            # the entity type is used as the displaCy label, for illustrations
            ents.append({"start": start, "end": end, "label": ent_type})
            ents_text.append({
                "span": (start, end),
                "mention": mention,
                # the mention after grounding: ontological identifier
                "identifier": span_split[5] if len(span_split) > 5 else "none",
                "concept": "",   # not included in the PTC output, have to link to the ontologies
                "type": ent_type,
                "annotator": "PTC",
            })
            continue

        # Title ("<id>|t|<text>") or abstract ("<id>|a|<text>") line. Splitting at most
        # twice keeps "|" characters inside the text. Tab-separated lines are handled
        # above, so a "|" inside an annotation's identifier can no longer be mistaken
        # for a title line and overwrite the title.
        line_split = line.split("|", 2)
        if len(line_split) == 3 and line_split[1] in ("t", "a"):
            # Fall back to the raw id when no database_identifier title is known, so
            # the title is never None.
            title = titles.get(line_split[0], line_split[0])
            if line_split[1] == "a":
                # Exactly one character is inserted between title and abstract
                # (original note: otherwise slicing with the PTC offsets will be wrong).
                text += "."
            text += line_split[2]

    if title is None:
        # no title/abstract line in this group, nothing that could be rendered
        continue

    ptc_docs.append({"title": title, "text": text, "ents": ents})
    ptc_docs_text.append({"title": title, "text": text, "ents": ents_text})
            
        
        
        

In [28]:
# Export the PTC annotations next to the NIO ones: same file name with "nio" replaced by "ptc".
ptc_json_name = output_file.replace("nio","ptc")
write_to_json(path=result_path, doc=ptc_docs_text, span_raw=ptc_json_name)
ptc_json_name

In [134]:
# Show the PTC annotations, one colour per entity type.
ptc_colors = {
    "GENE": "aqua",
    "DISEASE": "pink",
    "CHEMICAL": "violet",
    "SNP": "moccasin",
    "SPECIES": "lightgreen",
}
displacy.render(ptc_docs, style="ent", manual=True, options={"colors": ptc_colors})

In [61]:
# NIO annotations - docs_text
# PTC annotations - ptc_docs_text

# Pool the entities of both annotators per (title, text) pair. The PTC entities are
# added first, so after the (stable) sort they precede NIO entities with an equal span.
ents_by_article = defaultdict(list)

for annotated_docs in (ptc_docs_text, docs_text):
    for elem in annotated_docs:
        ents_by_article[(elem['title'], elem['text'])].extend(elem['ents'])

# One dictionary per (title, text) pair, entities ordered by their (start, end) span.
Output = [
    {"title": title, 'text': text, "ents": sorted(pooled, key=lambda ent: ent['span'])}
    for (title, text), pooled in ents_by_article.items()
]

In [65]:
# Export the merged annotations: same file name with "nio" replaced by "combined".
write_to_json(path=result_path, doc=Output, span_raw=output_file.replace("nio","combined"))

In [66]:
# Build the displaCy input for the merged annotations: at most one entity per
# (start, end) span, the first one in the list wins.
filtered_output = list()

for doc in Output:
    seen_spans = set()
    filtered_ents = list()

    for ent in doc["ents"]:
        if "span" in ent:
            # Entities in the JSON layout ("span", "type", ...) do not have the
            # "start" / "end" / "label" keys that displaCy needs, so convert them;
            # the entity type serves as the label.
            start, end = ent["span"]
            display_ent = {"start": start, "end": end, "label": ent.get("type", "")}
        else:
            # already in displaCy layout: keep the entity as it is
            start, end = ent["start"], ent["end"]
            ent["label"]   # still required, as before
            display_ent = ent

        span = (start, end)
        if span in seen_spans:
            continue
        seen_spans.add(span)
        filtered_ents.append(display_ent)

    # NOTE(review): only exact duplicates are removed; partially overlapping spans are
    # kept -- check how displaCy renders them.
    filtered_output.append({"title": doc["title"], "text": doc["text"], "ents": filtered_ents})
    

In [69]:
# Show the de-duplicated, merged annotations, one colour per entity type.
combined_colors = {
    "GENE": "aqua",
    "DISEASE": "pink",
    "CHEMICAL": "violet",
    "SNP": "moccasin",
    "SPECIES": "lightgreen",
}
displacy.render(filtered_output, style="ent", manual=True, options={"colors": combined_colors})