In [1]:
import spacy
from spacy import displacy
import xmltodict
from pathlib import Path
import json
from collections import defaultdict
import jsbeautifier

### The first part is for the NIO annotations: 
- visualize the NIO annotations in displaCy, with the nested annotations removed
- output the full NIO annotations (including the nested ones) in JSON format

In [22]:
# automate the text span reading process
## step 1: read the outputs from the PathogenExperimenter, the outputs look like:
# ['article id:217583',
#  'predicted mention id:[obo:GO_0007613, obo:OGMS_0000031, NDDUO:Study_type, NDDUO:Study_type, obo:NBO_0000175, obo:GO_0007613, obo:BFO_0000015, obo:NBO_0000215, obo:NBO_0000175, obo:GO_0007613, obo:OBI_0000753, obo:BFO_0000015, obo:NBO_0000175, obo:GO_0007613, obo:BFO_0000015, obo:NBO_0000175, obo:GO_0007613]',
#  'predicted mention starts:[64, 382, 448, 448, 525, 546, 645, 725, 742, 763, 820, 1061, 1355, 1384, 1664, 1777, 1798]',
#  'predicted mention ends:[70, 389, 453, 453, 541, 552, 655, 730, 758, 769, 825, 1071, 1371, 1390, 1674, 1793, 1804]',
#  '----------',
## update for step 1: after running the articles in a large volume ~ 1000, the outputs look like:
# [[ISI_000328805100012], [[obo:GO_0007610, obo:NBO_0000313, ...]],
#  [[8, 8, ...]], [[17, 17, ...]], 
#  [NLM_17085785], [[AlzheimerOntology:amyloid_beta_protein, AlzheimerOntology:amyloid_beta_protein, ...]],
#  [[54, 54, ...]], [[66, 66, ...]], ...]
# [[article_title 1], [[UIDs for a1]], [[starting indices for a1]], [[ending indices for a1]],
#  [article_title 2], [[UIDs for a2]], [[starting indices for a2]], [[ending indices for a2]], ...]


# Directory and file name of the raw span dump; `output_file` is reused
# further down to derive the names of the JSON result files.
output_path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/txt/"
# Alternative input, kept for reference:
# output_file = "02_nio_with_canonical_as_variant/02_07_memory_recent_10.txt"
output_file = "nio_ten_recent_spans_nested_case.txt"

# Load the dump as a list of lines and keep only the per-article records.
# The first 14 and the last 2 lines are discarded
# (presumably a log header/footer around the records - TODO confirm against the dump).
with open("".join((output_path, output_file))) as span_dump:
    raw_lines = span_dump.read().splitlines()
output = raw_lines[14:-2]
    

    

In [23]:
## step 2: link mention id to the canonical
##  goal - turn the nio.xml into a dictionary format
##  this is to use the canonical as the label in the spaCy's annotation 
## input format:
# <?xml version="1.0" ?>
# <synonym>
#     <token id="NDDO:NDDO_00000021" canonical="visit">
#         <variant base="visit"/>
#     </token>
#     <token id="NDDO:NDDO_00000022" canonical="diagnostic data">
#         <variant base="diagnostic data"/>
#     </token>
# </synonym>

## expected format:
# {"NDDO:NDDO_00000021": "visit",
#  "NDDO:NDDO_00000022": "diagnostic data"}

xml_path = "/Users/yidesdo21/Projects/inputs/dictionary/nio_ado_case.xml"
# xml_path = "/Users/yidesdo21/Projects/codes/nio_iri_v3_2.xml"

# Decode explicitly as UTF-8 so the result does not depend on the platform default.
with open(xml_path, encoding="utf-8") as f:
    nio_xml = f.read()

# force_list: without it xmltodict returns a single dict (instead of a list)
# when the dictionary holds exactly one <token>, and the loop below would then
# iterate over the dict's keys and fail on cano["@id"].
nio_dict = xmltodict.parse(nio_xml, force_list=("token",))

# A <synonym> root without any <token> child parses to None -> treat as "no tokens".
canos = (nio_dict["synonym"] or {}).get("token", [])

token_cano = dict()  # {"token_id": "canonical"}
for cano in canos:
    token_cano[cano["@id"]] = cano["@canonical"]

In [30]:
## step 3: pull out the mentioned entities by slicing the article with starting and ending indices
##  3.1: create the package for each article, so we can access the mention ids, starting 
##       and ending indices for each article with the same index.
article_ids = list() # article_ids[i] to access the article id
mention_ids = list() # mention_ids[i] to access the list of mention ids for the [i]th article
start_ind = list()   # start_ind[i] to access the list of start indices for the [i]th article
end_ind = list()     # end_ind[i] to access the list of end indices for the [i]th article
docs = list()   # to create the list of dictionaries as the input for spaCy visualizer
                 # this has to be outside of the articles iteration
docs_text = list()   # not for displaCy, but for showing the mentioned entities with the canonical


def _bracket_items(line):
    """Return the ", "-separated items found between the first "[" and the last "]" of `line`.

    An empty pair of brackets ("...:[]") yields an empty list instead of [""],
    so an article without any predicted mention no longer breaks the int()
    conversion of its start/end indices.
    """
    inner = line[line.index("[") + 1:line.rindex("]")].strip()
    return inner.split(", ") if inner else []


# Every article occupies 5 consecutive lines of `output`:
#   0: "article id:<id>"   1: mention ids   2: start indices   3: end indices   4: "----------"
for k, record_line in enumerate(output):
    field = k % 5
    if field == 0:
        article_ids.append(record_line.split(":")[1])
    elif field == 1:
        mention_ids.append(_bracket_items(record_line))
    elif field == 2:
        start_ind.append([int(x) for x in _bracket_items(record_line)])
    elif field == 3:
        end_ind.append([int(x) for x in _bracket_items(record_line)])
    # field == 4 is the separator line and carries no data
        


In [31]:
##  3.2: iterate each article to create the annotation 
article_path = "/Users/yidesdo21/Projects/inputs/articles/04_ptc_ten_recent/articles-txt-format/" 
# article_path = "/Users/yidesdo21/Projects/inputs/articles/03_tests_for_dictionary/"
article_num = len(article_ids)
titles = dict()   # identifier part of "<database>_<identifier>" -> full article id

# Start from empty result lists: this cell only appends, so without the reset
# re-running it would add every article a second time.
docs = list()        # displaCy input: one {"title", "text", "ents"} dict per article
docs_text = list()   # JSON export: same layout, but with fully described entities

for i in range(article_num):  # iterate over each of the i[th] article
    article_id = article_ids[i]
    # explicit UTF-8 so that decoding does not depend on the platform default
    with open(article_path+str(article_id)+".txt", encoding="utf-8") as f:  # use article ids here
        article = f.readlines()
    article_text = article[0]  # article is list of string, ["article"]; only its first line is used

    # for the i[th] article, get the lists of mention ids, starting and ending indices
    mention_ids_for_i = mention_ids[i]
    start_ind_for_i = start_ind[i]
    end_ind_for_i = end_ind[i]
    if not (len(mention_ids_for_i) == len(start_ind_for_i) == len(end_ind_for_i)):
        raise ValueError("article %s: mention ids, starts and ends differ in length" % article_id)

    title_mapping = article_id.split("_")
    if len(title_mapping) > 1:   ## this is for the database_identifier format only
        # to help the PTC titles to map from identifiers to the database_identifier
        titles[title_mapping[1]] = article_id

    ## step 4: visualize the text span with the annotations by using spaCy visualizer
    # create the list of dictionaries as the input for spaCy visualizer
    # ex = [{"text": "But Google is starting from behind.",
    #        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    #        "title": "Test"}]
    # html = displacy.render(ex, style="ent", manual=True, options={"colors":{"ORG":"yellow"}})
    ents_list = list()       # for docs: one entry per distinct (start, end) range
    ents_list_text = list()  # for docs_text: one entry per distinct annotation

    ## there are duplications in the raw output, e.g.
    # NDDUO:Study_type 549 554 study
    # ----------
    # NDDUO:Study_type 549 554 study
    start_end = set()          # (start, end) ranges already given to displaCy
    seen_annotations = set()   # (start, end, identifier) already exported to the JSON

    # iterate over all entity annotations in the i[th] article
    for mentioned_id, start_index, end_index in zip(mention_ids_for_i, start_ind_for_i, end_ind_for_i):
        mentioned_entity = article_text[start_index:end_index]
        canonical = token_cano[mentioned_id]

        if (start_index, end_index) not in start_end:
            start_end.add((start_index, end_index))
            # empty label for illustration reasons;
            # alternative: mentioned_id+"|"+canonical (canonical taken from the xml dictionary)
            ents_list.append({"start": start_index, "end": end_index, "label": ""})

        # The mention text follows from the span and the concept from the identifier,
        # so this key identifies exactly the same duplicates as comparing the whole
        # dictionaries, without rescanning the list for every annotation.
        annotation_key = (start_index, end_index, mentioned_id)
        if annotation_key not in seen_annotations:
            seen_annotations.add(annotation_key)
            ents_list_text.append({
                "span": (start_index, end_index),
                "mention": mentioned_entity,
                "identifier": mentioned_id,    # the mention after grounding: ontological identifier
                "concept": canonical,          # the mention after grounding: ontological concept
                "type": "",
                "annotator": "NIO",
            })

    docs.append({"title": article_id, "text": article_text, "ents": ents_list})
    docs_text.append({"title": article_id, "text": article_text, "ents": ents_list_text})
    
    

{'span': (0, 3), 'mention': 'Tau', 'identifier': 'AlzheimerOntology:t_tau', 'concept': 't-tau', 'type': '', 'annotator': 'NIO'}
--------
{'span': (20, 29), 'mention': 'filaments', 'identifier': 'AlzheimerOntology:Filaments', 'concept': 'Filament', 'type': '', 'annotator': 'NIO'}
--------
{'span': (58, 62), 'mention': 'core', 'identifier': 'obo:IAO_0000224', 'concept': 'core', 'type': '', 'annotator': 'NIO'}
--------
{'span': (81, 90), 'mention': 'filaments', 'identifier': 'AlzheimerOntology:Filaments', 'concept': 'Filament', 'type': '', 'annotator': 'NIO'}
--------
{'span': (94, 113), 'mention': "Alzheimer's disease", 'identifier': 'AlzheimerOntology:Subtypes', 'concept': 'Alzheimer disease', 'type': '', 'annotator': 'NIO'}
--------
{'span': (94, 105), 'mention': "Alzheimer's", 'identifier': 'AlzheimerOntology:Subtypes', 'concept': 'Alzheimer disease', 'type': '', 'annotator': 'NIO'}
--------
{'span': (94, 103), 'mention': 'Alzheimer', 'identifier': 'AlzheimerOntology:Subtypes', 'conce

In [14]:
# docs

In [8]:
# docs_text

In [32]:
docs_text

[{'title': 'NLM_31721178',
  'text': "Tau (297-391) forms filaments that structurally mimic the core of paired helical filaments in Alzheimer's disease brain.The constituent paired helical filaments (PHFs) in neurofibrillary tangles are insoluble intracellular deposits central to the development of Alzheimer's disease (AD) and other tauopathies. Full-length tau requires the addition of anionic cofactors such as heparin to enhance assembly. We have shown that a fragment from the proteolytically stable core of the PHF, tau 297-391 known as 'dGAE', spontaneously forms cross-β-containing PHFs and straight filaments under physiological conditions. Here, we have analysed and compared the structures of the filaments formed by dGAE in\xa0vitro with those deposited in the brains of individuals diagnosed with AD. We show that dGAE forms PHFs that share a macromolecular structure similar to those found in brain tissue. Thus, dGAEs may serve as a model system for studying core domain assembly and 

In [33]:
def write_to_json(path, doc, span_raw):
    """Write the annotations to a beautified json file.

    inputs:
        path: the directory of the output json file (with or without a
              trailing path separator)
        doc: list of dictionaries, each with a 'title' key; the list is
             written sorted by 'title'
        span_raw: the file name of the raw text spans; only its last
                  extension is replaced by ".json", so dots elsewhere in the
                  name (or in a leading sub-directory) are preserved
    output: a json file written in the output path
    """
    options = jsbeautifier.default_options()
    options.indent_size = 8

    sorted_doc = sorted(doc, key=lambda d: d['title'])
    json_object = jsbeautifier.beautify(json.dumps(sorted_doc), options)

    # Path joining instead of string concatenation: cutting the name at its
    # first "." truncated names such as "run.v2.txt" or "./run.txt", and a
    # `path` without trailing separator ended up glued to the file name.
    target = Path(path) / Path(span_raw).with_suffix(".json")
    with open(target, 'w', encoding="utf-8") as fp:
        fp.write(json_object)

In [34]:
result_path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/json/"

In [35]:
write_to_json(result_path, docs_text, output_file)

In [80]:
# options = jsbeautifier.default_options()
# options.indent_size = 8

In [81]:
# # export docs_text to json: either 1) write a json object outside of the with open... by using json.dumps
# #                       or 2) use with open ... and write json.dump within the with open
# # docs_text has article, mentioned entity, mentioned id, canonical, and entity text span
# # result_path = "/Users/yidesdo21/Projects/outputs/03_annotation_with_articles/json_with_entity/"
# result_path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/json/"
# json_object = jsbeautifier.beautify(json.dumps(docs_text), options)

# with open(result_path+output_file.split(".")[0]+'.json', 'w') as fp:
#     fp.write(json_object)

In [89]:
output_file

'nio_ten_recent_spans_nested_case.txt'

### Results from the NIO annotator

In [7]:
# need to use docs, rather than docs_text 
displacy.render(docs, style="ent", manual=True, options={"colors":{}})

### The second part for the PTC annotations
- visualize the PTC annotations in displaCy, with the nested annotations removed
- output the full PTC annotations (including the nested ones) in JSON format

In [None]:
# goal: visualize the text span with the annotations by using spaCy visualizer
# create the list of dictionaries as the input for spaCy visualizer
# ex = [{"text": "But Google is starting from behind.",
#        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
#        "title": "Test"}]
# html = displacy.render(ex, style="ent", manual=True, options={"colors":{"ORG":"yellow"}})

In [None]:
# ptc annotations: 
# 34060233|t|[Title]
# 34060233|a|[Abstract]
# 34060233	4	8	BIN1	Gene	274
# 34060233	9	17	rs744373	SNP	rs744373

In [36]:
ptc_path = "/Users/yidesdo21/Projects/ptc/Code.Python/output/"
ptc_file = "ten_recent.PubTator"

In [37]:
with open(ptc_path+ptc_file) as f:
    ptc_output = f.read().splitlines()

In [38]:
ptc_outputs = list()  # one list of lines per abstract, each ending with the empty separator line
ptc_abs = list()

for line in ptc_output:
    ptc_abs.append(line)

    if len(line) == 0:    # a blank line closes the current abstract
        ptc_outputs.append(ptc_abs)
        ptc_abs = list()

# The file may end without a closing blank line: keep the pending abstract and
# add the empty line that marks the end of an abstract for the parsing loop
# further down.
if ptc_abs:
    ptc_outputs.append(ptc_abs + [""])
    ptc_abs = list()

# Drop groups that contain only blank lines (e.g. an extra blank line at the
# end of the file). Unlike an unconditional [:-1], this never discards a real
# abstract when that extra blank line is missing.
ptc_outputs = [group for group in ptc_outputs if any(group)]

#### create a map to link the identifiers to the ontological concepts:
- Genes/proteins → NCBI Gene identifiers
- Genetic variants → dbSNP RS identifiers
- Species → NCBI Taxonomy identifiers
- Diseases → MeSH identifiers
- Chemicals → MeSH identifiers
- Cell lines → Cellosaurus

In [19]:
# titles

In [20]:
# ptc_output

In [39]:
ptc_docs = list()       # displaCy input: one {"title", "text", "ents"} dict per abstract
ptc_docs_text = list()  # for the json
ptc_cnt = len(ptc_outputs)

for i in range(ptc_cnt):   # iterate over each abstract 
    one_abs = ptc_outputs[i]
    abs_dict = dict()
    abs_dict_text = dict()  # for the json
    text = ""
    ents = list()
    ents_text = list()  # for the json

    for line in one_abs:   # iterate over each line in the abstract
        # Title/abstract lines look like "<pmid>|t|<text>" / "<pmid>|a|<text>".
        # maxsplit=2 keeps any "|" inside the text itself, and the marker/tab checks
        # prevent a tab-separated annotation line whose fields contain "|" from being
        # mistaken for a text line (which would overwrite the title).
        line_split = line.split("|", 2)
        is_text_line = (len(line_split) == 3
                        and line_split[1] in ("t", "a")
                        and "\t" not in line_split[0])
        if is_text_line:
            abs_dict["title"] = titles.get(line_split[0])
            abs_dict_text["title"] = titles.get(line_split[0])

            if line_split[1] == "t":
                text += line_split[2]
            else:
                text += "."  # can't add a space because of the PTC annotations, the slicing will be wrong
                text += line_split[2]
                abs_dict["text"] = text
                abs_dict_text["text"] = text

        # Annotation lines: <pmid> TAB <start> TAB <end> TAB <mention> TAB <type> [TAB <identifier>]
        span_split = line.split("\t")
        if len(span_split) > 1:  # the text span lines 
            start, end = int(span_split[1]), int(span_split[2])
            mention, ent_type = span_split[3], span_split[4]

            # displaCy entry: the entity type is used as the label (for illustrations)
            ents.append({"start": start, "end": end, "label": ent_type})

            ents_text.append({
                "span": (start, end),
                "mention": mention,
                # the mention after grounding: ontological identifier, "none" when PTC gives no identifier
                "identifier": span_split[5] if len(span_split) > 5 else "none",
                "concept": "",         # not included in the PTC output, have to link to the ontologies
                "type": ent_type,
                "annotator": "PTC",
            })

        if len(line) == 0:  # end of this abstract
            abs_dict["ents"] = ents
            ptc_docs.append(abs_dict)
            abs_dict_text["ents"] = ents_text
            ptc_docs_text.append(abs_dict_text)
            
        
        
        

In [40]:
len(ptc_docs_text)

10

In [41]:
ptc_docs_text

[{'title': 'NLM_34060233',
  'text': "The BIN1 rs744373 Alzheimer's disease risk SNP is associated with faster Aβ-associated tau accumulation and cognitive decline.INTRODUCTION: The BIN1 rs744373 single nucleotide polymorphism (SNP) is a key genetic risk locus for Alzheimer's disease (AD) associated with tau pathology. Because tau typically accumulates in response to amyloid beta (Aβ), we tested whether BIN1 rs744373 accelerates Aβ-related tau accumulation. METHODS: We included two samples (Alzheimer's Disease Neuroimaging Initiative [ADNI], n\xa0=\xa0153; Biomarkers for Identifying Neurodegenerative Disorders Early and Reliably [BioFINDER], n\xa0=\xa063) with longitudinal (18) F-Flortaucipir positron emission tomography (PET), Aβ biomarkers, and longitudinal cognitive assessments. We assessed whether BIN1 rs744373 was associated with faster tau-PET accumulation at a given level of Aβ and whether faster BIN1 rs744373-associated tau-PET accumulation mediated cognitive decline. RESULTS: 

In [27]:
len(ptc_docs)

10

In [28]:
write_to_json(result_path, ptc_docs_text, output_file.replace("nio","ptc"))

In [91]:
output_file.replace("nio","ptc")

'ptc_ten_recent_spans_nested_case.txt'

In [134]:
displacy.render(ptc_docs, style="ent", manual=True, 
                options={"colors":{"GENE":"aqua", "DISEASE":"pink", "CHEMICAL":"violet",
                                   "SNP":"moccasin", "SPECIES":"lightgreen"}})

### combine the annotation spans from NIO and PTC
- unfiltered version: all duplicates are kept
- filtered version: when two annotations cover exactly the same text span, only one is kept, and its grounding is PTC's. Annotations that share a starting index but have different ending indices are not removed. Duplicates are also not removed for mentions with different grounded results (different identifiers or different ontological concepts).

#### the combined annotations without filtering duplications

In [61]:
# NIO annotations - docs_text
# PTC annotations - ptc_docs_text

# Group the entities of both annotators per article; an article is identified
# by its (title, text) pair. PTC entities are added before the NIO ones.
a = defaultdict(list)

for annotated_docs in (ptc_docs_text, docs_text):
    for entry in annotated_docs:
        article_key = (entry['title'], entry['text'])
        a[article_key].extend(entry['ents'])

# One dictionary per article, with its merged entities ordered by span.
Output = [
    {"title": key[0], 'text': key[1], "ents": sorted(merged, key=lambda ent: ent['span'])}
    for key, merged in a.items()
]

In [62]:
len(Output)

10

In [63]:
Output[0]

{'title': 'NLM_34060233',
 'text': "The BIN1 rs744373 Alzheimer's disease risk SNP is associated with faster Aβ-associated tau accumulation and cognitive decline.INTRODUCTION: The BIN1 rs744373 single nucleotide polymorphism (SNP) is a key genetic risk locus for Alzheimer's disease (AD) associated with tau pathology. Because tau typically accumulates in response to amyloid beta (Aβ), we tested whether BIN1 rs744373 accelerates Aβ-related tau accumulation. METHODS: We included two samples (Alzheimer's Disease Neuroimaging Initiative [ADNI], n\xa0=\xa0153; Biomarkers for Identifying Neurodegenerative Disorders Early and Reliably [BioFINDER], n\xa0=\xa063) with longitudinal (18) F-Flortaucipir positron emission tomography (PET), Aβ biomarkers, and longitudinal cognitive assessments. We assessed whether BIN1 rs744373 was associated with faster tau-PET accumulation at a given level of Aβ and whether faster BIN1 rs744373-associated tau-PET accumulation mediated cognitive decline. RESULTS: BI

In [64]:
result_path

'/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/json/'

In [65]:
write_to_json(result_path, Output, output_file.replace("nio","combined"))

#### the combined annotations after filtering duplications

In [66]:
filtered_output = list()   # one {"title", "text", "ents"} dict per article, renderable by displaCy

for i in Output:
    filtered_ents = list()
    start_end_set = set()   # (start, end) ranges already kept for this article

    for ent in i["ents"]:
        # The combined entities describe their position with "span" and their
        # category with "type"; they have no "start"/"end"/"label" keys.
        # Entries that already use the displaCy keys are accepted as well.
        if "span" in ent:
            start, end = ent["span"]
        else:
            start, end = ent["start"], ent["end"]

        span = (start, end)
        if span in start_end_set:
            # Same text span as an entity kept earlier: `Output` lists the PTC
            # entities before the NIO ones for equal spans (PTC was added first
            # and sorting is stable), so the PTC grounding is the one retained.
            continue
        start_end_set.add(span)

        # Keep the full annotation and add the keys displaCy needs to draw it.
        filtered_ents.append({**ent,
                              "start": start,
                              "end": end,
                              "label": ent.get("label", ent.get("type", ""))})

    filtered_output.append({"title": i["title"], "text": i["text"], "ents": filtered_ents})
    

In [68]:
filtered_output

[{'title': 'NLM_34060233',
  'text': "The BIN1 rs744373 Alzheimer's disease risk SNP is associated with faster Aβ-associated tau accumulation and cognitive decline.INTRODUCTION: The BIN1 rs744373 single nucleotide polymorphism (SNP) is a key genetic risk locus for Alzheimer's disease (AD) associated with tau pathology. Because tau typically accumulates in response to amyloid beta (Aβ), we tested whether BIN1 rs744373 accelerates Aβ-related tau accumulation. METHODS: We included two samples (Alzheimer's Disease Neuroimaging Initiative [ADNI], n\xa0=\xa0153; Biomarkers for Identifying Neurodegenerative Disorders Early and Reliably [BioFINDER], n\xa0=\xa063) with longitudinal (18) F-Flortaucipir positron emission tomography (PET), Aβ biomarkers, and longitudinal cognitive assessments. We assessed whether BIN1 rs744373 was associated with faster tau-PET accumulation at a given level of Aβ and whether faster BIN1 rs744373-associated tau-PET accumulation mediated cognitive decline. RESULTS: 

In [69]:
displacy.render(filtered_output, style="ent", manual=True, 
                options={"colors":{"GENE":"aqua", "DISEASE":"pink", "CHEMICAL":"violet",
                                   "SNP":"moccasin", "SPECIES":"lightgreen"}})

### ignore the lines below

In [33]:
# html = displacy.render(docs, style="ent", manual=True, options={"colors":{}})

In [32]:
# # errors for the html
# html_path = "/Users/yidesdo21/Projects/outputs/03_annotation_with_articles/html/"
# with open(html_path+"test.html", "w") as f:
#     f.write(html)

In [11]:
ex = [{"text": "But Google is starting from behind.",
       "ents": [{"start": 4, "end": 10, "label": "ORG"},
                {"start": 4, "end": 13, "label": "TEST"}
               ],
       "title": "Test"},]
#      {"text": "But Google is starting from behind.",
#        "ents": [
# #                 {"start": 4, "end": 10, "label": "ORG"},
#                 {"start": 4, "end": 13, "label": "TEST"}
#                ],
#        "title": "Test"}]
html = displacy.render(ex, style="ent", manual=True, options={"colors":{}})
#                        {"colors":{"ORG":"yellow"}})

In [18]:
from spacy.tokens import Doc

Doc.set_extension('my_ents', default=None, force=True)

def move_ents_to_attr(doc):
    """Move the entities of `doc` into its `my_ents` extension attribute.

    The entities currently in `doc.ents` are appended to `doc._.my_ents`
    (created as an empty list when it is still None), `doc.ents` is then
    emptied, and the same `doc` object is returned.
    """
    extension = doc._
    if extension.my_ents is None:
        extension.my_ents = list()
    extension.my_ents.extend(doc.ents)
    doc.ents = []
    return doc


# Load the small English pipeline and register move_ents_to_attr as its last step.
# NOTE(review): passing the function object itself to add_pipe is the spaCy v2
# calling convention; spaCy v3 expects the string name of a registered
# component - confirm which spaCy version this notebook runs on.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(move_ents_to_attr, last=True)


In [38]:
nlp = spacy.load("en_core_web_sm")

In [39]:
# text = "But Google is starting from behind."
text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
doc = nlp(text)


In [40]:
doc

When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.

In [41]:
displacy.render(doc, style="ent", options={"colors":{}})