###  Create a system that evaluates if identifiers of `indication_paths.json` have the correct `name` . 
Create a simple script that queries each ID against some authoritative resource(s)

In [2]:
#Import libraries
import requests
import yaml
import time
import tqdm
import pandas as pd
import csv
import os.path
from api_access import * #Apis of authority resources
from collections import Counter

### Input nodes

In [14]:
identifiers = []
prefix_id = []

# Open original indication_paths.yml file
with open("./../indication_paths.yaml", "r") as indication_paths:
    paths = yaml.load(indication_paths, Loader=yaml.CBaseLoader)
    paths_test = paths


    # Loop for each idication in  
    for ind in tqdm.tqdm((paths_test)):
        #Get nodes of indications
        nodes = ind["nodes"]
        
        for n in nodes:
            if n["id"] not in identifiers:
                identifiers.append(n["id"])
                prefix_id.append(n["id"].split(":")[0])

100%|█████████████████████████████████████████████████████████| 4549/4549 [00:00<00:00, 5592.08it/s]


In [15]:
"Number of unique nodes:",len((identifiers))

('Number of unique nodes:', 5225)

Count nodes by prefix (all ids not as set)

In [4]:
Counter(prefix_id)

Counter({'MESH': 2514,
         'UniProt': 803,
         'GO': 837,
         'REACT': 91,
         'UBERON': 137,
         'NCBITaxon': 166,
         'TIGR': 1,
         'InterPro': 117,
         'HP': 262,
         'CL': 40,
         'CHEBI': 129,
         'DB': 107,
         'PR': 5,
         'Pfam': 16})

### As reference, list of accessed resources: 

 'MESH': https://id.nlm.nih.gov/mesh/ <br>
 'UniProt': https://www.ebi.ac.uk/proteins/api/ <br>
 'GO': http://api.geneontology.org/api/ <br>
 'REACT': https://nodenormalization-sri.renci.org/ <br>
 'UBERON': https://nodenormalization-sri.renci.org/ <br>
 'NCBITaxon': https://nodenormalization-sri.renci.org/ <br>
 'TIGR': *<br>
 'InterPro': https://www.ebi.ac.uk/interpro/api/protein/reviewed/entry/interpro/<br>
 'HP': https://nodenormalization-sri.renci.org/<br>
 'CL': https://nodenormalization-sri.renci.org/<br>
 'CHEBI': http://mychem.info/v1/chem/<br>
 'DB': https://go.drugbank.com/releases/latest#open-data ⬅️ Download data <br>
 'PR': https://nodenormalization-sri.renci.org/<br>
 'Pfam': https://www.ebi.ac.uk/interpro/api/entry/pfam/

### Iterate over each node and send request to corresponding authoritative API resource
Estimate time of runing 5 hours. <br>
Run `02_validate_identifier_names.py` python script

In [None]:
#Define lists

#updated indications
indication_paths_updated = []

#dictionary preferred names
pref_name_dict = dict()

output_file = "preferred_node_names.csv"
error_identifiers_file = "error_ids.csv"


file_exists = os.path.isfile(output_file)
error_file_exists = os.path.isfile(error_identifiers_file)

with open(output_file, 'a', newline='') as outputfile:
    with open(error_identifiers_file, 'a', newline='') as error_file:
        fieldnames = ['graph_id', 'node_id', 'original_node_name', "preferred_node_name"]
        writer = csv.DictWriter(outputfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
            
        error_fieldnames = ['graph_id', 'node_id']
        error_writer = csv.DictWriter(error_file, fieldnames=error_fieldnames)
        if not error_file_exists:
            error_writer.writeheader()
        

        # Open original indication_paths.yml file
        with open("./../indication_paths.yaml", "r") as indication_paths:
            paths = yaml.load(indication_paths, Loader=yaml.CBaseLoader)
            paths_test = paths


            # Loop for each idication in  
            for ind in tqdm.tqdm((paths_test)):
                #Get nodes of indications
                nodes = ind["nodes"]
                

                #call access_api function depending on identifier prefix 
                for n in nodes:
                    #time.sleep(0.5)
                    
                    #if identifier already saved in output file. Keep it and continue. 
                    saved_results = open(output_file, 'r').read()
                    if n["id"] in saved_results: 
                        print("Already_saved: ", n["id"])
                    
                    #if identifier is not saved in file
                    else:
                        prefix = n["id"].split(":")[0] #Get prefix

                        try:
                            #if node was already evaluated and saved in dictionary. 
                            if n["id"] in pref_name_dict: 
                                print("Already_saved: ", n["id"])
                                
                            #if node identifier hasnt been evaluated:
                            else: 
                                if prefix == "MESH": 
                                    #send request
                                    preferred_name = access_MESH_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ##write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)


                                if prefix == "UniProt":
                                    #send request
                                    preferred_name = access_UniProt_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ##write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)


                                if prefix == "GO":
                                    #send request
                                    preferred_name = access_GO_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ##write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)

                                if prefix == "CHEBI":  
                                    #send request
                                    preferred_name = access_mychem_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ##write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)

                                list_prefix_for_node_normalizer = ["MESH","HP","NCBITaxon", "UBERON","REACT", "PR"]
                                if prefix in list_prefix_for_node_normalizer: 
                                    #send request
                                    preferred_name = access_nodenormalizer_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ###write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)
                                
                                if prefix == "InterPro":  
                                    #send request
                                    preferred_name = access_interpro_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ##write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)
                                
                                if prefix == "Pfam":  
                                    #send request
                                    preferred_name = access_pfam_API(n["id"])

                                    #to dict
                                    pref_name_dict[n["id"]] = preferred_name

                                    ##write outputfile
                                    write_output_file(writer,ind["graph"]["_id"],n["id"], n["name"], preferred_name)


                            print("Successful response: ", n["id"] )

                        #Error
                        except: 
                            error_saved_results = open(error_identifiers_file, 'r').read()
                            if n["id"] in error_saved_results: 
                                print("Already_saved as error: ", n["id"])
                                
                            else:
                                error_writer.writerow({"graph_id": ind["graph"]["_id"], "node_id":n["id"]})
                                print("Error requesting: ", n["id"])


### Integrate DB data
Resource: https://go.drugbank.com/releases/latest#open-data

In [None]:
#open drugBank file 

#Released on: 2022-01-03
db_data = pd.read_csv("drugbank vocabulary.csv")
db_data.head()

In [None]:
output_file = "preferred_node_names.csv"
error_identifiers_file = "error_ids.csv"

file_exists = os.path.isfile(output_file)
error_file_exists = os.path.isfile(error_identifiers_file)

nodes_evaluated =[]
with open(output_file, 'a', newline='') as outputfile:
    with open(error_identifiers_file, 'a', newline='') as error_file:
        fieldnames = ['graph_id', 'node_id', 'original_node_name', "preferred_node_name"]
        writer = csv.DictWriter(outputfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
            
        error_fieldnames = ['graph_id', 'node_id']
        error_writer = csv.DictWriter(error_file, fieldnames=error_fieldnames)
        if not error_file_exists:
            error_writer.writeheader()

    with open("./../indication_paths.yaml", "r") as indication_paths:
            paths = yaml.load(indication_paths, Loader=yaml.CBaseLoader)
            paths_test = paths

            # Loop for each idication in  
            for ind in tqdm.tqdm((paths_test)):

                #Get nodes of indications
                nodes = ind["nodes"]
                nodes = list(filter(lambda node: node["id"].startswith("DB:"), nodes))
                
                for node in nodes:
                    saved_results = open(output_file, 'r').read()
                    if node["id"] in saved_results: 
                        print("Already_saved: ", node["id"])
                   
                    else:
                        if node["id"] not in nodes_evaluated:
                            node_db = node['id'].split("DB:")[1]

                            if (db_data["DrugBank ID"] == node_db).any():
                                preferred_name = db_data[db_data["DrugBank ID"] == node_db]["Common name"].iloc[0]
                                nodes_evaluated.append(node['id'])

                                if preferred_name:
                                    ##write outputfile
                                        write_output_file(writer,ind["graph"]["_id"],node["id"], node["name"], preferred_name)

                                else:
                                    error_saved_results = open(error_identifiers_file, 'r').read()
                                    if n["id"] in error_saved_results: 
                                        print("Already_saved as error: ", n["id"])

                                    else:
                                        error_writer.writerow({"graph_id": ind["graph"]["_id"], "node_id":node["id"]})
                                        print("Error requesting: ", n["id"])



### Check results 

In [9]:
#preferred names 
pref_nodes = pd.read_csv("preferred_node_names.csv")
pref_nodes

Unnamed: 0,graph_id,node_id,original_node_name,preferred_node_name
0,DB00619_MESH_D015464_1,MESH:D000068877,imatinib,Imatinib Mesylate
1,DB00619_MESH_D015464_1,UniProt:P00519,BCR/ABL,Tyrosine-protein kinase ABL1
2,DB00619_MESH_D015464_1,MESH:D015464,CML (ph+),"Leukemia, Myelogenous, Chronic, BCR-ABL Positive"
3,DB00619_MESH_D034721_1,UniProt:P10721,c-Kit,Mast/stem cell growth factor receptor Kit
4,DB00619_MESH_D034721_1,UniProt:P16234,Pdgf,Platelet-derived growth factor receptor alpha
...,...,...,...,...
5141,DB04855_MESH_D001281_1,MESH:C118667,dronedarone,[OBSOLETE] dronedarone
5142,DB12243_MESH_D000690_1,MESH:C005435,edaravone,EDARAVONE
5143,DB00512_MESH_D004697_1,CHEBI:16576,D-alanyl-D-alanine,D-Alanyl-D-alanine
5144,DB06603_MESH_D016410_1,MESH:C496932,Panobinostat,Panobinostat


In [10]:
#error file 
request_errors = pd.read_csv("error_ids.csv")
request_errors

Unnamed: 0,graph_id,node_id
0,DB01601_MESH_D015658_1,UniProt:Q72874
1,DB00182_MESH_D001289_3,UniProt:Q99870
2,DB00495_MESH_D015658_1,UniProt:Q72547
3,DB00495_MESH_D015658_2,UniProt:Q72547
4,DB00758_MESH_D054058_1,PR:000028445
...,...,...
223,DB01059_MESH_D004756_1,UniProt:A0A156J405
224,DB01017_MESH_D000196_1,UniProt:S2ZP52
225,DB01017_MESH_D000196_1,UniProt:S3ABF8
226,DB02703_MESH_D004756_1,UniProt:V5AL63


In [11]:
#only set of ids
len(set(request_errors["node_id"]))

68

In [12]:
#Total number of unique identifiers - evaluated
5225 - (5146 +68)

11

There are 11 identifiers that were not saved in error or preferred name ?

In [16]:
identifiers = set(identifiers)
error_pref_nodes = (set(pref_nodes["node_id"]) | set(request_errors["node_id"]) )

In [17]:
(identifiers ^ error_pref_nodes)

{'DB:D015378',
 'DB:DBMET01698',
 'DB:DBMET02573',
 'DB:DBMET03189',
 'DB:DBSALT001045',
 'DB:DBSALT001065',
 'GO:005507',
 'NCBITaxon:139',
 'NCBITaxon:5519',
 'TIGR:02074',
 'UniProt:P3535'}