# HMDB Standalone "Pending" API (Association-Centric)    
  

[Data Assignment](https://github.com/biothings/mygene.info/issues/110)  
[Data Download](https://hmdb.ca/downloads)    
  
    
      
What does this do?  
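This program takes an input file, `proteins.xml`, from the [HMDB database](https://hmdb.ca/downloads) and builds one JSON association document per protein–metabolite pair, with the protein as `subject` and the metabolite as `object`.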

In [1]:
import pandas as pd 
import math
import numpy as np
import json, os
import xml.etree.ElementTree as ET

from IPython.display import display
from biothings.utils.dataload import dict_convert, dict_sweep

# Lift the column and row display limits so DataFrames are shown in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

## Load `.xml` protein file

In [2]:
# -- Set file path ---
protein_xml = "/Users/nacosta/Documents/hmdb_proteins.xml"

# --- Upload XML Data (workaround for pandas >1.3, need to upgrade Biothings) ---
# Read inside a context manager so the file handle is closed even if the read fails.
with open(protein_xml, 'r', encoding='UTF-8') as xml_file:
    xml_data = xml_file.read()  # whole document as a single string
root = ET.XML(xml_data)  # parse the string; `root` is the document's top-level element


---  
## Parse `.xml` protein file  
  
- To be explicit see this example at [biothings github](https://github.com/biothings/pending.api/issues/30#issuecomment-904319224)    
            
- The data under `subject` should come from the `protein_properties` section,
- the data under `object` should come from `metabolite_reference.metabolite`,
- and the `pmid` should come from `metabolite_reference.reference` .  
            
            `    {
                    "_id": "10682982_1",
                    "predicate": "PART_OF",    
                    "predication_id": 10682982,
                    "pmid": 16530800,
                    "subject": {
                        "ncbigene": "3346",
                        "name": "HTN1",
                        "semantic_type_abbreviation": "gngm",
                        "semantic_type_name": "Gene or Genome",    
                        "novelty": 1    
                    },
                    "object": {
                        "umls": "C1481993",
                        "name": "Haloviruses",
                        "semantic_type_abbreviation": "virs",
                        "semantic_type_name": "Virus",    
                        "novelty": 1
                    }
                },`

    
  ### *"Since HMDB has already done mappings to other database identifiers (e.g., https://hmdb.ca/metabolites/HMDB0015122#links), you should include those in your object dict.  Doesn't look like those links are in in the proteins file, so probably you'll also need to get that info from another file...:"*

In [6]:
# Build one association document per protein-metabolite pair.
#
# Each document has the shape
#   {"_id": "<protein accession>_<n>", "predicate": "biolink:related_to",
#    "pmid": <str or None>, "subject": {...protein ids...}, "object": {...metabolite...}}
# Documents come first from <metabolite_references> (these may carry a PubMed id)
# and then from <metabolite_associations> (no reference, so "pmid" stays None).

_HMDB_NS = "{http://www.hmdb.ca}"  # namespace prefix carried by every HMDB tag
_XREF_COLUMNS = ("ChEBI", "Drugbank", "KEGG")  # mapping columns copied into "object"
_SUBJECT_FIELDS = (
    "uniprot_id",
    "uniprot_name",
    "genbank_protein_id",
    "hgnc_id",
    "genbank_gene_id",
    "gene_name",
)

data_list = []  # final data holder

# load ID mapping dataframe
map_csv = "/Users/nacosta/Documents/hmdb_proteins_mapping.csv"
map_df = pd.read_csv(map_csv)


def _collapse_xref(values):
    """Reduce the mapping values found for one metabolite to a single field value.

    Exactly one distinct value gives ``None`` when it is missing (NaN) and
    ``str(value)`` otherwise; zero or several distinct values are returned
    as a list.
    """
    unique = list(set(values))
    if len(unique) != 1:
        return unique
    return None if pd.isna(unique[0]) else str(unique[0])


def _add_xrefs(obj, accession):
    """Add the ChEBI / Drugbank / KEGG ids mapped to ``accession`` into ``obj`` (in place)."""
    rows = map_df[map_df["HMDB"] == accession]
    for column in _XREF_COLUMNS:
        obj[column] = _collapse_xref(rows[column].values)


def _subject_info(protein):
    """Return the subject dict for a <protein> element.

    A field whose element is absent is stored as ``None`` instead of raising
    ``AttributeError``.
    """
    subject = {}
    for field in _SUBJECT_FIELDS:
        node = protein.find(_HMDB_NS + field)
        subject[field] = node.text if node is not None else None
    return subject


# iterate over the parsed tree
# NOTE(review): root[0].iter() walks only the FIRST child of the root and its
# descendants, so at most one <protein> is processed -- confirm whether
# iterating over all of `root` was intended.
for t in root[0].iter():
    if t.tag != _HMDB_NS + "protein":  # get the protein nodes only
        continue

    # we need the first accession number, this is main protein _id in our doc
    _id = t.find(_HMDB_NS + "accession").text

    subject = _subject_info(t)  # identical for every document of this protein
    ct = 1  # counter used as the suffix of each association _id
    seen_accessions = set()  # metabolite accessions already emitted for this protein

    # associations that come with a literature reference
    for m in t.findall(_HMDB_NS + "metabolite_references"):
        for ref in m:
            data = {
                "_id": _id + "_%s" % ct,
                "predicate": "biolink:related_to",
                "pmid": None,
                "subject": dict(subject),  # copy so documents never share a dict
                "object": {},
            }
            ct += 1

            for met_ref in ref.findall(_HMDB_NS + "reference"):
                for refs in met_ref:
                    if "pubmed_id" in refs.tag:
                        data["pmid"] = refs.text

            for met in ref.findall(_HMDB_NS + "metabolite"):
                for info in met:
                    tag = info.tag.split("}")[1]  # drop the "{namespace}" prefix
                    data["object"][tag] = info.text
                    if tag == "accession":
                        _add_xrefs(data["object"], info.text)

            data_list.append(data)
            if "accession" in data["object"]:
                seen_accessions.add(data["object"]["accession"])

    # associations listed without a reference
    for met_assc in t.findall(_HMDB_NS + "metabolite_associations"):
        for met_assc_ in met_assc.findall(_HMDB_NS + "metabolite"):
            for met_assc_id in met_assc_.findall(_HMDB_NS + "accession"):
                accession = met_assc_id.text

                # if the metabolite association was already present above
                # (in metabolite_references) skip it to avoid a duplicate document
                if accession in seen_accessions:
                    continue

                data = {
                    "_id": _id + "_%s" % ct,
                    "predicate": "biolink:related_to",
                    "pmid": None,
                    "subject": dict(subject),
                    "object": {"accession": accession},
                }
                ct += 1

                for met_assc_name in met_assc_.findall(_HMDB_NS + "name"):
                    data["object"]["name"] = met_assc_name.text

                _add_xrefs(data["object"], accession)

                data_list.append(data)
                seen_accessions.add(accession)





In [7]:
# Pretty-print every association document as JSON, keeping keys in insertion order.
serialized_docs = json.dumps(data_list, indent=4, sort_keys=False)
print(serialized_docs)

[
    {
        "_id": "HMDBP00001_1",
        "predicate": "biolink:related_to",
        "pmid": "11752352",
        "subject": {
            "uniprot_id": "P21589",
            "uniprot_name": "5NTD_HUMAN",
            "genbank_protein_id": "23897",
            "hgnc_id": "HGNC:8021",
            "genbank_gene_id": "X55740",
            "gene_name": "NT5E"
        },
        "object": {
            "name": "Pentoxifylline",
            "accession": "HMDB0014944",
            "ChEBI": "7986.0",
            "Drugbank": "DB00806",
            "KEGG": "C07424"
        }
    },
    {
        "_id": "HMDBP00001_2",
        "predicate": "biolink:related_to",
        "pmid": "16426349",
        "subject": {
            "uniprot_id": "P21589",
            "uniprot_name": "5NTD_HUMAN",
            "genbank_protein_id": "23897",
            "hgnc_id": "HGNC:8021",
            "genbank_gene_id": "X55740",
            "gene_name": "NT5E"
        },
        "object": {
            "name": "Pentoxi

## References:

[Bioconductor (R) metabolite ID mapping package](http://bioconductor.org/packages/release/data/annotation/vignettes/metaboliteIDmapping/inst/doc/metaboliteIDmapping.html)  
[Bioconda](https://bioconda.github.io/recipes/bioconductor-metaboliteidmapping/README.html)

---