# HMDB Standalone "Pending" API (Association-Centric)    
  

[Data Assignment](https://github.com/biothings/mygene.info/issues/110)  
[Data Download](https://hmdb.ca/downloads)    
  
    
      
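What does this do?  
This notebook takes two input files from the [HMDB database](https://hmdb.ca/downloads) — the proteins XML (`hmdb_proteins.xml`) and the metabolites XML (`hmdb_metabolites.xml`) — and builds one document per protein–metabolite association, adding the metabolite's external database IDs to each document.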

In [1]:
import pandas as pd 
import time
import numpy as np
import json, os
import xml.etree.ElementTree as ET


from bs4 import BeautifulSoup as bs
from IPython.display import display
from biothings.utils.dataload import dict_convert, dict_sweep

# Passing None removes pandas' display limits, so DataFrames are shown in full
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load `.xml` protein file

In [3]:
# -- Set file path ---
# Hard-coded absolute locations of the two HMDB XML files; edit before running
# elsewhere. os.path.join with a single argument returns that path unchanged.
protein_xml = os.path.join("/Users/nacosta/Documents/hmdb_proteins.xml")
meta_xml = os.path.join("/Users/nacosta/Documents/hmdb_metabolites.xml")


In [21]:
def get_file_size(file_name):
    """Return the size of ``file_name`` as text such as ``'3,889.45MB'``.

    The byte count is divided by 1024*1024, rounded to two decimals and
    formatted with thousands separators. A path that is not a regular file
    is reported as ``'0.00MB'``.
    """
    megabytes = 0
    if os.path.isfile(file_name):
        megabytes = round(os.path.getsize(file_name) / (1024 * 1024), 2)
    return '{:,.2f}'.format(megabytes) + 'MB'


In [22]:
# Report the on-disk size of the metabolites XML file
get_file_size(meta_xml)

'3,889.45MB'

In [4]:
from lxml import etree as etree_lxml

def load_xml(opts):
    """Read the whole metabolites XML file and return its contents.

    ``opts`` is passed to ``open`` as the file mode, so the result is
    ``bytes`` for ``'rb'`` and ``str`` for a text mode. The path is taken
    from the module-level ``meta_xml`` variable.
    """
    with open(meta_xml, opts) as handle:
        contents = handle.read()
    return contents


# Read the metabolites file as raw bytes and parse it with lxml;
# the whole file is held in memory while it is parsed.
xml_as_bytes = load_xml('rb')
tree = etree_lxml.fromstring(xml_as_bytes)


In [5]:
# --- Upload XML Data (workaround for pandas >1.3, need to upgrade Biothings) --- 
# Read through a context manager so the file handle is closed once the text is loaded
with open(protein_xml, 'r', encoding='UTF-8') as xml_file:
    xml_data = xml_file.read()  # Read file
root = ET.XML(xml_data)  # Parse XML


---  
## Parse `.xml` protein file  
  
- To be explicit see this example at [biothings github](https://github.com/biothings/pending.api/issues/30#issuecomment-904319224)    
            
- The data under `subject` should come from the `protein_properties` section,
- the data under `object` should come from `metabolite_reference.metabolite`,
- and the `pmid` should come from `metabolite_reference.reference` .  
            
            `    {
                    "_id": "10682982_1",
                    "predicate": "PART_OF",    
                    "predication_id": 10682982,
                    "pmid": 16530800,
                    "subject": {
                        "ncbigene": "3346",
                        "name": "HTN1",
                        "semantic_type_abbreviation": "gngm",
                        "semantic_type_name": "Gene or Genome",    
                        "novelty": 1    
                    },
                    "object": {
                        "umls": "C1481993",
                        "name": "Haloviruses",
                        "semantic_type_abbreviation": "virs",
                        "semantic_type_name": "Virus",    
                        "novelty": 1
                    }
                },`

    
  ### *"Since HMDB has already done mappings to other database identifiers (e.g., https://hmdb.ca/metabolites/HMDB0015122#links), you should include those in your object dict.  Doesn't look like those links are in the proteins file, so probably you'll also need to get that info from another file...:"*

In [23]:
def load_hmdb_data(data_folder):
    """Build, print and return protein-metabolite association documents.

    Reads ``hmdb_proteins.xml`` and ``hmdb_metabolites.xml`` from
    ``data_folder``. For every ``protein`` element it creates:

    * one document per child of ``metabolite_references``, with ``pmid``
      taken from that child's ``reference/pubmed_id`` (``None`` if absent);
    * one document per ``metabolite_associations/metabolite`` accession that
      was not already emitted for the same protein, with ``pmid`` set to
      ``'Unknown'``.

    Each document has the keys ``_id`` (``<protein accession>_<counter>``),
    ``pmid``, ``subject`` (protein identifiers) and ``object`` (metabolite
    fields plus the KEGG / ChemSpider / ChEBI / PubChem IDs looked up in the
    metabolites file). Identifiers missing from the XML are stored as ``None``.

    Args:
        data_folder: directory that contains the two XML files.

    Returns:
        list of dict: every document, in creation order. Each one is also
        printed as indented JSON.
    """
    ns = "{http://www.hmdb.ca}"
    subject_fields = ("uniprot_id", "uniprot_name", "genbank_protein_id",
                      "hgnc_id", "genbank_gene_id", "gene_name")
    mapping_fields = ("kegg_id", "chemspider_id", "chebi_id",
                      "pubchem_compound_id")

    def child_text(parent, name):
        # find() returns None for a missing child; report that as None
        # instead of failing on `.text`.
        node = parent.find(ns + name)
        return None if node is None else node.text

    # --- Build {metabolite accession: mapping ids} from the metabolites XML ---
    def load_metabolite(path):
        # Imported here so the function does not depend on another notebook
        # cell; the standard-library parser exposes the same calls used below.
        try:
            from lxml import etree as xml_backend
        except ImportError:
            import xml.etree.ElementTree as xml_backend

        # Binary mode: open() rejects an `encoding` argument together with 'rb'
        with open(path, "rb") as handle:
            tree = xml_backend.fromstring(handle.read())

        metabolite_dict = {}
        for meta in tree.findall(ns + "metabolite"):
            accession = child_text(meta, "accession")
            if accession is None:
                continue
            # setdefault keeps the first entry seen for an accession
            metabolite_dict.setdefault(
                accession,
                {field: child_text(meta, field) for field in mapping_fields},
            )
        return metabolite_dict

    # --- Enter subject method ---
    def enter_subject(data, t):
        for field in subject_fields:
            data["subject"][field] = child_text(t, field)
        return data

    # --- Enter mapping method ---
    def enter_mapping_ids(data, metabolite_dict, accession):
        # An accession absent from the metabolites file gets None for every ID
        ids = metabolite_dict.get(accession, {})
        for field in mapping_fields:
            data["object"][field] = ids.get(field)
        return data

    # --- Set input XML file path ---
    protein_xml = os.path.join(data_folder, "hmdb_proteins.xml")
    meta_xml = os.path.join(data_folder, "hmdb_metabolites.xml")

    # --- Load XML data ---
    with open(protein_xml, "r", encoding="UTF-8") as handle:
        protein_root = ET.XML(handle.read())
    metabolite_dict = load_metabolite(meta_xml)

    data_list = []  # final data holder

    # Visit every <protein> element in the document; the previous
    # `protein_root[0].iter()` only walked the subtree of the root's first child.
    for t in protein_root.iter(ns + "protein"):
        # the first accession is the main protein _id in our doc
        _id = child_text(t, "accession")
        if _id is None:
            continue  # no accession, so no document _id can be built
        protein_type = child_text(t, "protein_type")
        ct = 1  # counter for this protein's associations
        seen_accessions = set()  # metabolite accessions already emitted for this protein

        # ---------- Metabolite associations with references ------------
        for m in t.findall(ns + "metabolite_references"):
            for ref in m:
                data = {
                    "_id": "%s_%s" % (_id, ct),
                    "pmid": None,
                    "subject": {"protein_type": protein_type},
                    "object": {},
                }
                ct += 1

                # pull out the reference tags and get the pubmed_id
                for met_ref in ref.findall(ns + "reference"):
                    for refs in met_ref:
                        if "pubmed_id" in refs.tag:
                            data["pmid"] = refs.text

                for met in ref.findall(ns + "metabolite"):
                    for info in met:
                        tag = info.tag.rsplit("}", 1)[-1]  # drop the namespace
                        data["object"][tag] = info.text
                        if tag == "accession":
                            seen_accessions.add(info.text)
                            enter_mapping_ids(data, metabolite_dict, info.text)

                data_list.append(enter_subject(data, t))

        # ---------- Metabolite associations without references ------------
        for met_assc in t.findall(ns + "metabolite_associations"):
            for met_assc_ in met_assc.findall(ns + "metabolite"):
                for met_assc_id in met_assc_.findall(ns + "accession"):
                    accession = met_assc_id.text
                    # Already emitted for this protein (e.g. from
                    # metabolite_references): skip to avoid a duplicate doc.
                    if accession in seen_accessions:
                        continue
                    seen_accessions.add(accession)

                    # NOTE(review): unlike the referenced documents, the
                    # subject here has no "protein_type"; kept as in the
                    # original, confirm whether that is intended.
                    data = {
                        "_id": "%s_%s" % (_id, ct),
                        "pmid": "Unknown",
                        "subject": {},
                        "object": {"accession": accession},
                    }
                    ct += 1
                    enter_mapping_ids(data, metabolite_dict, accession)

                    for met_assc_name in met_assc_.findall(ns + "name"):
                        data["object"]["name"] = met_assc_name.text

                    data_list.append(enter_subject(data, t))

    for doc_ in data_list:
        print(json.dumps(doc_, sort_keys=False, indent=4))

    return data_list

In [24]:
# NOTE: this cell fails with NameError (see the output below): in the code shown,
# data_list is only assigned inside load_hmdb_data, never at notebook scope.
print(json.dumps(data_list, sort_keys=False, indent=4))

NameError: name 'data_list' is not defined

In [31]:
# Association Parser    
def load_hmdb_data(data_folder="/Users/nacosta/Documents"):
    """Yield protein-metabolite association documents built from HMDB XML.

    Reads ``hmdb_proteins.xml`` and ``hmdb_metabolites.xml`` from
    ``data_folder``. Both files are parsed when the first document is
    requested. For every ``protein`` element the generator yields:

    * one document per child of ``metabolite_references``, with ``pmid``
      taken from that child's ``reference/pubmed_id`` (``None`` if absent);
    * one document per ``metabolite_associations/metabolite`` accession that
      was not already yielded for the same protein, with ``pmid`` set to
      ``'Unknown'``.

    Each document has the keys ``_id`` (``<protein accession>_<counter>``),
    ``pmid``, ``subject`` (protein identifiers) and ``object`` (metabolite
    fields plus the KEGG / ChemSpider / ChEBI / PubChem IDs looked up in the
    metabolites file). Identifiers missing from the XML are stored as ``None``.

    Args:
        data_folder: directory that contains the two XML files.

    Yields:
        dict: one association document at a time.
    """
    ns = "{http://www.hmdb.ca}"
    subject_fields = ("uniprot_id", "uniprot_name", "genbank_protein_id",
                      "hgnc_id", "genbank_gene_id", "gene_name")
    mapping_fields = ("kegg_id", "chemspider_id", "chebi_id",
                      "pubchem_compound_id")

    # --- Helper Methods ---
    def child_text(parent, name):
        # find() returns None for a missing child; report that as None
        # instead of failing on `.text`.
        node = parent.find(ns + name)
        return None if node is None else node.text

    # --- Enter subject into document ---
    def enter_subject(data, tags):
        for field in subject_fields:
            data["subject"][field] = child_text(tags, field)
        return data

    # --- Create a dictionary to hold our metabolite mapping values from the metabolite XML ---
    def make_metbolite_dict(path):
        # Imported here so the function does not depend on another notebook
        # cell; the standard-library parser exposes the same calls used below.
        try:
            from lxml import etree as xml_backend
        except ImportError:
            import xml.etree.ElementTree as xml_backend

        with open(path, "rb") as handle:
            metabolite_tree = xml_backend.fromstring(handle.read())

        mapping = {}
        for meta in metabolite_tree.findall(ns + "metabolite"):
            accession = child_text(meta, "accession")
            if accession is None:
                continue
            # setdefault keeps the first entry seen for an accession
            mapping.setdefault(
                accession,
                {field: child_text(meta, field) for field in mapping_fields},
            )
        return mapping

    # --- Enter mapping IDs into the document ---
    def enter_mapping_ids(data, mapping_dict, accession):
        # e.g. {'kegg_id': 'C01092', 'chemspider_id': '4578', 'chebi_id': '127029', 'pubchem_compound_id': '4740'}
        # An accession absent from the metabolites file gets None for every ID
        ids = mapping_dict.get(accession, {})
        for field in mapping_fields:
            data["object"][field] = ids.get(field)
        return data

    # -------------------------------------------------

    # --- Set input XML file path ---
    protein_xml = os.path.join(data_folder, "hmdb_proteins.xml")
    meta_xml = os.path.join(data_folder, "hmdb_metabolites.xml")

    # --- Load XML Data ---
    with open(protein_xml, "r", encoding="UTF-8") as handle:
        protein_tree = ET.XML(handle.read())
    mapping_dict = make_metbolite_dict(meta_xml)  # metabolite accession -> mapping ids

    # Visit every <protein> element in the document; the previous
    # `protein_tree[0].iter()` only walked the subtree of the root's first child.
    for tags in protein_tree.iter(ns + "protein"):
        # the first accession is the main protein _id in our doc
        _id = child_text(tags, "accession")
        if _id is None:
            continue  # no accession, so no document _id can be built
        protein_type = child_text(tags, "protein_type")
        ct = 1  # counter for this protein's associations
        seen_accessions = set()  # metabolite accessions already yielded for this protein

        # ---------- Metabolite associations with references ------------
        for m in tags.findall(ns + "metabolite_references"):
            for ref in m:
                data = {
                    "_id": "%s_%s" % (_id, ct),
                    "pmid": None,
                    "subject": {"protein_type": protein_type},
                    "object": {},
                }
                ct += 1  # update accession counter

                # pull out the reference tags and get the pubmed_id
                for met_ref in ref.findall(ns + "reference"):
                    for refs in met_ref:
                        if "pubmed_id" in refs.tag:
                            data["pmid"] = refs.text

                for met in ref.findall(ns + "metabolite"):
                    for info in met:
                        tag = info.tag.rsplit("}", 1)[-1]  # drop the namespace
                        data["object"][tag] = info.text
                        if tag == "accession":
                            seen_accessions.add(info.text)
                            enter_mapping_ids(data, mapping_dict, info.text)

                yield enter_subject(data, tags)

        # ---------- Metabolite associations without references ------------
        for met_assc in tags.findall(ns + "metabolite_associations"):
            for met_assc_ in met_assc.findall(ns + "metabolite"):
                for met_assc_id in met_assc_.findall(ns + "accession"):
                    accession = met_assc_id.text
                    # Already yielded for this protein (e.g. from
                    # metabolite_references): skip to avoid a duplicate doc.
                    if accession in seen_accessions:
                        continue
                    seen_accessions.add(accession)

                    # NOTE(review): unlike the referenced documents, the
                    # subject here has no "protein_type"; kept as in the
                    # original, confirm whether that is intended.
                    data = {
                        "_id": "%s_%s" % (_id, ct),
                        "pmid": "Unknown",
                        "subject": {},
                        "object": {"accession": accession},
                    }
                    ct += 1  # update the id counter
                    enter_mapping_ids(data, mapping_dict, accession)

                    for met_assc_name in met_assc_.findall(ns + "name"):
                        data["object"]["name"] = met_assc_name.text

                    yield enter_subject(data, tags)

In [32]:
from lxml import etree as etree_lxml

# Pull only the first document from the generator and print it as a quick check
print(next(load_hmdb_data(data_folder="/Users/nacosta/Documents")))

{'_id': 'HMDBP00001_1', 'pmid': '11752352', 'subject': {'protein_type': 'Unknown', 'uniprot_id': 'P21589', 'uniprot_name': '5NTD_HUMAN', 'genbank_protein_id': '23897', 'hgnc_id': 'HGNC:8021', 'genbank_gene_id': 'X55740', 'gene_name': 'NT5E'}, 'object': {'name': 'Pentoxifylline', 'accession': 'HMDB0014944', 'kegg_id': 'C07424', 'chemspider_id': '4578', 'chebi_id': '127029', 'pubchem_compound_id': '4740'}}


{'_id': 'HMDBP00001_1', 'pmid': '11752352', 'subject': {'protein_type': 'Unknown', 'uniprot_id': 'P21589', 'uniprot_name': '5NTD_HUMAN', 'genbank_protein_id': '23897', 'hgnc_id': 'HGNC:8021', 'genbank_gene_id': 'X55740', 'gene_name': 'NT5E'}, 'object': {'name': 'Pentoxifylline', 'accession': 'HMDB0014944', 'kegg_id': 'C07424', 'chemspider_id': '4578', 'chebi_id': '127029', 'pubchem_compound_id': '4740'}}


## References:

[Bioconductor in R metabolite ID mapping package](http://bioconductor.org/packages/release/data/annotation/vignettes/metaboliteIDmapping/inst/doc/metaboliteIDmapping.html)  
[Bioconda](https://bioconda.github.io/recipes/bioconductor-metaboliteidmapping/README.html)

---