# HMDB Standalone "Pending" API (Association-Centric)  for
[Data Assignment](https://github.com/biothings/mygene.info/issues/110)  
[Data Download](https://hmdb.ca/downloads)

In [34]:
import pandas as pd 
import numpy as np
from IPython.display import display
from biothings.utils.dataload import dict_convert, dict_sweep
import os

In [2]:
# set folder path
protein_zipfolder = os.path.join("c:\\Users\\19802\\Documents\\dev\scripps\\BioThings_SuLab\\data\\hmdb_proteins.xml")

# we're reading an XML
df=pd.read_xml(protein_zipfolder)


In [119]:
import xml.etree.ElementTree as ET
# --- Upload XML Data (workaround for pandas >1.3, need to upgrade Biothings) --- 
xml_data = open(protein_zipfolder, 'r', encoding='UTF-8').read()  # Read file
root = ET.XML(xml_data)  # Parse XML
data = []
cols = []
for i, child in enumerate(root):
    data.append([subchild.text for subchild in child])
    cols.append(child.tag)
    



In [120]:
# --- Load Data into Pandas ---
df = pd.DataFrame(data).T  # Write in DF and transpose it
#df.columns = cols  # Update column names
df=df.T
df.columns=['version', 'creation_date', 'update_date', 'accession',
       'secondary_accessions', 'protein_type', 'synonyms', 'gene_name',
       'general_function', 'specific_function', 'pathways',
       'metabolite_associations', 'go_classifications',
       'subcellular_locations', 'gene_properties', 'protein_properties',
       'genbank_protein_id', 'uniprot_id', 'uniprot_name', 'pdb_ids',
       'genbank_gene_id', 'genecard_id', 'geneatlas_id', 'hgnc_id',
       'general_references', 'metabolite_references']

df=df.replace('\\n',np.NaN, regex=True)
df.head()

Unnamed: 0,version,creation_date,update_date,accession,secondary_accessions,protein_type,synonyms,gene_name,general_function,specific_function,...,genbank_protein_id,uniprot_id,uniprot_name,pdb_ids,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id,general_references,metabolite_references
0,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00001,,Unknown,,NT5E,Involved in hydrolase activity,,...,23897,P21589,5NTD_HUMAN,,X55740,NT5E,NT5E,HGNC:8021,,
1,4.0,2013-01-16 01:48:37 UTC,2017-12-20 20:30:53 UTC,HMDBP00002,,Unknown,,DCTD,Involved in zinc ion binding,,...,61742819,P32321,DCTD_HUMAN,,NM_001921.2,DCTD,DCTD,HGNC:2710,,
2,4.0,2013-01-16 01:48:37 UTC,2017-12-20 20:31:24 UTC,HMDBP00003,,Unknown,,CMPK1,Involved in ATP binding,,...,33150592,P30085,KCY_HUMAN,,AF087865,CMPK1,CMPK1,HGNC:18170,,
3,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00004,,Unknown,,NT5C1B,Involved in nucleotide binding,,...,50593110,Q96P26,5NT1B_HUMAN,,NM_001002006.2,NT5C1B,NT5C1B,HGNC:17818,,
4,4.0,2013-01-16 01:48:38 UTC,2017-12-08 07:09:13 UTC,HMDBP00005,,Unknown,,NT5C1A,Involved in nucleotide binding,,...,12659324,Q9BXI3,5NT1A_HUMAN,,AF331801,NT5C1A,NT5C1A,HGNC:17819,,


In [5]:
df.head()

Unnamed: 0,version,creation_date,update_date,accession,secondary_accessions,protein_type,synonyms,gene_name,general_function,specific_function,...,genbank_protein_id,uniprot_id,uniprot_name,pdb_ids,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id,general_references,metabolite_references
0,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00001,,Unknown,,NT5E,Involved in hydrolase activity,Hydrolyzes extracellular nucleotides into memb...,...,23897.0,P21589,5NTD_HUMAN,,X55740,NT5E,NT5E,HGNC:8021,,
1,4.0,2013-01-16 01:48:37 UTC,2017-12-20 20:30:53 UTC,HMDBP00002,,Unknown,,DCTD,Involved in zinc ion binding,Supplies the nucleotide substrate for thymidyl...,...,61742819.0,P32321,DCTD_HUMAN,,NM_001921.2,DCTD,DCTD,HGNC:2710,,
2,4.0,2013-01-16 01:48:37 UTC,2017-12-20 20:31:24 UTC,HMDBP00003,,Unknown,,CMPK1,Involved in ATP binding,Catalyzes specific phosphoryl transfer from AT...,...,33150592.0,P30085,KCY_HUMAN,,AF087865,CMPK1,CMPK1,HGNC:18170,,
3,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00004,,Unknown,,NT5C1B,Involved in nucleotide binding,Dephosphorylates the 5' and 2'(3')-phosphates ...,...,50593110.0,Q96P26,5NT1B_HUMAN,,NM_001002006.2,NT5C1B,NT5C1B,HGNC:17818,,
4,4.0,2013-01-16 01:48:38 UTC,2017-12-08 07:09:13 UTC,HMDBP00005,,Unknown,,NT5C1A,Involved in nucleotide binding,Dephosphorylates the 5' and 2'(3')-phosphates ...,...,12659324.0,Q9BXI3,5NT1A_HUMAN,,AF331801,NT5C1A,NT5C1A,HGNC:17819,,


## Data Prep

In [6]:
df.columns.values

array(['version', 'creation_date', 'update_date', 'accession',
       'secondary_accessions', 'protein_type', 'synonyms', 'gene_name',
       'general_function', 'specific_function', 'pathways',
       'metabolite_associations', 'go_classifications',
       'subcellular_locations', 'gene_properties', 'protein_properties',
       'genbank_protein_id', 'uniprot_id', 'uniprot_name', 'pdb_ids',
       'genbank_gene_id', 'genecard_id', 'geneatlas_id', 'hgnc_id',
       'general_references', 'metabolite_references'], dtype=object)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5702 entries, 0 to 5701
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   version                  5702 non-null   float64
 1   creation_date            5702 non-null   object 
 2   update_date              5702 non-null   object 
 3   accession                5702 non-null   object 
 4   secondary_accessions     0 non-null      float64
 5   protein_type             5702 non-null   object 
 6   synonyms                 0 non-null      float64
 7   gene_name                5623 non-null   object 
 8   general_function         5175 non-null   object 
 9   specific_function        5059 non-null   object 
 10  pathways                 0 non-null      float64
 11  metabolite_associations  0 non-null      float64
 12  go_classifications       0 non-null      float64
 13  subcellular_locations    0 non-null      float64
 14  gene_properties         

In [8]:
df.describe()

Unnamed: 0,version,secondary_accessions,synonyms,pathways,metabolite_associations,go_classifications,subcellular_locations,gene_properties,protein_properties,genbank_protein_id,pdb_ids,general_references,metabolite_references
count,5702.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4616.0,0.0,0.0,0.0
mean,4.0,,,,,,,,,55162810.0,,,
std,0.0,,,,,,,,,69028360.0,,,
min,4.0,,,,,,,,,12584.0,,,
25%,4.0,,,,,,,,,5489670.0,,,
50%,4.0,,,,,,,,,21749390.0,,,
75%,4.0,,,,,,,,,71279870.0,,,
max,4.0,,,,,,,,,312434000.0,,,


In [9]:
df.describe(include=object)

Unnamed: 0,creation_date,update_date,accession,protein_type,gene_name,general_function,specific_function,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
count,5702,5702,5702,5702,5623,5175,5059,5702,5249,5163,5175,5019,5557
unique,2745,238,5702,3,5459,675,4280,5702,5248,5118,5021,4873,5313
top,2013-01-16 03:19:53 UTC,2017-12-08 07:10:31 UTC,HMDBP00001,Unknown,NAT2,Involved in catalytic activity,Potential calcium-dependent cell-adhesion prot...,P21589,ST1A3_HUMAN,J01415,NAT2,NAT2,HGNC:7646
freq,8,114,1,3685,29,298,53,1,2,7,29,29,34


**What columns are full of all `null` values**?

In [121]:
# get columns with all null values
null_cols= list(df.loc[:, df.isna().all()].columns.values)
df.loc[:, df.isna().all()] 

Unnamed: 0,secondary_accessions,synonyms,pathways,metabolite_associations,go_classifications,subcellular_locations,gene_properties,protein_properties,pdb_ids,general_references,metabolite_references
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
5697,,,,,,,,,,,
5698,,,,,,,,,,,
5699,,,,,,,,,,,
5700,,,,,,,,,,,


In [11]:
print(len(null_cols))
display(null_cols)

11


['secondary_accessions',
 'synonyms',
 'pathways',
 'metabolite_associations',
 'go_classifications',
 'subcellular_locations',
 'gene_properties',
 'protein_properties',
 'pdb_ids',
 'general_references',
 'metabolite_references']

**What columns contain ``null`` values?**

In [12]:
# get columns with any null values
null_cols_ = list(df.loc[:, df.isna().any()].columns.values)
df.loc[:, df.isna().any()]


Unnamed: 0,secondary_accessions,synonyms,gene_name,general_function,specific_function,pathways,metabolite_associations,go_classifications,subcellular_locations,gene_properties,protein_properties,genbank_protein_id,uniprot_name,pdb_ids,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id,general_references,metabolite_references
0,,,NT5E,Involved in hydrolase activity,Hydrolyzes extracellular nucleotides into memb...,,,,,,,23897.0,5NTD_HUMAN,,X55740,NT5E,NT5E,HGNC:8021,,
1,,,DCTD,Involved in zinc ion binding,Supplies the nucleotide substrate for thymidyl...,,,,,,,61742819.0,DCTD_HUMAN,,NM_001921.2,DCTD,DCTD,HGNC:2710,,
2,,,CMPK1,Involved in ATP binding,Catalyzes specific phosphoryl transfer from AT...,,,,,,,33150592.0,KCY_HUMAN,,AF087865,CMPK1,CMPK1,HGNC:18170,,
3,,,NT5C1B,Involved in nucleotide binding,Dephosphorylates the 5' and 2'(3')-phosphates ...,,,,,,,50593110.0,5NT1B_HUMAN,,NM_001002006.2,NT5C1B,NT5C1B,HGNC:17818,,
4,,,NT5C1A,Involved in nucleotide binding,Dephosphorylates the 5' and 2'(3')-phosphates ...,,,,,,,12659324.0,5NT1A_HUMAN,,AF331801,NT5C1A,NT5C1A,HGNC:17819,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5697,,,CHRNA6,Involved in acetylcholine receptor activity,"After binding acetylcholine, the AChR responds...",,,,,,,1458118.0,ACHA6_HUMAN,,U62435,CHRNA6,,GNC:15963,,
5698,,,HTR3D,Involved in transmembrane transport,This is one of the several different receptors...,,,,,,,,5HT3D_HUMAN,,,,,,,
5699,,,HTR3C,Involved in transmembrane transport,This is one of the several different receptors...,,,,,,,,5HT3C_HUMAN,,,,,,,
5700,,,HTR3E,,,,,,,,,,5HT3E_HUMAN,,,,,,,


In [13]:
print(len(null_cols_))
display(null_cols_)


20


['secondary_accessions',
 'synonyms',
 'gene_name',
 'general_function',
 'specific_function',
 'pathways',
 'metabolite_associations',
 'go_classifications',
 'subcellular_locations',
 'gene_properties',
 'protein_properties',
 'genbank_protein_id',
 'uniprot_name',
 'pdb_ids',
 'genbank_gene_id',
 'genecard_id',
 'geneatlas_id',
 'hgnc_id',
 'general_references',
 'metabolite_references']

## Clean Data

In [122]:
data=df.drop(null_cols,axis=1) # remove the null cols

In [123]:
data.head()

Unnamed: 0,version,creation_date,update_date,accession,protein_type,gene_name,general_function,specific_function,genbank_protein_id,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
0,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00001,Unknown,NT5E,Involved in hydrolase activity,,23897,P21589,5NTD_HUMAN,X55740,NT5E,NT5E,HGNC:8021
1,4.0,2013-01-16 01:48:37 UTC,2017-12-20 20:30:53 UTC,HMDBP00002,Unknown,DCTD,Involved in zinc ion binding,,61742819,P32321,DCTD_HUMAN,NM_001921.2,DCTD,DCTD,HGNC:2710
2,4.0,2013-01-16 01:48:37 UTC,2017-12-20 20:31:24 UTC,HMDBP00003,Unknown,CMPK1,Involved in ATP binding,,33150592,P30085,KCY_HUMAN,AF087865,CMPK1,CMPK1,HGNC:18170
3,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00004,Unknown,NT5C1B,Involved in nucleotide binding,,50593110,Q96P26,5NT1B_HUMAN,NM_001002006.2,NT5C1B,NT5C1B,HGNC:17818
4,4.0,2013-01-16 01:48:38 UTC,2017-12-08 07:09:13 UTC,HMDBP00005,Unknown,NT5C1A,Involved in nucleotide binding,,12659324,Q9BXI3,5NT1A_HUMAN,AF331801,NT5C1A,NT5C1A,HGNC:17819


In [16]:
data[data['hgnc_id']=="HGNC:17819"]

Unnamed: 0,version,creation_date,update_date,accession,protein_type,gene_name,general_function,specific_function,genbank_protein_id,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
4,4.0,2013-01-16 01:48:38 UTC,2017-12-08 07:09:13 UTC,HMDBP00005,Unknown,NT5C1A,Involved in nucleotide binding,Dephosphorylates the 5' and 2'(3')-phosphates ...,12659324.0,Q9BXI3,5NT1A_HUMAN,AF331801,NT5C1A,NT5C1A,HGNC:17819


In [17]:
def gene_search(search_id):
    display(data[data['hgnc_id']=="%s"%search_id].head())
    
gene_search("HGNC:17819")

Unnamed: 0,version,creation_date,update_date,accession,protein_type,gene_name,general_function,specific_function,genbank_protein_id,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
4,4.0,2013-01-16 01:48:38 UTC,2017-12-08 07:09:13 UTC,HMDBP00005,Unknown,NT5C1A,Involved in nucleotide binding,Dephosphorylates the 5' and 2'(3')-phosphates ...,12659324.0,Q9BXI3,5NT1A_HUMAN,AF331801,NT5C1A,NT5C1A,HGNC:17819


In [18]:
data['hgnc_id'].unique

<bound method Series.unique of 0        HGNC:8021
1        HGNC:2710
2       HGNC:18170
3       HGNC:17818
4       HGNC:17819
           ...    
5697     GNC:15963
5698          None
5699          None
5700          None
5701    HGNC:11455
Name: hgnc_id, Length: 5702, dtype: object>

In [19]:
data.columns

Index(['version', 'creation_date', 'update_date', 'accession', 'protein_type',
       'gene_name', 'general_function', 'specific_function',
       'genbank_protein_id', 'uniprot_id', 'uniprot_name', 'genbank_gene_id',
       'genecard_id', 'geneatlas_id', 'hgnc_id'],
      dtype='object')

## Associations  
`subject/object/predicate`

In [55]:
data.columns.values

array(['version', 'creation_date', 'update_date', 'accession',
       'protein_type', 'gene_name', 'general_function',
       'genbank_protein_id', 'uniprot_id', 'uniprot_name',
       'genbank_gene_id', 'genecard_id', 'geneatlas_id', 'hgnc_id'],
      dtype=object)

In [125]:
data.tail()

Unnamed: 0,version,creation_date,update_date,accession,protein_type,gene_name,general_function,specific_function,genbank_protein_id,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
5697,4.0,2015-06-18 21:34:16 UTC,2017-12-08 07:10:32 UTC,HMDBP12053,Unknown,CHRNA6,Involved in acetylcholine receptor activity,"After binding acetylcholine, the AChR responds...",1458118.0,Q15825,ACHA6_HUMAN,U62435,CHRNA6,,GNC:15963
5698,4.0,2015-06-18 21:34:16 UTC,2017-12-08 07:10:32 UTC,HMDBP12054,Unknown,HTR3D,Involved in transmembrane transport,This is one of the several different receptors...,,Q70Z44,5HT3D_HUMAN,,,,
5699,4.0,2015-06-18 21:34:17 UTC,2017-12-08 07:10:32 UTC,HMDBP12055,Unknown,HTR3C,Involved in transmembrane transport,This is one of the several different receptors...,,Q8WXA8,5HT3C_HUMAN,,,,
5700,4.0,2015-06-18 21:46:37 UTC,2017-12-08 07:10:32 UTC,HMDBP12056,Unknown,HTR3E,,,,A5X5Y0,5HT3E_HUMAN,,,,
5701,4.0,2017-09-20 01:51:15 UTC,2017-12-08 07:10:32 UTC,HMDBP12057,Enzyme,SULT1A3,sulfotransferase activity,Sulfotransferase that utilizes 3'-phospho-5'-a...,,P0DMM9,ST1A3_HUMAN,,,,HGNC:11455


In [63]:
data[data['genbank_protein_id']== '23897']

Unnamed: 0,version,creation_date,update_date,accession,protein_type,gene_name,general_function,genbank_protein_id,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
0,4.0,2013-01-16 01:48:37 UTC,2017-12-08 07:09:13 UTC,HMDBP00001,Unknown,NT5E,Involved in hydrolase activity,23897,P21589,5NTD_HUMAN,X55740,NT5E,NT5E,HGNC:8021


In [58]:
data.describe()

Unnamed: 0,version,creation_date,update_date,accession,protein_type,gene_name,general_function,genbank_protein_id,uniprot_id,uniprot_name,genbank_gene_id,genecard_id,geneatlas_id,hgnc_id
count,15.0,15,15,15,15,15,15,12,15,15,15,15,14,15
unique,1.0,4,7,15,2,15,11,12,15,15,15,15,14,15
top,4.0,2013-01-16 01:48:38 UTC,2017-12-08 07:09:13 UTC,HMDBP00001,Unknown,NT5E,Involved in ATP binding,23897,P21589,5NTD_HUMAN,X55740,NT5E,NT5E,HGNC:8021
freq,15.0,6,7,1,9,1,2,1,1,1,1,1,1,1


## Building `Association` Parser   
"association"-style API, where each document describes a triple (subject/object/predicate). 

In [126]:
len(data['genbank_protein_id'].unique())

4610

In [175]:
# Build Parser 
def load_hmdb_data(data):
    results = {}
    # loop through our data rows and simply pass the row 
    data=data.to_dict(orient='records')
    for rec in data[:4]:
        id_= rec['genbank_protein_id'] # set id column
        #id_= str(id_)
        rec = dict_convert(rec,keyfn=process_key)
        # remove NaN values, not indexable
        rec = dict_sweep(rec,vals=[np.nan])
        results.setdefault(id_,[]).append(rec)
        
    for _id, docs in results.items():
        print(_id)
        doc = { "_id":_id,
                "association": docs }
        print(doc)
        #yield doc


In [176]:
def load_testcase():
    results=load_hmdb_data(data)
    #print(next(results))
load_testcase()

23897
{'_id': '23897', 'association': [{'version': '4.0', 'creation_date': '2013-01-16 01:48:37 UTC', 'update_date': '2017-12-08 07:09:13 UTC', 'accession': 'HMDBP00001', 'protein_type': 'Unknown', 'gene_name': 'NT5E', 'general_function': 'Involved in hydrolase activity', 'genbank_protein_id': '23897', 'uniprot_id': 'P21589', 'uniprot_name': '5NTD_HUMAN', 'genbank_gene_id': 'X55740', 'genecard_id': 'NT5E', 'geneatlas_id': 'NT5E', 'hgnc_id': 'HGNC:8021'}]}
61742819
{'_id': '61742819', 'association': [{'version': '4.0', 'creation_date': '2013-01-16 01:48:37 UTC', 'update_date': '2017-12-20 20:30:53 UTC', 'accession': 'HMDBP00002', 'protein_type': 'Unknown', 'gene_name': 'DCTD', 'general_function': 'Involved in zinc ion binding', 'genbank_protein_id': '61742819', 'uniprot_id': 'P32321', 'uniprot_name': 'DCTD_HUMAN', 'genbank_gene_id': 'NM_001921.2', 'genecard_id': 'DCTD', 'geneatlas_id': 'DCTD', 'hgnc_id': 'HGNC:2710'}]}
33150592
{'_id': '33150592', 'association': [{'version': '4.0', 'cre

## Test API

In [177]:
import requests

In [184]:
r=requests.get("http://localhost:8000/metadata")
print(r.content.decode("utf-8"))

{"biothing_type": "gene", "build_date": "2021-09-20T20:49:53.171856", "build_version": "20210920", "src": {"HMDB": {"stats": {"HMDB": 4}, "version": "2020-09-08"}}, "stats": {"total": 4}}


In [182]:
# method to search Biothings API  
def test_query(query_input):
    # run user query
    query="http://localhost:8000/query?q=%s"%query_input
    r=requests.get(query)
    print("\n[INFO] query for %s: \n"%query_input, r.content.decode("utf-8"))

In [183]:
#"ZDB-GENE-041114-199", "1311391", "WBGene00011502", "FBgn0260795"
test_query('23897')



[INFO] query for 23897: 
 {"took": 15, "total": 1, "max_score": 0.2876821, "hits": [{"_id": "23897", "_score": 0.2876821, "association": [{"accession": "HMDBP00001", "creation_date": "2013-01-16 01:48:37 UTC", "genbank_gene_id": "X55740", "genbank_protein_id": "23897", "gene_name": "NT5E", "geneatlas_id": "NT5E", "genecard_id": "NT5E", "general_function": "Involved in hydrolase activity", "hgnc_id": "HGNC:8021", "protein_type": "Unknown", "uniprot_id": "P21589", "uniprot_name": "5NTD_HUMAN", "update_date": "2017-12-08 07:09:13 UTC", "version": "4.0"}]}]}


---

## BioThings Client `mygene.info`

In [148]:
from biothings_client import get_client
gene_client = get_client('gene') 
type(gene_client)

biothings_client.MyGeneInfo

In [153]:
 gene_client.getgene('23897').keys()

dict_keys(['MGI', '_id', '_version', 'accession', 'alias', 'ensembl', 'entrezgene', 'exons', 'exons_mm9', 'generif', 'genomic_pos', 'genomic_pos_mm9', 'go', 'homologene', 'interpro', 'ipi', 'map_location', 'name', 'other_names', 'pantherdb', 'reagent', 'refseq', 'reporter', 'symbol', 'taxid', 'type_of_gene', 'unigene', 'uniprot'])

In [160]:
 gene_client.getgene('61742819', fields='_id, symbol,name') # '23897'

{'_id': '61742819',
 '_version': 1,
 'name': 'alpha/beta hydrolase',
 'symbol': 'C6X95_RS11625'}

---