# Association Matrix Extraction

Import necessary data processing libraries.

In [3]:
from lxml import etree
import pandas as pd
import numpy as np
import scipy

Download and unzip the "All Metabolites" XML file from https://hmdb.ca/downloads.

Then extract each metabolite's name and associated diseases, following the approach described at http://www.metabolomics-forum.com/index.php?topic=1588.0, and discard metabolites with no known disease association.

In [2]:
# Stream the HMDB "All Metabolites" XML dump and keep only the metabolites that
# list at least one associated disease.
#
# Produces:
#   df_metabolites : DataFrame with columns accession / name / disease_names,
#                    one row per retained metabolite (disease_names holds a list).
#   disease_names  : 1-D numpy array of the distinct disease names, in order of
#                    first appearance in the file.
hmdb_path = 'data/hmdb_metabolites.xml'

namespaces = {'hmdb': 'http://www.hmdb.ca'}
context = etree.iterparse(hmdb_path, tag='{http://www.hmdb.ca}metabolite')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; enlarging a DataFrame row by row re-allocates on every insertion.
metabolite_records = []
# A dict doubles as an insertion-ordered set: constant-time membership checks
# while remembering the order in which diseases were first seen.
seen_diseases = {}
for _, elem in context:

    # str() converts lxml "smart strings" into plain strings. Smart strings
    # keep a reference to their parent element, which would keep parsed nodes
    # alive in memory despite the clean-up below.
    associated_diseases = [
        str(disease)
        for disease in elem.xpath('hmdb:diseases/hmdb:disease/hmdb:name/text()', namespaces=namespaces)
    ]

    if associated_diseases:
        accession = str(elem.xpath('hmdb:accession/text()', namespaces=namespaces)[0])
        name = str(elem.xpath('hmdb:name/text()', namespaces=namespaces)[0])
        metabolite_records.append((accession, name, associated_diseases))
        for disease in associated_diseases:
            seen_diseases.setdefault(disease, None)

    # Free the element just processed and every already-processed sibling of
    # it and of its ancestors, so memory use stays bounded on the large dump.
    elem.clear()
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]
del context

# dtype=object keeps the same column dtypes as a frame filled row by row.
df_metabolites = pd.DataFrame(
    metabolite_records,
    columns=['accession', 'name', 'disease_names'],
    dtype=object,
)
disease_names = np.array(list(seen_diseases))

Create the association matrix with metabolites in the rows and diseases in the columns.

In [3]:
def disease_indicator(row, disease):
    """Return 1 when `disease` occurs in `row["disease_names"]`, otherwise 0.

    `row` is anything that supports `row["disease_names"]` and whose value
    under that key supports the `in` operator.
    """
    # `in` always evaluates to a bool, so int() maps it to exactly 0 or 1.
    return int(disease in row["disease_names"])

# Build the metabolite x disease 0/1 association matrix.
#
# The matrix is filled directly through a disease -> column lookup instead of
# adding one DataFrame column per disease via a row-wise apply: that approach
# needs one full pass over the frame per disease, fragments the frame with
# hundreds of single-column inserts, and would overwrite the 'name' /
# 'disease_names' columns if a disease happened to carry one of those names.
disease_column = {disease: col for col, disease in enumerate(disease_names.tolist())}

metabolite_names = df_metabolites["name"].values
metabolite_ids = df_metabolites["accession"].values

association_matrix = np.zeros((len(df_metabolites), len(disease_column)), dtype=np.int64)
for row, associated_diseases in enumerate(df_metabolites["disease_names"]):
    for disease in associated_diseases:
        association_matrix[row, disease_column[disease]] = 1

# Same end state as before for df_metabolites: indexed by accession, exactly
# one indicator column per disease, in the order given by disease_names.
df_metabolites = pd.DataFrame(
    association_matrix,
    index=pd.Index(metabolite_ids, name="accession"),
    columns=disease_names,
)
assert np.array_equal(df_metabolites.columns.values, disease_names)

Remove the outlier disease that has 20020 known associations, along with any metabolites that are associated only with this disease.

In [4]:
# Drop the outlier disease (identified by its exact association count) and
# every metabolite that is left without any association afterwards.
#
# The count is specific to the HMDB release that was parsed; with a different
# release no column may match, in which case nothing is removed.
OUTLIER_ASSOCIATION_COUNT = 20020

is_outlier = association_matrix.sum(axis=0) == OUTLIER_ASSOCIATION_COUNT

for outlier_name in disease_names[is_outlier]:
    print("Removing outlier disease: " + outlier_name)

# Filter the label array together with the matrix columns. Without this,
# disease_names keeps the removed entry and every label after it points at
# the wrong column of association_matrix.
association_matrix = association_matrix[:, ~is_outlier]
disease_names = disease_names[~is_outlier]

# Metabolites whose only association was the removed disease now have an
# all-zero row; drop them from the matrix and from both label arrays.
keep_rows = np.where(association_matrix.sum(axis=1) > 0)[0]
association_matrix = association_matrix[keep_rows,:]
metabolite_names = metabolite_names[keep_rows]
metabolite_ids = metabolite_ids[keep_rows]

# Row and column labels must stay aligned with the matrix.
assert association_matrix.shape == (len(metabolite_ids), len(disease_names))

Removing outlier disease: 3-methylglutaconic aciduria type II, X-linked


Save the processed data.

In [5]:
# Write the association matrix (as a sparse CSR matrix) and its row/column
# labels to the data/ directory.
sparse_associations = scipy.sparse.csr_matrix(association_matrix)
scipy.sparse.save_npz("data/associations.npz", sparse_associations)

metabolite_id_labels = metabolite_ids.astype(str)
np.save("data/metabolite_ids.npy", metabolite_id_labels)
metabolite_name_labels = metabolite_names.astype(str)
np.save("data/metabolite_names.npy", metabolite_name_labels)
np.save("data/disease_names.npy", disease_names)

# Summary statistics of the saved matrix.
n_metabolites, n_diseases = association_matrix.shape[0], association_matrix.shape[1]
n_associations = association_matrix.sum()
# The value reported as "sparsity" is the matrix sum divided by the number of
# cells, i.e. the fraction of cells that are set when all entries are 0 or 1.
filled_fraction = n_associations / (n_metabolites * n_diseases)

print("Number of metabolites: " + str(n_metabolites))
print("Number of diseases: " + str(n_diseases))
print("Number of associations: " + str(n_associations))
print("Sparsity of the association matrix: " + str(filled_fraction))

Number of metabolites: 2583
Number of diseases: 656
Number of associations: 7650
Sparsity of the association matrix: 0.004514744624798164
