In [1]:
import json
import pandas as pd
import numpy as np
from json.decoder import JSONDecodeError
from collections import Counter
from pprint import pprint
from io import BytesIO
import requests
from zipfile import ZipFile
import os
from tqdm import tqdm
# import datasets
import rdkit
import rdkit.Chem as Chem
import rdkit.RDLogger as RDLogger

In [2]:
tqdm.pandas()

In [3]:
RDLogger.DisableLog('rdApp.*')

In [4]:
# download the raw data
url = "https://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/19a23fd5-4e06-4122-ae9d-169198ee9794"
response = requests.get(url)
tmp_dir = os.path.join(os.getcwd(),"tmp")
os.makedirs(tmp_dir,exist_ok=True)
with ZipFile(BytesIO(response.content)) as my_zip:
    my_zip.extractall(path=tmp_dir)

In [5]:
mona_file = os.path.join(tmp_dir,"MoNA-export-Experimental_Spectra.json")
print(os.path.isfile(mona_file))

True


In [6]:
with open(mona_file,"r") as json_file:
    _ = json_file.readline() # first line is garbage
    line = json_file.readline()
entry = json.loads(line[:-2])

In [7]:
pprint(entry)

{'annotations': [],
 'compound': [{'classification': [{'category': 'classification',
                                   'computed': True,
                                   'hidden': False,
                                   'name': 'kingdom',
                                   'value': 'Organic compounds'},
                                  {'category': 'classification',
                                   'computed': True,
                                   'hidden': False,
                                   'name': 'superclass',
                                   'value': 'Organoheterocyclic compounds'},
                                  {'category': 'classification',
                                   'computed': True,
                                   'hidden': False,
                                   'name': 'class',
                                   'value': 'Lactams'},
                                  {'category': 'classification',
                                   'compute

In [8]:
# top level keys
pprint(entry.keys())

dict_keys(['compound', 'id', 'metaData', 'annotations', 'score', 'spectrum', 'lastUpdated', 'dateCreated', 'lastCurated', 'splash', 'submitter', 'tags', 'library'])


In [9]:
# compound information
pprint(entry["compound"])

[{'classification': [{'category': 'classification',
                      'computed': True,
                      'hidden': False,
                      'name': 'kingdom',
                      'value': 'Organic compounds'},
                     {'category': 'classification',
                      'computed': True,
                      'hidden': False,
                      'name': 'superclass',
                      'value': 'Organoheterocyclic compounds'},
                     {'category': 'classification',
                      'computed': True,
                      'hidden': False,
                      'name': 'class',
                      'value': 'Lactams'},
                     {'category': 'classification',
                      'computed': True,
                      'hidden': False,
                      'name': 'subclass',
                      'value': 'Beta lactams'},
                     {'category': 'classification',
                      'computed': True,
          

In [10]:
pprint(entry["compound"][0].keys())

dict_keys(['kind', 'tags', 'inchi', 'names', 'molFile', 'computed', 'inchiKey', 'metaData', 'classification'])


In [11]:
# score information
pprint(entry["score"])

{'impacts': [{'reason': 'Valid molecular structure(s) provided for biological '
                        'compound',
              'value': 2.0},
             {'reason': 'Chromatography identified as LC-MS', 'value': 1.0},
             {'reason': 'Ionization mode/type provided', 'value': 1.0},
             {'reason': 'MS type/level identified', 'value': 1.0},
             {'reason': 'Instrument information provided', 'value': 1.0},
             {'reason': 'No collision energy provided', 'value': -1.0},
             {'reason': 'Retention time/index provided', 'value': 1.0},
             {'reason': 'Column information provided', 'value': 1.0}],
 'relativeScore': 0.0,
 'scaledScore': 0.0,
 'score': 4.444444444444445}


In [12]:
# compound metadata (including theoretical adducts)
pprint(entry["compound"][0]["metaData"])

[{'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'molecular formula',
  'value': 'C16H16N4O8S'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'SMILES',
  'value': 'CO/N=C(/C1=CC=CO1)\\C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)COC(=O)N)C(=O)O'},
 {'category': 'external id',
  'computed': False,
  'hidden': False,
  'name': 'cas',
  'value': '55268-75-2'},
 {'category': 'external id',
  'computed': False,
  'hidden': False,
  'name': 'pubchem cid',
  'value': '5479529'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'InChI',
  'value': 'InChI=1S/C16H16N4O8S/c1-26-19-9(8-3-2-4-27-8)12(21)18-10-13(22)20-11(15(23)24)7(5-28-16(17)25)6-29-14(10)20/h2-4,10,14H,5-6H2,1H3,(H2,17,25)(H,18,21)(H,23,24)/b19-9-/t10-,14-/m1/s1'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'InChIKey',
  'value': 'JFPVXVDWJQMJEE-IZRZKJBUSA-N'},
 {'category': 'computed',
  'computed': True,
  'hidden': False,
  'name': 'mol

In [13]:
pprint(entry["id"])

'WA002994'


In [14]:
pprint(entry["tags"])

[{'ruleBased': False, 'text': 'MassBank'}, {'ruleBased': True, 'text': 'LC-MS'}]


In [15]:
pprint(entry["library"])

{'description': 'MassBank High Quality Mass Spectral Database',
 'library': 'MassBank',
 'link': 'https://massbank.eu/MassBank/RecordDisplay.jsp?id=WA002994'}


In [16]:
pprint(entry["metaData"])

[{'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'accession',
  'value': 'WA002994'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'date',
  'value': '2016.01.19 (Created 2007.08.01, modified 2011.05.06)'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'author',
  'value': 'Nihon Waters K.K.'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'license',
  'value': 'CC BY-NC'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'exact mass',
  'value': '424.06888'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'instrument',
  'value': 'ZQ, Waters'},
 {'category': 'none',
  'computed': False,
  'hidden': False,
  'name': 'instrument type',
  'value': 'LC-ESI-Q'},
 {'category': 'mass spectrometry',
  'computed': False,
  'hidden': False,
  'name': 'ms level',
  'value': 'MS1'},
 {'category': 'mass spectrometry',
  'computed': False,
  'hidden':

In [17]:
# all keys should be camelcase
def transform_key(key):
    return key.lower().replace(" ","_")

In [18]:
# process the rows
df_rows = []
md_key_counter = Counter()
with open(mona_file,"r") as json_file:
    line = json_file.readline().strip().rstrip(",")
    count = 0
    multi_compound_count, no_compound_count = 0, 0
    while line != "":
        try:
            entry = json.loads(line)
        except JSONDecodeError:
            entry = None
        if entry is not None:
            df_row = {}
            df_row["spectrum"] = entry.get("spectrum",np.nan)
            df_row["id"] = entry.get("id",np.nan)
            df_row["score"] = entry.get("score",{}).get("score",np.nan)
            df_row["library"]= entry.get("library",{}).get("library",np.nan)
            mds = entry.get("metaData",[])
            # get all metadata properties
            md_keys = []
            for md in mds:
                k = md["name"]
                v = md.get("value",np.nan)
                df_row["md_"+transform_key(k)] = v
                if not (v is np.nan):
                    md_keys.append(k)
            md_key_counter.update(md_keys)
            compounds = entry.get("compound",[])
            if len(compounds) > 0:
                if len(compounds) > 1:
                    multi_compound_count +=1
                compound = compounds[0]
                df_row["inchi"] = compound.get("inchi",np.nan)
                df_row["inchikey"] = compound.get("inchiKey",np.nan)
                compound_mds = compound.get("metaData",[])
                for compound_md in compound_mds:
                    k = compound_md["name"]
                    if k == "SMILES":
                        # note: more than one smiles are possible
                        df_row["smiles"] = compound_md["value"]
                    elif k == "molecular formula":
                        df_row["molecular_formula"] = compound_md["value"]
            else:
                no_compound_count += 1
            df_rows.append(df_row)
        if count % 100000 == 0:
            print(f"> line_count = {count}, num_rows = {len(df_rows)}, num_md_keys = {len(md_key_counter)}")
        line = json_file.readline().strip().rstrip(",")
        count += 1

> line_count = 0, num_rows = 0, num_md_keys = 0
> line_count = 100000, num_rows = 100000, num_md_keys = 24661
> line_count = 200000, num_rows = 200000, num_md_keys = 27561


In [19]:
print(no_compound_count,multi_compound_count)

0 0


In [20]:
# find the 100 most frequent metadata keys
md_top_keys = ["md_"+transform_key(k) for k,v in md_key_counter.most_common(100)]
pprint(md_top_keys)

['md_normalized_entropy',
 'md_spectral_entropy',
 'md_ionization_mode',
 'md_ms_level',
 'md_instrument',
 'md_precursor_type',
 'md_precursor_m/z',
 'md_instrument_type',
 'md_author',
 'md_mass_accuracy',
 'md_mass_error',
 'md_collision_energy',
 'md_comment',
 'md_exact_mass',
 'md_retention_time',
 'md_license',
 'md_accession',
 'md_date',
 'md_column',
 'md_ionization',
 'md_origin',
 'md_flow_rate',
 'md_copyright',
 'md_flow_gradient',
 'md_solvent_a',
 'md_solvent_b',
 'md_whole',
 'md_fragmentation_mode',
 'md_resolution',
 'md_raw_filename',
 'md_source_introduction',
 'md_recalibrate',
 'md_ion_source',
 'md_averaged_scans',
 'md_reanalyze',
 'md_formula',
 'md_source_file',
 'md_compound_source',
 'md_charge_state',
 'md_publication',
 'md_ion_type',
 'md_ionization_energy',
 'md_molecular_weight',
 'md_data_transformation',
 'md_column_temperature',
 'md_compound_id',
 'md_deprofile',
 'md_fragmentation_method',
 'md_capillary_temperature',
 'md_comments',
 'md_precurso

In [21]:
# filter for required keys
md_req_keys = [
    "md_normalized_entropy",
    "md_spectral_entropy",
    "md_ionization_mode",
    "md_ms_level",
    "md_instrument",
    "md_instrument_type",
    "md_precursor_m/z",
    "md_precursor_type",
    "md_mass_accuracy",
    "md_mass_error",
    "md_collision_energy",
    "md_fragmentation_mode",
    "md_license",
    "md_date",
    "md_accession",
    "md_precursor_mz", # alias for precursor_m/z
    "md_ion_type", # alias for prec_type
    "md_ionization_energy",
    "md_collision_energy_voltage", # alias for collision energy
    "md_adduct", # alias for prec_type
    "md_derivatization_type"
]
for i in range(len(df_rows)):
    cur_row = df_rows[i]
    cur_keys = list(cur_row.keys())
    for k in cur_keys:
        if k.startswith("md_"):
            val = cur_row[k]
            if k in md_req_keys:
                # keep the attribute, but change the name
                cur_row[k[3:]] = val
            del cur_row[k]

In [22]:
pprint(df_rows[0])

{'accession': 'WA002994',
 'date': '2016.01.19 (Created 2007.08.01, modified 2011.05.06)',
 'id': 'WA002994',
 'inchi': 'InChI=1S/C16H16N4O8S/c1-26-19-9(8-3-2-4-27-8)12(21)18-10-13(22)20-11(15(23)24)7(5-28-16(17)25)6-29-14(10)20/h2-4,10,14H,5-6H2,1H3,(H2,17,25)(H,18,21)(H,23,24)/b19-9-/t10-,14-/m1/s1',
 'inchikey': 'JFPVXVDWJQMJEE-IZRZKJBUSA-N',
 'instrument': 'ZQ, Waters',
 'instrument_type': 'LC-ESI-Q',
 'ionization_mode': 'negative',
 'library': 'MassBank',
 'license': 'CC BY-NC',
 'molecular_formula': 'C16H16N4O8S',
 'ms_level': 'MS1',
 'normalized_entropy': '0.8310792200354279',
 'score': 4.444444444444445,
 'smiles': 'O=C(O)C1=C(COC(=N)O)CSC2N1C(=O)C2N=C(O)C(=NOC)C=3OC=CC3',
 'spectral_entropy': '4.124517044473415',
 'spectrum': '100:3.103103 101:1.201201 102:1.201201 103:1.201201 '
             '104:15.315315 105:11.011011 106:2.402402 107:0.800801 '
             '108:3.103103 109:12.512513 110:7.107107 111:1.201201 '
             '112:1.201201 114:1.201201 115:5.105105 116:25.5

In [23]:
df = pd.DataFrame(df_rows)

In [24]:
print(df.shape)

(205076, 29)


In [25]:
pprint(list(df.columns))

['spectrum',
 'id',
 'score',
 'library',
 'inchi',
 'inchikey',
 'molecular_formula',
 'smiles',
 'accession',
 'date',
 'license',
 'instrument',
 'instrument_type',
 'ms_level',
 'ionization_mode',
 'spectral_entropy',
 'normalized_entropy',
 'precursor_type',
 'precursor_m/z',
 'mass_accuracy',
 'mass_error',
 'collision_energy',
 'fragmentation_mode',
 'ion_type',
 'derivatization_type',
 'ionization_energy',
 'collision_energy_voltage',
 'adduct',
 'precursor_mz']


In [26]:
# get percentage NaN
pprint(df.isna().mean())

spectrum                    0.000000
id                          0.000000
score                       0.017671
library                     0.134062
inchi                       0.211424
inchikey                    0.327766
molecular_formula           0.051396
smiles                      0.033602
accession                   0.615621
date                        0.640645
license                     0.609910
instrument                  0.089937
instrument_type             0.203081
ms_level                    0.095184
ionization_mode             0.050133
spectral_entropy            0.017813
normalized_entropy          0.016677
precursor_type              0.173775
precursor_m/z               0.166411
mass_accuracy               0.236834
mass_error                  0.236971
collision_energy            0.349695
fragmentation_mode          0.821573
ion_type                    0.911881
derivatization_type         0.985805
ionization_energy           0.939988
collision_energy_voltage    0.985776
a

In [27]:
# look at compound id
print(df["inchikey"].isna().sum(),df["inchi"].isna().sum(),df["smiles"].isna().sum())
print(df[["inchikey","inchi","smiles"]].isna().all(axis=1).sum())

67217 43358 6891
1655


In [28]:
# add inchi and inchikey information
def get_mol(row):
    mol = np.nan
    if row["inchi"] is not np.nan:
        try:
            mol = Chem.MolFromInchi(row["inchi"])
        except:
            pass
    if mol is np.nan and row["smiles"] is not np.nan:
        try:
            mol = Chem.MolFromSmiles(row["smiles"])
        except:
            pass
    return mol
            
mol = df.progress_apply(get_mol,axis=1)
print(mol.isna().sum())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 205076/205076 [01:39<00:00, 2054.05it/s]

10206





In [29]:
def get_smiles(mol):
    smiles = np.nan
    try:
        smiles = Chem.MolToSmiles(mol)
    except:
        pass
    return smiles
smiles = mol.progress_apply(get_smiles)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 205076/205076 [00:16<00:00, 12552.10it/s]


In [30]:
def get_inchi(mol):
    inchi = np.nan
    try:
        inchi = Chem.MolToInchi(mol)
    except:
        pass
    return inchi
inchi = mol.progress_apply(get_inchi)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 205076/205076 [01:03<00:00, 3211.63it/s]


In [31]:
def get_inchikey(mol):
    inchikey = np.nan
    try:
        inchikey = Chem.MolToInchiKey(mol)
    except:
        pass
    return inchikey
inchikey = mol.progress_apply(get_inchikey)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 205076/205076 [01:06<00:00, 3074.05it/s]


In [32]:
# update dataframe
proc_df = df.copy()
print(proc_df[["smiles","inchi","inchikey"]].isna().any(axis=1).mean())
proc_df.loc[:,"smiles"] = smiles
proc_df.loc[:,"inchi"] = inchi
proc_df.loc[:,"inchikey"] = inchikey
print(proc_df[["smiles","inchi","inchikey"]].isna().any(axis=1).mean())
# drop entries with invalid smiles/inchi/inchikey
proc_df = proc_df[proc_df[["smiles","inchi","inchikey"]].notna().all(axis=1)]

0.4985663851450194
0.04976691568004057


In [33]:
list(proc_df.columns)

['spectrum',
 'id',
 'score',
 'library',
 'inchi',
 'inchikey',
 'molecular_formula',
 'smiles',
 'accession',
 'date',
 'license',
 'instrument',
 'instrument_type',
 'ms_level',
 'ionization_mode',
 'spectral_entropy',
 'normalized_entropy',
 'precursor_type',
 'precursor_m/z',
 'mass_accuracy',
 'mass_error',
 'collision_energy',
 'fragmentation_mode',
 'ion_type',
 'derivatization_type',
 'ionization_energy',
 'collision_energy_voltage',
 'adduct',
 'precursor_mz']

In [34]:
# remove spectra with ND licenses
proc_df = proc_df[proc_df["license"] != "CC BY-NC-ND"]

In [35]:
proc_df["license"].value_counts()

license
CC BY                                    21679
CC BY-NC-SA                              20133
CC0                                      11039
CC BY-SA                                 10819
CC-BY                                     5209
CC BY-NC                                  3792
BY-CC                                     1956
CC BY 4.0                                 1783
CC BY-SA NC                               1562
License CC-BY-NC-SA 4.0 International     1310
CC BY SA                                   561
Name: count, dtype: int64

In [36]:
proc_df.loc[:,"ms_level"] = proc_df["ms_level"].replace(
    {
        "2": "MS2",
        "MS": "MS1"
    }
)

In [37]:
proc_df["ms_level"].value_counts()

ms_level
MS2                  159715
MS1                   21423
MS3                     930
MS4                      70
MS2-MS5 Composite        34
MS2-MS3 Composite        18
MS2-MS4 Composite        15
MS2-MS2 Composite         3
MS5                       1
Name: count, dtype: int64

In [38]:
# merge precursor_type aliases
proc_df.loc[:,"collision_energy"] = proc_df["collision_energy"].where(
    ~(proc_df["collision_energy"].isna()),
    proc_df["collision_energy_voltage"]
)
proc_df = proc_df.drop(columns="collision_energy_voltage")

In [39]:
proc_df["collision_energy"].value_counts()

collision_energy
45HCD           9420
35HCD           9420
65HCD           8873
10 eV           7634
35              6734
                ... 
60.5081            1
61.2619            1
59.5365            1
25 (nominal)       1
5.8                1
Name: count, Length: 948, dtype: int64

In [40]:
proc_df.loc[:,"ionization_mode"] = proc_df["ionization_mode"].replace(
    {
        "N/A": np.nan,
        "ESI": np.nan,
        "Positive":"positive",
        "Negative":"negative"
    }
)

In [41]:
proc_df["ionization_mode"].value_counts()

ionization_mode
positive    131601
negative     53460
Name: count, dtype: int64

In [42]:
proc_df["instrument_type"].value_counts()

instrument_type
LC-ESI-QTOF      44310
ESI-QFT          28260
LC-ESI-QFT       19452
LC-ESI-ITFT      16927
EI-B             13797
                 ...  
GC 6890-5973N        1
QIT-FT               1
MALDI-QIT            1
APCI-ITTOF           1
SI-BE                1
Name: count, Length: 77, dtype: int64

In [43]:
proc_df["fragmentation_mode"].value_counts()

fragmentation_mode
HCD    25983
CID    10598
RID        8
HAD        2
Name: count, dtype: int64

In [44]:
# merge precursor_type aliases
proc_df.loc[:,"precursor_type"] = proc_df["precursor_type"].where(
    ~(proc_df["precursor_type"].isna()),
    proc_df["adduct"]
)
proc_df.loc[:,"precursor_type"] = proc_df["precursor_type"].where(
    ~(proc_df["precursor_type"].isna()),
    proc_df["ion_type"]
)
proc_df = proc_df.drop(columns=["adduct","ion_type"])

In [45]:
proc_df["precursor_type"].value_counts()

precursor_type
[M+H]+                    80131
[M-H]-                    40640
[M+Na]+                   11794
[M]+*                     11242
[M+NH4]+                   7273
                          ...  
[M-H2O+H]+,[M-2H2O+H]+        1
[M+H]-                        1
[M-2H2O+NH4]+                 1
[M+H+Na]2+                    1
[M+Na]+*                      1
Name: count, Length: 137, dtype: int64

In [46]:
# merge precursor_m/z aliases
proc_df.loc[:,"precursor_m/z"] = proc_df["precursor_m/z"].where(
    ~(proc_df["precursor_m/z"].isna()),
    proc_df["precursor_mz"]
)
proc_df = proc_df.drop(columns="precursor_mz")

In [47]:
proc_df["precursor_m/z"].value_counts()

precursor_m/z
267.1856    241
352.1755    195
254.0594    136
230.0552    127
205.1911    124
           ... 
203.1501      1
74.06068      1
170.0448      1
224.1028      1
214.0492      1
Name: count, Length: 48388, dtype: int64

In [48]:
proc_df["derivatization_type"].value_counts()

derivatization_type
2 TMS                                     651
3 TMS                                     523
1 TMS                                     407
ABEE (p-Aminobenzoic acid ethyl ester)    285
4 TMS                                     257
5 TMS                                     129
n TMS                                      78
6 TMS                                      69
8 TMS                                      55
1 TMS; 1 MEOX                              44
1 MEOX; 1 TMS                              40
2 TMS; 1 MEOX                              40
1 MEOX                                     30
1 MEOX; 2 TMS                              23
5 TMS; 1 MEOX                              23
4 TMS; 1 MEOX                              21
2 MEOX                                     17
1 MEOX; 3 TMS                              15
3 TMS; 1 MEOX                              14
7 TMS                                      14
Methoxyamine, MSTFA                        13
1 MEOX; 4 TMS 

In [49]:
proc_df["ionization_energy"].value_counts()

ionization_energy
70 eV      7552
20 eV      1409
-70 eV     1315
75 eV       626
50 eV       300
120 eV      250
100 eV      218
25 eV       163
14 eV        97
15 eV        90
200 eV       76
80 eV        53
13.5 eV      29
40 eV        26
23 eV        25
500 eV       15
12 eV        13
30 eV         8
35 eV         7
18 eV         6
24 eV         5
10 eV         4
250 eV        3
7 eV          2
11 eV         2
36 eV         2
110 eV        1
22 eV         1
16 eV         1
60 eV         1
190 eV        1
Name: count, dtype: int64

In [50]:
proc_df.isna().mean()

spectrum               0.000000
id                     0.000000
score                  0.017394
library                0.112915
inchi                  0.000000
inchikey               0.000000
molecular_formula      0.017394
smiles                 0.000000
accession              0.596012
date                   0.622301
license                0.589962
instrument             0.045403
instrument_type        0.162633
ms_level               0.064256
ionization_mode        0.049609
spectral_entropy       0.017435
normalized_entropy     0.017435
precursor_type         0.090232
precursor_m/z          0.148032
mass_accuracy          0.209957
mass_error             0.210101
collision_energy       0.303455
fragmentation_mode     0.812085
derivatization_type    0.985081
ionization_energy      0.936828
dtype: float64

In [51]:
# save to json
proc_df.to_json("mona_df.json")