# A demonstration of how to use the MS2LDA annotation function

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import os
import sys

# Directory that is put on sys.path so the `annotation` package imported
# below can be found. Set the MS2LDAVIZ_BASEDIR environment variable to point
# at your own checkout; the original author's path is kept as the default so
# existing setups keep working unchanged.
basedir = os.environ.get('MS2LDAVIZ_BASEDIR',
                         '/Users/joewandy/git/ms2ldaviz/ms2ldaviz/')

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if basedir not in sys.path:
    sys.path.append(basedir)

import annotation.annotate_methods as annot

### Batch spectra annotation

Prepare some example spectra

In [2]:
# --- Example spectrum 1 -----------------------------------------------------
# Each peak is an (m/z, intensity) pair.
parentmass_1 = 188.0818
spectrum_1 = [
    (53.0384, 331117.7),
    (57.0447, 798106.4),
    (65.0386, 633125.7),
    (77.0385, 5916789.799999999),
    (81.0334, 27067.0),
    (85.0396, 740633.6),
    (91.0542, 1251226.7000000002),
    (92.0492, 2928042.3),
    (95.0493, 728877.1),
    (103.0419, 59138.40000000001),
]

# --- Example spectrum 2 -----------------------------------------------------
# Same (m/z, intensity) layout as above.
parentmass_2 = 188.0819
spectrum_2 = [
    (104.0495, 13185144.700000001),
    (105.045, 1643858.5999999999),
    (105.0701, 129215.0),
    (110.06, 46285.0),
    (119.0604, 12584552.9),
    (130.04, 33153.3),
    (130.0652, 270688.7),
    (131.0725, 203663.6),
    (131.0734, 45999.7),
    (142.0653, 20711.1),
    (147.0555, 63444.99999999999),
    (160.0871, 61955089.8),
    (188.082, 86709805.39999999),
]

# Batch input: parent mass -> list of peaks for that spectrum.
spectra = {}
spectra[parentmass_1] = spectrum_1
spectra[parentmass_2] = spectrum_2

# Database to annotate against: either 'MASSBANK' or 'GNPS'.
db_name = 'MASSBANK'

Call the batch annotation function on these spectra

In [3]:
# Pass the whole `spectra` mapping and the database name chosen above to the
# batch annotation function in one call; the returned object is kept in
# `results` and inspected in the following cells.
# NOTE(review): the return type is not visible here -- later cells iterate and
# index it like a dict; confirm against annotate_methods.batch_annotate.
results = annot.batch_annotate(spectra, db_name)

Annotation results are produced as a JSON object, which can be converted into a dictionary. The keys are:
- A status field ('status')
- The parent masses for each spectrum

In [18]:
# Show the top-level layout of the annotation results. The 'status' entry is
# printed as-is; every other entry is indexed like a dict, so only its field
# names are listed.
# A single pre-formatted argument is passed to print(...) so the cell produces
# the same text under Python 2 and runs under Python 3.
for key in results:
    if key == 'status':
        print('%s  -->  %s' % (key, results[key]))
    else:
        # list(...) keeps the list-style display on Python 3, where .keys()
        # returns a view object rather than a list.
        print('%s  -->  %s' % (key, list(results[key].keys())))

status  -->  OK
188.0818  -->  [u'sub_term_probs', u'loss_match', u'taxa_term_probs', u'fragment_match', u'fragment_intensity_match', u'motif_theta_overlap', u'loss_intensity_match']
188.0819  -->  [u'sub_term_probs', u'loss_match', u'taxa_term_probs', u'fragment_match', u'fragment_intensity_match', u'motif_theta_overlap', u'loss_intensity_match']


Here we retrieve the annotation result for the first spectrum

In [24]:
# Pick the spectrum to inspect. The original positional lookup
# (results.keys()[1]) depended on arbitrary dictionary ordering, could land on
# the 'status' entry, and fails on Python 3 where keys() is not indexable.
# Instead, drop the 'status' field and order the remaining keys by numeric
# parent mass so the choice is deterministic.
spectrum_keys = sorted((key for key in results if key != 'status'), key=float)
first = str(spectrum_keys[0])
spectra_annotations = results[first]
print(first)

188.0818


Print how many fragment and loss features can be matched for the annotation of that spectrum

In [25]:
# Number of fragment features, then number of loss features, reported as
# matched for the selected spectrum. print(...) with a single argument
# behaves identically on Python 2 and also runs on Python 3.
print(spectra_annotations['fragment_match'])
print(spectra_annotations['loss_match'])

10
9


Print the taxonomy terms for this spectrum with probability > 0.5

In [26]:
# Each entry of 'taxa_term_probs' unpacks into a (term, probability) pair;
# only terms whose probability exceeds 0.5 are shown.
# The line is formatted up front so that print(...) receives one argument and
# emits the same "term probability" text on Python 2 while also running on
# Python 3.
for taxa_term, prob in spectra_annotations['taxa_term_probs']:
    if prob > 0.5:
        print('%s %s' % (taxa_term, prob))

Organic compounds 0.933499434497
Chemical entities 0.910921259107
Benzene and substituted derivatives 0.566497071138
Benzenoids 0.562529969786


Print the substituent terms for this spectrum with probability > 0.5

In [27]:
# Each entry of 'sub_term_probs' unpacks into a (term, probability) pair;
# only substituent terms whose probability exceeds 0.5 are shown.
# The loop variable is named `sub_term` (it was a copy-pasted `taxa_term`) to
# match what is being iterated, and print(...) receives one pre-formatted
# argument so the output is the same on Python 2 and the cell runs on Python 3.
for sub_term, prob in spectra_annotations['sub_term_probs']:
    if prob > 0.5:
        print('%s %s' % (sub_term, prob))

Hydrocarbon derivative 0.933512461786
Organic oxygen compound 0.917048996931
Organooxygen compound 0.898785933371
Organic nitrogen compound 0.813150955338
Organonitrogen compound 0.809181461392
Organopnictogen compound 0.793502131819
Heteroaromatic compound 0.748388579402
Azacycle 0.7337791077
Organic oxide 0.61644026113
Organoheterocyclic compound 0.572521216292
Aromatic heteromonocyclic compound 0.543394632553


Print the Mass2Motif annotations for this spectrum with probability (theta) > 0.01

In [28]:
# Each entry of 'motif_theta_overlap' unpacks into four values:
# (motif, annotation, theta, overlap). Only motifs with theta above 0.01 are
# shown; `annotation` may be None, which prints as "None".
# The line is formatted up front so that print(...) receives one argument and
# emits the same space-separated text on Python 2 while also running on
# Python 3.
for motif, annotation, theta, overlap in spectra_annotations['motif_theta_overlap']:
    if theta > 0.01:
        print('%s %s %s %s' % (motif, annotation, theta, overlap))

motif_114 None 0.244657340325 0.261738998436
motif_174 None 0.0125941543502 0.00264190826521
motif_204 None 0.0265024721067 0.0217700890222
motif_235 None 0.0842219795498 0.0172562882952
motif_270 None 0.0563800605949 0.059114076161
motif_30 Fragment indicative for aromatic compounds related to methylbenzene substructure (C7H7 fragment) 0.0477704729811 1.0
motif_310 None 0.29198850052 0.0121630592267
motif_324 None 0.118916459985 0.0553050824779
motif_351 None 0.0724566972491 0.031848461005
motif_38 Fragments indicative for dihydroxylated benzene ring substructure (MzCloud) – C6H5O2 fragment corresponds to positively charged fragment with two hydroxyl groups. 0.0114838335088 0.960575383632
motif_446 None 0.0299553074954 0.010711849673
