# Looking at intersections of GNPS annotations in feature-based molecular networking

Author: Louis-Felix Nothias for the CARA project on October 2020

In [3]:
import pandas as pd     
import numpy as np

### Prepare input annotation files

In [4]:
# Importing the GNPS annotations from Classical molecular networking
results_C18_pos = pd.read_csv('Input/FBMN/C18_pos_ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-63575189-download_cytoscape_data/DB_result/601e043681dc4b5da452d6e7a1eaff6d.tsv',  sep='\t', header=0)
results_C18_neg = pd.read_csv('Input/FBMN/C18_neg_ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-2d9c9fec-download_cytoscape_data/DB_result/3c75bcc74591496785c21fac463dc97a.tsv',  sep='\t', header=0)
results_HILIC_pos = pd.read_csv('Input/FBMN/HILIC_pos_ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-b699fd88-download_cytoscape_data/DB_result/b8fe540e7d164bd196883cbf19329390.tsv',  sep='\t', header=0)
results_HILIC_neg = pd.read_csv('Input/FBMN/HILIC_neg_ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-5e42a119-download_cytoscape_data/DB_result/161d834d711a40e0a7d8cc7dd11888a6.tsv',  sep='\t', header=0)

### This function cleans up the annotations

In [5]:
# This function cleans up the GNPS library results

max_ppm_accuracy = 20
def clean_up_annotations(df, ion_mode):
    print('==============')
    print('Initial:')
    print(df.shape)
    df2 = df
    df2 = df2.dropna(subset=['IonMode'])
    df2 = df2[df2.IonMode.str.contains(ion_mode)]
    print('With correct ion mode:')
    print(df2.shape[0])
    df2 =  df2[(df2['MZErrorPPM'] <= max_ppm_accuracy)]
    print('With acceptable mass tolerance:')
    print(df2.shape)
    df2 = df2.dropna(subset=['Smiles'])
    print('With SMILES:')
    print(df2.shape)
    df2 = df2[~df2.Smiles.str.contains(" ")]
    print('With valid SMILES:')
    print(df2.shape)
    print('Annotations after SMILES cleaning')
    #Remove the stereochemistry
    df['Smiles'] = df2['Smiles'].str.replace('@', '')
    print(df2.shape[0])
    print('Unique annotations')
    print(len(set(df['Smiles'])))
    print('==============')
    return df2

### Run the processing

In [6]:
print('C18_pos')
results_C18_pos_clean = clean_up_annotations(results_C18_pos,'Positive')
print('C18_neg')
results_C18_neg_clean = clean_up_annotations(results_C18_neg,'Negative')
print('HILIC_pos')
results_HILIC_pos_clean = clean_up_annotations(results_HILIC_pos,'Positive')
print('HILIC_neg')
results_HILIC_neg_clean = clean_up_annotations(results_HILIC_neg,'Negative')

C18_pos
Initial:
(396, 43)
With correct ion mode:
381
With acceptable mass tolerance:
(358, 43)
With SMILES:
(266, 43)
With valid SMILES:
(260, 43)
Annotations after SMILES cleaning
260
Unique annotations
191
C18_neg
Initial:
(127, 43)
With correct ion mode:
114
With acceptable mass tolerance:
(107, 43)
With SMILES:
(83, 43)
With valid SMILES:
(81, 43)
Annotations after SMILES cleaning
81
Unique annotations
52
HILIC_pos
Initial:
(970, 43)
With correct ion mode:
930
With acceptable mass tolerance:
(871, 43)
With SMILES:
(657, 43)
With valid SMILES:
(644, 43)
Annotations after SMILES cleaning
644
Unique annotations
436
HILIC_neg
Initial:
(366, 43)
With correct ion mode:
324
With acceptable mass tolerance:
(303, 43)
With SMILES:
(221, 43)
With valid SMILES:
(217, 43)
Annotations after SMILES cleaning
217
Unique annotations
174


### Prepare for UpSet plot

In [7]:
# Make list of SMILES
smiles_C18_pos_clean = results_C18_pos_clean.Smiles.tolist()
smiles_C18_neg_clean = results_C18_neg_clean.Smiles.tolist()
smiles_HILIC_pos_clean = results_HILIC_pos_clean.Smiles.tolist()
smiles_HILIC_neg_clean = results_HILIC_neg_clean.Smiles.tolist()

#Make a set of all SMILES
smiles_all = smiles_C18_pos_clean+smiles_C18_neg_clean+smiles_HILIC_pos_clean+smiles_HILIC_neg_clean
print('Total number of valid annotations: '+str(len(smiles_all)))
smiles_unique = set(smiles_all)
print('Total number of valid unique annotations: '+ str(len(smiles_unique)))

Total number of valid annotations: 1202
Total number of valid unique annotations: 663


In [15]:
# Making a binary dataframe for SMILES absence/presence

df = pd.DataFrame()
df["Smiles"] = list(smiles_unique)
df['C18_pos'] = 'False'
df['C18_pos'] = [+1 if x in smiles_C18_pos_clean else +0 for x in df['Smiles']]
df['C18_neg'] = [+1 if x in smiles_C18_neg_clean else +0 for x in df['Smiles']]
df['HILIC_pos'] = [+1 if x in smiles_HILIC_pos_clean else +0 for x in df['Smiles']]
df['HILIC_neg'] = [+1 if x in smiles_HILIC_neg_clean else +0 for x in df['Smiles']]
df.head(3)

Unnamed: 0,Smiles,C18_pos,C18_neg,HILIC_pos,HILIC_neg
0,OC(=O)CCCC(O)=O,0,0,0,1
1,CCC=CCC=CCC=CCC=CCCCCC(=O)OC,1,0,0,0
2,CC(=O)NC1=CC=C(O)C(=C1)C(O)=O,0,0,1,0


In [9]:
#See this repo and binder for UpSet https://github.com/hms-dbmi/upset-faculty
import altair as alt
from upsetaltair import UpSetAltair

In [18]:
UpSetAltair(
    data=df.copy(),
    title="Feature-based molecular networking: annotation distribution",
    subtitle=[
        "The annotations are derived from MS/MS spectral matching against public spectral library, and NIST17.",
        "Minimum cosine score = 0.6 | Minimum matched fragment = 3 | Maximum ppm error = 20."],
    sets=["C18_pos", "C18_neg", "HILIC_pos", "HILIC_neg"],
    abbre=["C18 ESI+", "C18 ESI-", "HILIC ESI+", "HILIC ESI-"],
    sort_by="frequency",
    sort_order="ascending",
    
    # Custom options:
    width=900,
    height=500,
    height_ratio=0.65,
    color_range=["#F0E442", "#D55E00", "#cdc7fc", "#5246b3"],
    #highlight_color="#d61e1e",
    horizontal_bar_chart_width=250,
    glyph_size=600,
    set_label_bg_size=650,
    line_connection_size=5,
    horizontal_bar_size=25,
    vertical_bar_label_size=14,
    vertical_bar_padding=14
)