# CARA feature metadata preparation - C18pos

Author: Louis Felix Nothias, UC San Diego 

Date: 2021

### Notebook:
Prepare and concatenate feature metadata/annotations for Feature-Based Molecular Networking (FBMN) or Classical Molecular Networking (CMN)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## FEATURE BASED MOLECULAR NETWORKING

In [2]:
# Load the GNPS cluster info summary file (one row per cluster index)
gnps_raw = pd.read_table('input/FBMN/C18_pos/clusterinfo_sum_bd978581a943474da418b2fe8f8bf72c.tsv')

# Drop the group columns (names starting with ATTRIBUTE or GNPSGROUP)
is_group_column = (gnps_raw.columns.str.startswith('ATTRIBUTE')
                   | gnps_raw.columns.str.startswith('GNPSGROUP'))

# Index by cluster index and tag every remaining column with its source
gnps_table = (
    gnps_raw.loc[:, ~is_group_column]
    .set_index(['cluster index'])
    .add_prefix('GNPS_')
)
print(gnps_table.shape)
gnps_table.head(2)

(15089, 29)


Unnamed: 0_level_0,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,...,GNPS_SpectrumID,GNPS_SumPeakIntensity,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_neutral M mass,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,CCMSLIB00003270770,3814338000.0,168,1077,,168,338.341,1,338.341,3814338000.0
2,,,,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,2520604000.0,168,145,,168,675.6743,1,675.6743,2520604000.0


In [3]:
# GNPS spectral library matches, indexed by scan number;
# every column is tagged with the GNPS_LIB_ prefix
annotation_table = (
    pd.read_table('input/FBMN/C18_pos/DB_result/601e043681dc4b5da452d6e7a1eaff6d.tsv', index_col=False)
    .set_index(['#Scan#'])
    .add_prefix('GNPS_LIB_')
)
print(annotation_table.shape)
annotation_table.head(2)

(396, 42)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_FileScanUniqueID,GNPS_LIB_NumberHits,GNPS_LIB_tags,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CCMSLIB00003270770,Erucamide,ESI,HCD,Isolated,NIST,NIST,M+H,338.342,0.0,...,spectra_filtered/specs_ms.mgf1,1,,111,4803,UAUDZVJPLUQNMU-UHFFFAOYSA-N,UAUDZVJPLUQNMU,Lipids and lipid-like molecules,Fatty Acyls,Fatty amides
1006,CCMSLIB00003610088,3-Ketocholesterol,ESI,HCD,Isolated,NIST,NIST,M+H,385.346,0.0,...,spectra_filtered/specs_ms.mgf1006,1,,46,831,GGCLNOIGPMGLDB-UHFFFAOYSA-N,GGCLNOIGPMGLDB,Lipids and lipid-like molecules,Steroids and steroid derivatives,Cholestane steroids


In [4]:
# GNPS spectral library matches from the analogue search, indexed by scan number;
# every column is tagged with the GNPS_LIBA_ prefix
annotation_table_analogue = (
    pd.read_table('input/FBMN/C18_pos_analogue/DB_result_analogue_fbee22d5de5e4d699c7db5b3d72b2857.tsv', index_col=False)
    .set_index(['#Scan#'])
    .add_prefix('GNPS_LIBA_')
)
print(annotation_table_analogue.shape)
annotation_table_analogue.head(2)

(1501, 45)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CCMSLIB00003721897,cis-13-Docosenoic acid,ESI,HCD,Isolated,NIST,NIST,M+H-H2O,321.315,0.0,...,98,5771,DPUOLQHDNGRHBS-UHFFFAOYSA-N,DPUOLQHDNGRHBS,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,Unsaturated fatty acids,Unsaturated fatty acids,Fatty acids
1001,CCMSLIB00003088149,TG(15:0/18:1/18:2); [M+NH4]+ C54H102N1O6,LC-ESI,CID; Lumos,Commercial,Thomas Metz,Thomas Metz,M+NH4,842.736,0.0,...,2,51,,,,,,,,


In [5]:
# Library search annotations with FDR values (MOLECULAR-LIBRARYSEARCH-FDR output),
# indexed by scan number; every column is tagged with the PASSA_FDR_ prefix
passatutto_table = (
    pd.read_table('input/FBMN/C18_pos/MOLECULAR-LIBRARYSEARCH-FDR-c4e63a5e-view_all_annotations_DB_fdr-main.tsv', index_col=False)
    .set_index(['#Scan#'])
    .add_prefix('PASSA_FDR_')
)
print(passatutto_table.shape)
passatutto_table.head(2)

(2197, 37)


Unnamed: 0_level_0,PASSA_FDR_Adduct,PASSA_FDR_CAS_Number,PASSA_FDR_Charge,PASSA_FDR_Compound_Name,PASSA_FDR_Compound_Source,PASSA_FDR_Data_Collector,PASSA_FDR_ExactMass,PASSA_FDR_FileScanUniqueID,PASSA_FDR_INCHI,PASSA_FDR_INCHI_AUX,...,PASSA_FDR_Smiles,PASSA_FDR_SpecCharge,PASSA_FDR_SpecMZ,PASSA_FDR_SpectrumFile,PASSA_FDR_SpectrumID,PASSA_FDR_TIC_Query,PASSA_FDR_UpdateWorkflowName,PASSA_FDR_fdr,PASSA_FDR_tags,PASSA_FDR_internalFilename
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9100,M+Na,53003104.0,1,Salinomycin,Isolated,NIST,0.0,spectrapklbin/spec-00000.mgf9100,,,...,,1,774.729,CARA_C18_pos_FBMN_GNPS_MERGED.mgf,CCMSLIB00000318635,26570.0,UPDATE-SINGLE-ANNOTATED-BRONZE,0.228852,,spec-00000.mgf
5184,[M+NH4]+,,1,MoNA:38837 Triacylglycerol 18:2-18:2-18:2,isolated,MoNA,0.0,spectrapklbin/spec-00000.mgf5184,InChI=1S/C57H98O6/c1-4-7-10-13-16-19-22-25-28-...,,...,,1,894.805,CARA_C18_pos_FBMN_GNPS_MERGED.mgf,CCMSLIB00000563325,37864.0,UPDATE-SINGLE-ANNOTATED-BRONZE,0.228852,,spec-00000.mgf


In [6]:
# SIRIUS molecular formula identifications, indexed by 'shared name';
# every column is tagged with the SIR_MF_ prefix
sirius_MF = (
    pd.read_table('input/FBMN/C18_pos_SIRIUS/formula_identifications_MF_network.txt')
    .set_index(['shared name'])
    .add_prefix('SIR_MF_')
)
print(sirius_MF.shape)
sirius_MF.head(2)

(7407, 9)


Unnamed: 0_level_0,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4004,C18H32O2,[M + H]+,1.0,230.956,53,0.965,5280_CARA_C18_pos_SIRIUS_4004,-7.491,7.408
7092,C18H35N5O,[M + H3N + H]+,1.0,217.787,39,0.31,2540_CARA_C18_pos_SIRIUS_7092,2.572,4.979


In [7]:
# CSI:FingerID compound identifications, indexed by 'shared name'.
# No prefix is added here: the column names are used exactly as read from the file.
sirius_CSI = (
    pd.read_table('input/FBMN/C18_pos_SIRIUS/compound_identifications_adducts_CSIFingerID_network.txt')
    .set_index(['shared name'])
)
print(sirius_CSI.shape)
sirius_CSI.head(2)

(3042, 19)


Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
15547,4,4,,-157.54,1.0,11.994,C22H40O3,[M + H3N + H]+,ZJFCVUTYZHUNSW,InChI=1S/C22H40O3/c1-2-3-4-5-6-7-8-9-10-11-12-...,"2, dihydro-3-octadecyl-",CCCCCCCCCCCCCCCCCCC1CC(=O)OC1=O,9.1,96562;25022020;97292209,NORMAN:(NS00059266);PubChem class - safety and...,67371010,370.33,108.743,421_CARA_C18_pos_SIRIUS_15547
5376,3,3,,-131.259,0.999,4.316,C37H73NO5,[M - H2O + H]+,UWTLPHGXKLBELN,InChI=1S/C37H73NO5/c1-3-5-7-9-11-13-15-17-19-2...,,CCCCCCCCCCCCCCCC(=O)OCC(COCCN)OC(=O)CCCCCCCCCC...,13.6,137348728,PubChem:(137348728);PubChem class - bio and me...,16777218,594.544,217.966,6760_CARA_C18_pos_SIRIUS_5376


In [8]:
# CANOPUS compound class summary, indexed by 'shared name'.
# No prefix is added here: the column names are used exactly as read from the file.
sirius_CAN = (
    pd.read_table('input/FBMN/C18_pos_SIRIUS/canopus_summary_CANOPUS_network.txt')
    .set_index(['shared name'])
)
print(sirius_CAN.shape)
sirius_CAN.head(2)

(5502, 9)


Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15547,421_CARA_C18_pos_SIRIUS_15547,C22H43NO3,[M + H]+,Alpha amino acid esters,Amino acids and derivatives,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,Organic compounds; Lipids and lipid-like molec...
5376,6760_CARA_C18_pos_SIRIUS_5376,C37H75NO6,[M - H4O2 + H]+,Glycosyl-N-acylsphingosines,Neutral glycosphingolipids,Glycosphingolipids,Sphingolipids,Lipids and lipid-like molecules,Organic compounds; Carbohydrates and carbohydr...


In [9]:
# Create the master table: place all annotation sources side by side,
# aligned on their index, then expose that index as the '#featureID' column
annotation_sources = [gnps_table, annotation_table, annotation_table_analogue,
                      passatutto_table,
                      sirius_MF, sirius_CSI, sirius_CAN]
master_annotation_table = (
    pd.concat(annotation_sources, axis=1, sort=False)
    .reset_index(drop=False)
    .rename(columns={'index': '#featureID'})
)
master_annotation_table.to_csv('input/FBMN/C18pos_feature_metadata.tsv', sep='\t', index=False)

# Rows in excess of the GNPS table are index values absent from the GNPS table
n_without_gnps_node = master_annotation_table.shape[0] - gnps_table.shape[0]
print('Number of annotations without GNPS nodes = ' + str(n_without_gnps_node))
print(master_annotation_table.shape)
master_annotation_table.head(5)

Number of annotations without GNPS nodes = 42
(15131, 191)


Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,1,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,2,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,6298_CARA_C18_pos_SIRIUS_3,6298_CARA_C18_pos_SIRIUS_3,C27H53NO2,[M - H4O2 + H]+,Sphingolipids,,,Sphingolipids,Lipids and lipid-like molecules,Organic compounds; Lipids and lipid-like molec...
3,4,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,5,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## Check consistency of the table

In [10]:
def check_annotations(number, table=None):
    """Print selected mass/formula columns for the first rows of a table.

    For each of the first ``number`` rows, prints an ``===ENTRY====`` marker
    followed by that row restricted to a fixed set of GNPS, library-match,
    Passatutto FDR, SIRIUS, CSI and CANOPUS columns, so the values coming
    from the different sources can be compared by eye.

    Parameters
    ----------
    number : int
        Number of leading rows to print. If it exceeds the number of rows in
        the table, only the available rows are printed.
    table : pandas.DataFrame, optional
        Table to inspect. When omitted, the notebook-level
        ``master_annotation_table`` is used.

    Raises
    ------
    KeyError
        If a row is printed and one of the inspected columns is missing.
    """
    if table is None:
        # Backward-compatible default: fall back to the notebook-level table
        table = master_annotation_table

    columns_to_check = ['GNPS_precursor mass', 'GNPS_LIB_Precursor_MZ',
                        'GNPS_LIBA_Precursor_MZ', 'GNPS_LIBA_MassDiff',
                        'PASSA_FDR_LibMZ',
                        'SIR_MF_Zod_adduct', 'SIR_MF_Zod_molecularFormula',
                        'CSI_ionMass', 'CAN_molecularFormula']

    # Clamp to the table length so a large `number` cannot raise IndexError
    for x in range(min(number, len(table))):
        print('===ENTRY====')
        print(table.iloc[x][columns_to_check])
        
# Print the selected mass/formula columns for the first 10 rows of the master table
check_annotations(10)

===ENTRY====
GNPS_precursor mass            338.341
GNPS_LIB_Precursor_MZ          338.342
GNPS_LIBA_Precursor_MZ         321.315
GNPS_LIBA_MassDiff             17.0258
PASSA_FDR_LibMZ                338.341
SIR_MF_Zod_adduct                  NaN
SIR_MF_Zod_molecularFormula        NaN
CSI_ionMass                        NaN
CAN_molecularFormula               NaN
Name: 0, dtype: object
===ENTRY====
GNPS_precursor mass            675.674
GNPS_LIB_Precursor_MZ              NaN
GNPS_LIBA_Precursor_MZ             NaN
GNPS_LIBA_MassDiff                 NaN
PASSA_FDR_LibMZ                675.676
SIR_MF_Zod_adduct                  NaN
SIR_MF_Zod_molecularFormula        NaN
CSI_ionMass                        NaN
CAN_molecularFormula               NaN
Name: 1, dtype: object
===ENTRY====
GNPS_precursor mass                    388.393
GNPS_LIB_Precursor_MZ                      NaN
GNPS_LIBA_Precursor_MZ                     NaN
GNPS_LIBA_MassDiff                         NaN
PASSA_FDR_LibMZ          