# CARA feature metadata preparation - C18neg

Author: Louis Felix Nothias, UC San Diego 

Date: 2021

### Notebook:
Prepare and concatenate feature metadata/annotations for Feature-Based Molecular Networking (FBMN) or Classical Molecular Networking (CMN)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## FEATURE BASED MOLECULAR NETWORKING

In [25]:
# GNPS cluster info summary file
gnps_table = pd.read_table('input/FBMN/C18_neg/clusterinfo_sum_70e7bb9ea5a44103b06c779f315ff931.tsv')

# Drop the group columns (ATTRIBUTE* and GNPSGROUP*) in a single pass
is_group_column = (gnps_table.columns.str.startswith('ATTRIBUTE')
                   | gnps_table.columns.str.startswith('GNPSGROUP'))

# Index by the GNPS cluster index, then tag every remaining column with its
# source ('GNPS_'). add_prefix only touches column labels, so the index name
# 'cluster index' is left as-is.
gnps_table = (
    gnps_table.loc[:, ~is_group_column]
    .set_index('cluster index')
    .add_prefix('GNPS_')
)
print(gnps_table.shape)
gnps_table.head(2)

(6137, 29)


Unnamed: 0_level_0,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,...,GNPS_SpectrumID,GNPS_SumPeakIntensity,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_neutral M mass,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,32.0,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,105921500.0,168,1158,,168,794.9517,1,794.9517,105921500.0
4,,,24.0,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,94187090.0,168,71,,168,744.9547,1,744.9547,94187090.0


In [26]:
# GNPS spectral library match
# index_col=False keeps pandas from promoting the first column to the index;
# the scan number column ('#Scan#') is set as the index explicitly instead.
# Every column is tagged with its source ('GNPS_LIB_').
annotation_table = (
    pd.read_table('input/FBMN/C18_neg/DB_result/3c75bcc74591496785c21fac463dc97a.tsv', index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIB_')
)
print(annotation_table.shape)
annotation_table.head(2)

(127, 42)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_FileScanUniqueID,GNPS_LIB_NumberHits,GNPS_LIB_tags,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1025,CCMSLIB00003381404,Dodecyl phosphate,ESI,HCD,Isolated,NIST,NIST,2M-H,531.322,0.0,...,spectra_filtered/specs_ms.mgf1025,1,,0,0,TVACALAUIQMRDF-UHFFFAOYSA-N,TVACALAUIQMRDF,Organic acids and derivatives,Organic phosphoric acids and derivatives,Phosphate esters
1091,CCMSLIB00000490081,21-hydroxy-heneicosanoic acid,LC-ESI,LC-ESI-QTOF,Isolated,Metlin,,M-H,341.306,0.0,...,spectra_filtered/specs_ms.mgf1091,1,,0,0,,,,,


In [27]:
# GNPS spectral library match (analogue search)
# index_col=False keeps pandas from promoting the first column to the index;
# the scan number column ('#Scan#') is set as the index explicitly instead.
# Every column is tagged with its source ('GNPS_LIBA_') so it cannot collide
# with the 'GNPS_LIB_' columns of the regular library match table.
annotation_table_analogue = (
    pd.read_table('input/FBMN/C18_neg_analogue/f8b88f7859aa442da56620323b1ac162.tsv', index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIBA_')
)
print(annotation_table_analogue.shape)
annotation_table_analogue.head(2)

(539, 45)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007,CCMSLIB00000731320,MGDG 33:0; MGDG(16:0/17:0); [M+HCOO]- C42H80O10,,In silico,In silico,Lipidblast,Lipidblast,M+HCOO,789.573,0.0,...,0,0,,,,,,,,
1020,CCMSLIB00005737078,Massbank:LQB00001 Cer[AP] t34:0,ESI,qTof,Isolated,Massbank,Massbank,M+CH3COOH-H,630.53,0.0,...,0,0,RHIXBFQKTNYVCX-UHFFFAOYSA-N,RHIXBFQKTNYVCX,Lipids and lipid-like molecules,Sphingolipids,Ceramides,Ceramides,Ceramides,Fatty acids


In [28]:
# Library search annotations from the MOLECULAR-LIBRARYSEARCH-FDR workflow
# (presumably the Passatutto FDR estimation, going by the table name)
# index_col=False keeps pandas from promoting the first column to the index;
# the scan number column ('#Scan#') is set as the index explicitly instead.
# Every column is tagged with its source ('PASSA_FDR_').
passatutto_table = (
    pd.read_table('input/FBMN/C18_neg/MOLECULAR-LIBRARYSEARCH-FDR-5aafc22e-view_all_annotations_DB_fdr-main.tsv', index_col=False)
    .set_index('#Scan#')
    .add_prefix('PASSA_FDR_')
)
print(passatutto_table.shape)
passatutto_table.head(2)

(1115, 37)


Unnamed: 0_level_0,PASSA_FDR_Adduct,PASSA_FDR_CAS_Number,PASSA_FDR_Charge,PASSA_FDR_Compound_Name,PASSA_FDR_Compound_Source,PASSA_FDR_Data_Collector,PASSA_FDR_ExactMass,PASSA_FDR_FileScanUniqueID,PASSA_FDR_INCHI,PASSA_FDR_INCHI_AUX,...,PASSA_FDR_Smiles,PASSA_FDR_SpecCharge,PASSA_FDR_SpecMZ,PASSA_FDR_SpectrumFile,PASSA_FDR_SpectrumID,PASSA_FDR_TIC_Query,PASSA_FDR_UpdateWorkflowName,PASSA_FDR_fdr,PASSA_FDR_tags,PASSA_FDR_internalFilename
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6832,M+FA-H,,1,"NCGC00381061-01_C30H51N5O9_Pyrrolo[1,2-d][1,4,...",isolated,lfnothias,625.369,spectrapklbin/spec-00000.mgf6832,InChI=1S/C30H51N5O9/c1-9-17(4)23-29(42)34(8)24...,,...,CCC(C)C1NC(=O)C2C(C)CCN2C(=O)C(CC(O)CO)OC(=O)C...,1,671.555,C18_neg_6_nodupl_gapfil35_25ppm_GNPS.mgf,CCMSLIB00000850955,3485.4,UPDATE-SINGLE-ANNOTATED-GOLD,0.279716,,spec-00000.mgf
3892,553.4,123284811.0,1,"1,2-Di(4Z,7Z,10Z,13Z,16Z,19Z-docosahexaenoyl)-...",Isolated,NIST,0.0,spectrapklbin/spec-00000.mgf3892,,,...,,1,553.479,C18_neg_6_nodupl_gapfil35_25ppm_GNPS.mgf,CCMSLIB00000252304,1160.3,UPDATE-SINGLE-ANNOTATED-BRONZE,0.279716,,spec-00000.mgf


In [29]:
# SIRIUS MOLECULAR_FORMULA
# Note: this file names its key column 'shared_name' (with an underscore),
# unlike the CSI:FingerID and CANOPUS files, which use 'shared name'.
# Every column is tagged with its source ('SIR_MF_').
sirius_MF = (
    pd.read_table('input/FBMN/C18_neg_SIRIUS/formula_identifications_MF_network.txt')
    .set_index('shared_name')
    .add_prefix('SIR_MF_')
)
print(sirius_MF.shape)
sirius_MF.head(2)

(5484, 9)


Unnamed: 0_level_0,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
29,C5H12O5,[M - H]-,1.0,84.973,12,0.954,19_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_char...,-1.966,5.223
2702,C9H19NO4,[M - H]-,1.0,77.318,15,0.89,2074_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_ch...,-0.645,4.574


In [30]:
# CSI:FingerID compound identifications, indexed by the 'shared name' column
sirius_CSI = (
    pd.read_table('input/FBMN/C18_neg_SIRIUS/compound_identifications_adducts_CSIFingerID_network.txt')
    .set_index('shared name')
)
print(sirius_CSI.shape)
sirius_CSI.head(2)

(1866, 19)


Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1310,1,1,,-268.219,1.0,14.078,C32H62O5,[M - H]-,OJZZNYHBEAPMQA,InChI=1S/C32H62O5/c1-3-5-7-9-11-13-15-16-17-19...,,CCCCCCCCCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCC)O,12.4,98117735;98117736;132939384,HMDB:(93212 93613);PubChem:(98117735 98117736 ...,16777226,525.451,140.766,979_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_cha...
7537,3,3,,-44.316,1.0,30.663,C13H24O4,[M - H]-,DXNCZXXFRKPEPY,InChI=1S/C13H24O4/c14-12(15)10-8-6-4-2-1-3-5-7...,Brassilate,C(CCCCCC(=O)O)CCCCCC(=O)O,3.511,10458;6994474,HMDB:(2327);SuperNatural:(SN00002517 SN0021618...,87842926,243.16,43.384,5544_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_ch...


In [31]:
# CANOPUS compound class summary, indexed by the 'shared name' column
sirius_CAN = (
    pd.read_table('input/FBMN/C18_neg_SIRIUS/canopus_summary_CANOPUS_network.txt')
    .set_index('shared name')
)
print(sirius_CAN.shape)
sirius_CAN.head(2)

(3934, 9)


Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6309,4761_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_ch...,C37H74O3S2,[M - H2O - H]-,Fatty acids and conjugates,,Fatty acids and conjugates,Fatty Acyls,Lipids and lipid-like molecules,Organic compounds; Organosulfur compounds; Lip...
1310,979_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_cha...,C32H62O5,[M - H]-,Long-chain fatty acids,Long-chain fatty acids,Fatty acids and conjugates,Fatty Acyls,Lipids and lipid-like molecules,Organic compounds; Lipids and lipid-like molec...


In [34]:
# Create the master table.
# A column-wise concat aligns the tables on their index (the feature ID) and,
# with the default outer join, keeps the union of all IDs; cells with no
# annotation from a given tool are left as NaN.
master_annotation_table = pd.concat([gnps_table, annotation_table, annotation_table_analogue,
                                     passatutto_table,
                                     sirius_MF, sirius_CSI, sirius_CAN], axis=1, sort=False)

# Name the index explicitly before turning it into a column. Relying on
# reset_index() producing a column called 'index' only works while the
# concatenated index happens to be unnamed.
master_annotation_table = master_annotation_table.rename_axis('#featureID').reset_index()
master_annotation_table.to_csv('input/FBMN/C18neg_feature_metadata.tsv', sep='\t', index=False)

# Rows beyond the GNPS table are feature IDs that appear only in the other tables
print('Number of annotations without GNPS nodes = '+
      str(master_annotation_table.shape[0]-gnps_table.shape[0]))
print(master_annotation_table.shape)
master_annotation_table.head(5)

Number of annotations without GNPS nodes = 18
(6155, 191)


Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,2,,,32.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,4,,,24.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,5,,,86.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,6,,,14.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0_C18_neg_6_nodupl_gapfil35_25ppm_SIRIUS_charg...,C13H6F24O4,[M - H4O2 - H]-,Perfluoroalkyl carboxylic acid and derivatives,Perfluoroalkyl carboxylic acid and derivatives,Alkyl fluorides,Alkyl halides,Organohalogen compounds,Organic compounds; Lipids and lipid-like molec...
4,7,,,39.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## Check consistency of the table

In [33]:
def check_annotations(number, table=None):
    """Print key mass, formula and link fields for the first rows of a table.

    Intended as a visual check that the values contributed by the different
    annotation sources sit on the same row after concatenation.

    Parameters
    ----------
    number : int
        Number of leading rows to print. Values larger than the number of
        rows are clipped to the table length instead of raising IndexError.
    table : pandas.DataFrame, optional
        Table to inspect. Defaults to the notebook-level
        ``master_annotation_table``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if table is None:
        table = master_annotation_table

    # Columns from each annotation source that are compared by eye
    checked_columns = ['GNPS_precursor mass', 'GNPS_LIB_Precursor_MZ',
                       'GNPS_LIBA_Precursor_MZ', 'GNPS_LIBA_MassDiff',
                       'PASSA_FDR_LibMZ',
                       'SIR_MF_Zod_adduct', 'SIR_MF_Zod_molecularFormula',
                       'CSI_ionMass', 'CAN_molecularFormula']

    # Never index past the last row of the table
    n_rows = min(number, len(table))
    for x in range(n_rows):
        row = table.iloc[x]
        print('===ENTRY====')
        print(row[checked_columns])
        # Printed as a one-element list so the full link is shown
        print(list(row[['GNPS_GNPSLinkout_Cluster']]))
        
# Print the first 25 rows of the master table for a visual consistency check
check_annotations(number=25)

===ENTRY====
GNPS_precursor mass            794.952
GNPS_LIB_Precursor_MZ              NaN
GNPS_LIBA_Precursor_MZ             NaN
GNPS_LIBA_MassDiff                 NaN
PASSA_FDR_LibMZ                    NaN
SIR_MF_Zod_adduct                  NaN
SIR_MF_Zod_molecularFormula        NaN
CSI_ionMass                        NaN
CAN_molecularFormula               NaN
Name: 0, dtype: object
['https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=2d9c9fec24f540b89cf45f2c4d073ad7&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"2","main.cluster index_upperinput":"2"}']
===ENTRY====
GNPS_precursor mass            744.955
GNPS_LIB_Precursor_MZ              NaN
GNPS_LIBA_Precursor_MZ             NaN
GNPS_LIBA_MassDiff                 NaN
PASSA_FDR_LibMZ                    NaN
SIR_MF_Zod_adduct                  NaN
SIR_MF_Zod_molecularFormula        NaN
CSI_ionMass                        NaN
CAN_molecularFormula               NaN
Name: 1, dtype: object
['https://gnps.ucsd.edu/Pr