# CARA feature metadata preparation - HILICneg

Author: Louis Felix Nothias, UC San Diego 

Date: 2021

### Notebook:
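Prepare and concatenate feature metadata/annotations (GNPS cluster summary, GNPS spectral library matches with and without analogue search, library-search FDR results, and SIRIUS molecular formula, CSI:FingerID and CANOPUS outputs) into a single table for Feature-Based Molecular Networking (FBMN) or Classical Molecular Networking (CMN).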


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## FEATURE BASED MOLECULAR NETWORKING

In [16]:
# Load the GNPS cluster info summary file.
gnps_table = pd.read_table('input/FBMN/HILIC_neg/clusterinfo_sum_38c367f795224836be71b18908e15117.tsv')

# Drop the group columns: every column whose name starts with
# 'ATTRIBUTE' or 'GNPSGROUP' is removed in a single pass.
gnps_table = gnps_table.loc[
    :,
    ~(gnps_table.columns.str.startswith('ATTRIBUTE')
      | gnps_table.columns.str.startswith('GNPSGROUP')),
]

# Index the rows by cluster index, then tag every remaining column with the
# 'GNPS_' prefix so its origin stays visible once tables are concatenated.
gnps_table = gnps_table.set_index(['cluster index'])
new_names = [(col, 'GNPS_' + col) for col in gnps_table.columns]
gnps_table = gnps_table.rename(columns=dict(new_names))

print(gnps_table.shape)
gnps_table.head(2)

(11230, 29)


Unnamed: 0_level_0,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,...,GNPS_SpectrumID,GNPS_SumPeakIntensity,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_neutral M mass,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,63493560.0,167,-1,,167,112.9855,1,112.9855,63493560.0
2,,,19.0,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,194325400.0,168,421,,168,359.1573,1,359.1573,194325400.0


In [17]:
# GNPS spectral library matches, indexed by scan number ('#Scan#').
annotation_table = (
    pd.read_table('input/FBMN/HILIC_neg/DB_result/161d834d711a40e0a7d8cc7dd11888a6.tsv', index_col=False)
    .set_index(['#Scan#'])
)

# Prefix every column with 'GNPS_LIB_' to mark it as a library-match field.
new_names = [(col, 'GNPS_LIB_' + col) for col in annotation_table.columns]
annotation_table = annotation_table.rename(columns=dict(new_names))

print(annotation_table.shape)
annotation_table.head(2)

(366, 42)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_FileScanUniqueID,GNPS_LIB_NumberHits,GNPS_LIB_tags,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10018,CCMSLIB00004705186,7-Methoxyflavonol,,ESI-QFT,isolated,MoNA,MoNA:VF-NPL-QEHF014123,[M-H]-,267.066,0.0,...,spectra_filtered/specs_ms.mgf10018,1,,0,0,IPRIGHIBTRMTDP-UHFFFAOYSA-N,IPRIGHIBTRMTDP,Phenylpropanoids and polyketides,Flavonoids,Flavones
10134,CCMSLIB00004721637,BUTYL PARABEN,,ESI-QFT,isolated,MoNA,MoNA:VF-NPL-QEHF028056,[M-H]-,193.087,0.0,...,spectra_filtered/specs_ms.mgf10134,1,,0,0,QFOHBWFCKVYLES-UHFFFAOYSA-N,QFOHBWFCKVYLES,Benzenoids,Benzene and substituted derivatives,Benzoic acids and derivatives


In [18]:
# GNPS spectral library matches from the analogue search, indexed by scan number ('#Scan#').
annotation_table_analogue = (
    pd.read_table('input/FBMN/HILIC_neg_analogue/eb6f4efe75f4455aa565216623e8904e.tsv', index_col=False)
    .set_index(['#Scan#'])
)

# Prefix every column with 'GNPS_LIBA_' so analogue matches stay distinguishable
# from the regular 'GNPS_LIB_' library matches.
new_names = [(col, 'GNPS_LIBA_' + col) for col in annotation_table_analogue.columns]
annotation_table_analogue = annotation_table_analogue.rename(columns=dict(new_names))

print(annotation_table_analogue.shape)
annotation_table_analogue.head(2)

(884, 45)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10044,CCMSLIB00004751530,(E)-1-nitropentadec-1-ene-histidine conjugate,LC-ESI,Orbitrap,Lysate,Ales Svatos,Riya Menezes,M-H,409.282,0.0,...,0,0,,,,,,,,
10052,CCMSLIB00003460071,Indole-3-butyric acid,ESI,HCD,Isolated,NIST,NIST,M-H,202.087,0.0,...,9,60,JTEDVYBZBROSJT-UHFFFAOYSA-N,JTEDVYBZBROSJT,Organoheterocyclic compounds,Indoles and derivatives,Indoles,Simple indole alkaloids,Simple indole alkaloids,Alkaloids


In [19]:
# Library-search FDR annotations (presumably the Passatutto workflow output,
# judging by the variable and file names), indexed by scan number ('#Scan#').
passatutto_table = (
    pd.read_table('input/FBMN/HILIC_neg/MOLECULAR-LIBRARYSEARCH-FDR-9c54147c-view_all_annotations_DB_fdr-main.tsv', index_col=False)
    .set_index(['#Scan#'])
)

# Prefix every column with 'PASSA_FDR_' to mark its origin.
new_names = [(col, 'PASSA_FDR_' + col) for col in passatutto_table.columns]
passatutto_table = passatutto_table.rename(columns=dict(new_names))

print(passatutto_table.shape)
passatutto_table.head(2)

(1754, 37)


Unnamed: 0_level_0,PASSA_FDR_Adduct,PASSA_FDR_CAS_Number,PASSA_FDR_Charge,PASSA_FDR_Compound_Name,PASSA_FDR_Compound_Source,PASSA_FDR_Data_Collector,PASSA_FDR_ExactMass,PASSA_FDR_FileScanUniqueID,PASSA_FDR_INCHI,PASSA_FDR_INCHI_AUX,...,PASSA_FDR_Smiles,PASSA_FDR_SpecCharge,PASSA_FDR_SpecMZ,PASSA_FDR_SpectrumFile,PASSA_FDR_SpectrumID,PASSA_FDR_TIC_Query,PASSA_FDR_UpdateWorkflowName,PASSA_FDR_fdr,PASSA_FDR_tags,PASSA_FDR_internalFilename
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7147,M+H-H2O,114418916,1,"D-myo-Inositol-2,3,5,6-tetraphosphate",Isolated,NIST,0.0,spectrapklbin/spec-00000.mgf7147,,,...,,1,481.066,CARA_HILIC_neg_GNPS_MS2_merged.mgf,CCMSLIB00000234712,3848.0,UPDATE-SINGLE-ANNOTATED-BRONZE,0.400752,,spec-00000.mgf
3428,467.1,470553,1,Stachyose,Isolated,NIST,0.0,spectrapklbin/spec-00000.mgf3428,,,...,,1,466.203,CARA_HILIC_neg_GNPS_MS2_merged.mgf,CCMSLIB00000270702,6875.0,UPDATE-SINGLE-ANNOTATED-BRONZE,0.400684,,spec-00000.mgf


In [20]:
# SIRIUS molecular formula identifications, indexed by the 'shared_name' column.
sirius_MF = (
    pd.read_table('input/FBMN/HILIC_neg_SIRIUS/formula_identifications_MF_network.txt')
    .set_index(['shared_name'])
)

# Prefix every column with 'SIR_MF_' to mark its origin.
new_names = [(col, 'SIR_MF_' + col) for col in sirius_MF.columns]
sirius_MF = sirius_MF.rename(columns=dict(new_names))

print(sirius_MF.shape)
sirius_MF.head(2)

(5482, 9)


Unnamed: 0_level_0,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1662,C18H32O4,[M - H]-,1.0,125.387,28,0.655,648_CARA_HILIC_neg_SIRIUS_charge0_removed_1662,-0.909,4.702
4352,C14H22O7,[M - H]-,1.0,122.416,26,0.639,3696_CARA_HILIC_neg_SIRIUS_charge0_removed_4352,-0.254,3.737


In [21]:
# CSI:FingerID compound identifications, indexed by the 'shared name' column.
# No prefix is added in this cell; the column names are used as read from the file.
sirius_CSI = (
    pd.read_table('input/FBMN/HILIC_neg_SIRIUS/compound_identifications_adducts_CSIFingerID_network.txt')
    .set_index(['shared name'])
)

print(sirius_CSI.shape)
sirius_CSI.head(2)

(339, 19)


Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3642,3,4,,-182.284,0.726,16.408,C16H10O4,[M - H]-,DACUXYCUJMGBFH,InChI=1S/C16H10O4/c17-15(18)12-7-3-2-6-11(12)1...,Oprea1_261926,C1=CC=C2C(=C1)C=C(C(=O)O2)C3=CC=CC=C3C(=O)O,3.1,927893;6954761,COCONUT:(CNP0050414 CNP0361541);PubChem:(92789...,3153986,265.049,148.834,2131_CARA_HILIC_neg_SIRIUS_charge0_removed_3642
1953,1,1,,-453.878,1.0,15.389,C24H50O4,[M - H2O - H]-,RYCPZVNAAFWRCP,InChI=1S/C24H50O4/c1-3-5-7-9-11-13-15-17-19-21...,,CCCCCCCCCCCC(O)OOC(CCCCCCCCCCC)O,10.3,54291932,COCONUT:(CNP0273813);Natural Products:(UNPD151...,3178498,383.353,49.008,3509_CARA_HILIC_neg_SIRIUS_charge0_removed_1953


In [22]:
# CANOPUS compound class summary, indexed by the 'shared name' column.
# No prefix is added in this cell; the column names are used as read from the file.
sirius_CAN = (
    pd.read_table('input/FBMN/HILIC_neg_SIRIUS/canopus_summary_CANOPUS_network.txt')
    .set_index(['shared name'])
)

print(sirius_CAN.shape)
sirius_CAN.head(2)

(684, 9)


Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
934,1100_CARA_HILIC_neg_SIRIUS_charge0_removed_934,C9H8Cl2O5S2,[M + Cl]-,Benzenesulfonyl compounds,,Benzenesulfonyl compounds,Benzene and substituted derivatives,Benzenoids,Organic compounds; Organosulfur compounds; Hal...
3642,2131_CARA_HILIC_neg_SIRIUS_charge0_removed_3642,C16H14O6,[M - H4O2 - H]-,1-benzopyrans,,1-benzopyrans,Benzopyrans,Organoheterocyclic compounds,Organic compounds; Organoheterocyclic compound...


In [23]:
# Create the master table.
# Column-wise concatenation aligns the tables on their row index (cluster index,
# '#Scan#', 'shared_name' / 'shared name'); pd.concat's default outer join keeps
# every index value that occurs in any of the input tables.
master_annotation_table = pd.concat([gnps_table, annotation_table, annotation_table_analogue,
                                     passatutto_table,
                                     sirius_MF, sirius_CSI, sirius_CAN], axis=1, sort=False)

# Name the index explicitly before turning it into a column. Relying on
# reset_index() producing a column called 'index' (and renaming that afterwards)
# only works while the concatenated index happens to be unnamed; otherwise the
# rename would silently do nothing and the '#featureID' header would be missing.
master_annotation_table = (
    master_annotation_table
    .rename_axis('#featureID')
    .reset_index(drop=False)
)

# Export the combined feature metadata as a tab-separated file.
master_annotation_table.to_csv('input/FBMN/HILICneg_feature_metadata.tsv', sep='\t', index=False)

# Rows beyond those of the GNPS table come from index values that appear only
# in the annotation tables.
print('Number of annotations without GNPS nodes = '+
      str(master_annotation_table.shape[0]-gnps_table.shape[0]))
print(master_annotation_table.shape)
master_annotation_table.head(5)

Number of annotations without GNPS nodes = 0
(11230, 191)


Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,1,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,2,,,19.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,5,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## Check consistency of the table

In [24]:
def check_annotations(number, table=None):
    """Print selected annotation fields for the first `number` rows of a table.

    For each row, prints an '===ENTRY====' separator, the values of a fixed set
    of precursor-mass / formula columns coming from the different annotation
    sources, and the GNPS cluster link-out URL, so the sources can be compared
    by eye.

    Parameters
    ----------
    number : int
        How many leading rows to print. Values larger than the number of rows
        in the table are clamped to the table length instead of raising
        IndexError; zero or negative values print nothing.
    table : pandas.DataFrame, optional
        Table to inspect. Defaults to the notebook-level
        `master_annotation_table`.

    Raises
    ------
    KeyError
        If the table lacks one of the inspected columns.
    """
    if table is None:
        # Resolved at call time so the function always sees the current master table.
        table = master_annotation_table

    columns_to_show = ['GNPS_precursor mass', 'GNPS_LIB_Precursor_MZ',
                       'GNPS_LIBA_Precursor_MZ','GNPS_LIBA_MassDiff',
                       'PASSA_FDR_LibMZ',
                       'SIR_MF_Zod_adduct','SIR_MF_Zod_molecularFormula',
                       'CSI_ionMass','CAN_molecularFormula']

    # Never index past the last row of the table.
    for x in range(min(number, len(table))):
        row = table.iloc[x]
        print('===ENTRY====')
        print(row[columns_to_show])
        print(list(row[['GNPS_GNPSLinkout_Cluster']]))
        
# Print the first 50 entries of the master table for a manual consistency check.
check_annotations(50)

===ENTRY====
GNPS_precursor mass             112.986
GNPS_LIB_Precursor_MZ               NaN
GNPS_LIBA_Precursor_MZ              NaN
GNPS_LIBA_MassDiff                  NaN
PASSA_FDR_LibMZ                     NaN
SIR_MF_Zod_adduct              [M - H]-
SIR_MF_Zod_molecularFormula     C2HF3O2
CSI_ionMass                         NaN
CAN_molecularFormula                NaN
Name: 0, dtype: object
['https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=5e42a119e494452f9942cfe4a5bcc749&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"1.0","main.cluster index_upperinput":"1.0"}']
===ENTRY====
GNPS_precursor mass               359.157
GNPS_LIB_Precursor_MZ                 NaN
GNPS_LIBA_Precursor_MZ                NaN
GNPS_LIBA_MassDiff                    NaN
PASSA_FDR_LibMZ                       NaN
SIR_MF_Zod_adduct                [M - H]-
SIR_MF_Zod_molecularFormula    C14H24N4O7
CSI_ionMass                           NaN
CAN_molecularFormula                  NaN
Name: 1, 