# CARA feature metadata preparation - HILICpos

Author: Louis Felix Nothias, UC San Diego 

Date: 2021

### Notebook:
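Prepare and concatenate feature metadata/annotations for Feature-Based Molecular Networking (FBMN) or Classical Molecular Networking (CMN). The GNPS cluster info, GNPS library matches (standard and analogue search), Passatutto FDR results, and SIRIUS outputs (molecular formula, CSI:FingerID, CANOPUS) for the HILIC positive-mode data are loaded, joined on their feature identifier, and written to a single tab-separated feature metadata table.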


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## FEATURE BASED MOLECULAR NETWORKING

In [2]:
# GNPS cluster info summary file.
# Pipeline: read the table -> drop the group columns (names starting with
# 'ATTRIBUTE' or 'GNPSGROUP') -> index by 'cluster index' -> tag every remaining
# column with the 'GNPS_' prefix so its origin stays visible after merging.
gnps_table = (
    pd.read_table('input/FBMN/HILIC_pos/clusterinfo_sum_ed5f63b459354e8993c882d9fd7e2ad5.tsv')
    .loc[:, lambda frame: ~(frame.columns.str.startswith('ATTRIBUTE')
                            | frame.columns.str.startswith('GNPSGROUP'))]
    .set_index('cluster index')
    .add_prefix('GNPS_')
)
print(gnps_table.shape)
gnps_table.head(2)

(28762, 29)


Unnamed: 0_level_0,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,...,GNPS_SpectrumID,GNPS_SumPeakIntensity,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_neutral M mass,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,7.0,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,89892640.0,168,1,,168,450.2323,1,450.2323,89892640.0
2,,,,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,,118808400.0,166,27,,166,419.221,1,419.221,118808400.0


In [3]:
# GNPS spectral library match.
# index_col=False stops pandas from using the first column as the index; the
# table is then indexed by '#Scan#' and every column gets the 'GNPS_LIB_' prefix.
annotation_table = (
    pd.read_table('input/FBMN/HILIC_pos/DB_result/b8fe540e7d164bd196883cbf19329390.tsv',
                  index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIB_')
)
print(annotation_table.shape)
annotation_table.head(2)

(970, 42)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_FileScanUniqueID,GNPS_LIB_NumberHits,GNPS_LIB_tags,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,CCMSLIB00003303604,Hexaethylene glycol,ESI,HCD,Isolated,NIST,NIST,M+H,283.175,0.0,...,spectra_filtered/specs_ms.mgf1002,1,,38,2055,IIRDTKBZINWQAW-UHFFFAOYSA-N,IIRDTKBZINWQAW,Organic oxygen compounds,Organooxygen compounds,Ethers
10032,CCMSLIB00000516452,Arg Gly Phe,LC-ESI,LC-ESI-QTOF,Isolated,Metlin,,M+H,379.209,0.0,...,spectra_filtered/specs_ms.mgf10032,1,,0,0,,,,,


In [4]:
# GNPS spectral library match, analogue search.
# Same layout as the standard library-match table: indexed by '#Scan#', with
# every column tagged 'GNPS_LIBA_' to keep it distinct from the 'GNPS_LIB_' set.
annotation_table_analogue = (
    pd.read_table('input/FBMN/HILIC_pos_analogue/cb903a9cf14f417eb382577759e8b68e.tsv',
                  index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIBA_')
)
print(annotation_table_analogue.shape)
annotation_table_analogue.head(2)

(2332, 45)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,CCMSLIB00003190755,Tetraethylene glycol,ESI,Q-TOF,Isolated,NIST,NIST,M+H,195.123,0.0,...,0,0,UWHCKJMYHZGTIT-UHFFFAOYSA-N,UWHCKJMYHZGTIT,Organic oxygen compounds,Organooxygen compounds,Ethers,,,Fatty acids
10032,CCMSLIB00000523254,Arg Gly Ile,LC-ESI,LC-ESI-QTOF,Isolated,Metlin,,M+H,345.224,0.0,...,0,0,,,,,,,,


In [5]:
# Library search results with FDR (file: MOLECULAR-LIBRARYSEARCH-FDR ...).
# Indexed by '#Scan#'; every column is tagged with the 'PASSA_FDR_' prefix.
passatutto_table = (
    pd.read_table('input/FBMN/HILIC_pos/MOLECULAR-LIBRARYSEARCH-FDR-876069e3-view_all_annotations_DB_fdr-main.tsv',
                  index_col=False)
    .set_index('#Scan#')
    .add_prefix('PASSA_FDR_')
)
print(passatutto_table.shape)
passatutto_table.head(2)

(3054, 37)


Unnamed: 0_level_0,PASSA_FDR_Adduct,PASSA_FDR_CAS_Number,PASSA_FDR_Charge,PASSA_FDR_Compound_Name,PASSA_FDR_Compound_Source,PASSA_FDR_Data_Collector,PASSA_FDR_ExactMass,PASSA_FDR_FileScanUniqueID,PASSA_FDR_INCHI,PASSA_FDR_INCHI_AUX,...,PASSA_FDR_Smiles,PASSA_FDR_SpecCharge,PASSA_FDR_SpecMZ,PASSA_FDR_SpectrumFile,PASSA_FDR_SpectrumID,PASSA_FDR_TIC_Query,PASSA_FDR_UpdateWorkflowName,PASSA_FDR_fdr,PASSA_FDR_tags,PASSA_FDR_internalFilename
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26671,M+Na,1217458586,1,13(S)-HODE-biotin,Isolated,NIST,0.0,spectrapklbin/spec-00000.mgf26671,,,...,,1,558.515,CARA_HILIC_pos_GNPS.mgf,CCMSLIB00000232831,2194.0,UPDATE-SINGLE-ANNOTATED-BRONZE,0.268853,,spec-00000.mgf
5854,M+Na,50033,1,Hydrocortisone 21-acetate,Isolated,NIST,0.0,spectrapklbin/spec-00000.mgf5854,,,...,,1,426.372,CARA_HILIC_pos_GNPS.mgf,CCMSLIB00000349460,4026.0,UPDATE-SINGLE-ANNOTATED-BRONZE,0.268743,,spec-00000.mgf


In [6]:
# SIRIUS molecular formula identifications.
# Indexed by 'shared_name'; every column is tagged with the 'SIR_MF_' prefix.
sirius_MF = (
    pd.read_table('input/FBMN/HILIC_pos_SIRIUS/formula_identifications_MF_network.txt')
    .set_index('shared_name')
    .add_prefix('SIR_MF_')
)
print(sirius_MF.shape)
sirius_MF.head(2)

(6247, 9)


Unnamed: 0_level_0,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
16471,C18H31NO,[M + H]+,1.0,365.228,55,0.984,4043_CARA_HILIC_pos_SIRIUS_merge_16471,0.212,14.877
7,C13H22O2,[M + H3N + H]+,1.0,328.799,51,0.919,4767_CARA_HILIC_pos_SIRIUS_merge_7,-2.653,15.628


In [7]:
# CSI:FingerID compound identifications.
# Indexed by 'shared name' (with a space, unlike the molecular formula table).
# No prefix is added in this cell; column names are kept as found in the file.
sirius_CSI = (
    pd.read_table('input/FBMN/HILIC_pos_SIRIUS/compound_identifications_adducts_CSIFingerID_network.txt')
    .set_index('shared name')
)
print(sirius_CSI.shape)
sirius_CSI.head(2)

(3437, 19)


Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
28353,3,3,,-39.93,1.0,35.047,C6H10O4,[M - H2O + H]+,WNLRTRBMVRJNCN,"InChI=1S/C6H10O4/c7-5(8)3-1-2-4-6(9)10/h1-4H2,...",Adi-pure,C(CCC(=O)O)CC(=O)O,0.366,196;200164;3316652;12209289;12209291;12209295;...,HMDB:(448);PubChem class - food;SuperNatural:(...,34581965694,129.054,148.132,352_CARA_HILIC_pos_SIRIUS_merge_28353
3343,1,1,,-128.85,1.0,9.811,C25H50O,[M + H3N + H]+,HAGKFWXVDSAFHB,InChI=1S/C25H50O/c1-2-3-4-5-6-7-8-9-10-11-12-1...,pentacosanal,CCCCCCCCCCCCCCCCCCCCCCCCC=O,11.9,181174,COCONUT:(CNP0241264);Natural Products:(UNPD102...,3178498,384.417,106.946,2885_CARA_HILIC_pos_SIRIUS_merge_3343


In [8]:
# CANOPUS summary.
# Indexed by 'shared name'; column names are kept as found in the file.
sirius_CAN = (
    pd.read_table('input/FBMN/HILIC_pos_SIRIUS/canopus_summary_CANOPUS_network.txt')
    .set_index('shared name')
)
print(sirius_CAN.shape)
sirius_CAN.head(2)

(5598, 9)


Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
28353,352_CARA_HILIC_pos_SIRIUS_merge_28353,C6H12O5,[M - H4O2 + H]+,Medium-chain hydroxy acids and derivatives,,Medium-chain hydroxy acids and derivatives,Hydroxy acids and derivatives,Organic acids and derivatives,Organic compounds; Carbohydrates and carbohydr...
3343,2885_CARA_HILIC_pos_SIRIUS_merge_3343,C25H55NO2,[M - H2O + H]+,"1,2-aminoalcohols",Alkanolamines,Amines,Organonitrogen compounds,Organic nitrogen compounds,Organic compounds; Alcohols and polyols; Organ...


In [11]:
# Create the master table: place all annotation tables side by side (axis=1),
# aligning rows on each table's index (cluster index / #Scan# / shared name).
# pd.concat keeps the union of index values, so an identifier missing from a
# table simply gets NaN in that table's columns.
master_annotation_table = pd.concat([gnps_table, annotation_table, annotation_table_analogue,
                                     passatutto_table,
                                     sirius_MF, sirius_CSI, sirius_CAN], axis=1, sort=False)

# Name the index explicitly before turning it into a column. reset_index() only
# emits a column called 'index' when the index is unnamed, so renaming 'index'
# afterwards would silently do nothing if the concatenated index carried a name.
master_annotation_table = (
    master_annotation_table
    .rename_axis('#featureID')
    .reset_index(drop=False)
)
master_annotation_table.to_csv('input/FBMN/HILICpos_feature_metadata.tsv', sep='\t', index=False)

# Rows beyond those of the GNPS table come from identifiers that appear only in
# the other annotation tables.
print('Number of annotations without GNPS nodes = '+
      str(master_annotation_table.shape[0]-gnps_table.shape[0]))
print(master_annotation_table.shape)
master_annotation_table.head(5)

Number of annotations without GNPS nodes = 0
(28762, 191)


Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,1,,,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,358_CARA_HILIC_pos_SIRIUS_merge_1,358_CARA_HILIC_pos_SIRIUS_merge_1,C20H37NO11,[M - H2O + H]+,Aminoglycosides,Aminosaccharides,Carbohydrates and carbohydrate conjugates,Organooxygen compounds,Organic oxygen compounds,Organic compounds; Organoheterocyclic compound...
1,2,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,6308_CARA_HILIC_pos_SIRIUS_merge_2,6308_CARA_HILIC_pos_SIRIUS_merge_2,C23H26N6O2,[M + H]+,Benzene and substituted derivatives,,,Benzene and substituted derivatives,Benzenoids,Organic compounds; Organoheterocyclic compound...
2,3,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4,1.0,[M+H]+,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,184_CARA_HILIC_pos_SIRIUS_merge_4,184_CARA_HILIC_pos_SIRIUS_merge_4,C24H30O6,[M + H]+,Dibenzylbutane lignans,,,Dibenzylbutane lignans,"Lignans, neolignans and related compounds",Organic compounds; Organoheterocyclic compound...
4,5,,,54.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,5870_CARA_HILIC_pos_SIRIUS_merge_5,C32H35N9O4,[M + H]+,Benzene and substituted derivatives,,,Benzene and substituted derivatives,Benzenoids,Organic compounds; Organoheterocyclic compound...


## Check consistency of the table

In [10]:
def check_annotations(number, table=None):
    """Print a side-by-side summary of the key annotation fields for the first rows.

    For each of the first ``number`` rows, prints a separator line, the precursor
    mass / library m/z / molecular formula fields coming from the different
    annotation sources, and the GNPS cluster link-out URL, so the values can be
    compared by eye.

    Parameters
    ----------
    number : int
        How many leading rows to print. Values larger than the number of rows
        in the table are clamped to the table length instead of raising
        ``IndexError``.
    table : pandas.DataFrame, optional
        Table to inspect. Defaults to the notebook-level
        ``master_annotation_table``.

    Raises
    ------
    KeyError
        If any of the expected summary columns is missing from the table.
    """
    if table is None:
        # Resolved at call time so the function picks up the current master table.
        table = master_annotation_table

    summary_columns = ['GNPS_precursor mass', 'GNPS_LIB_Precursor_MZ',
                       'GNPS_LIBA_Precursor_MZ', 'GNPS_LIBA_MassDiff',
                       'PASSA_FDR_LibMZ',
                       'SIR_MF_Zod_adduct', 'SIR_MF_Zod_molecularFormula',
                       'CSI_ionMass', 'CAN_molecularFormula']

    # Never iterate past the last row of the table.
    for x in range(min(number, len(table))):
        row = table.iloc[x]
        print('===ENTRY====')
        print(row[summary_columns])
        print(list(row[['GNPS_GNPSLinkout_Cluster']]))
        
# Spot-check the first 25 rows of the master table.
check_annotations(number=25)

===ENTRY====
GNPS_precursor mass                   450.232
GNPS_LIB_Precursor_MZ                     NaN
GNPS_LIBA_Precursor_MZ                    NaN
GNPS_LIBA_MassDiff                        NaN
PASSA_FDR_LibMZ                        451.27
SIR_MF_Zod_adduct              [M + H3N + H]+
SIR_MF_Zod_molecularFormula         C20H32O10
CSI_ionMass                           450.233
CAN_molecularFormula               C20H37NO11
Name: 0, dtype: object
['https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=b699fd88a01f431591b91dc324f51255&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"1","main.cluster index_upperinput":"1"}']
===ENTRY====
GNPS_precursor mass               419.221
GNPS_LIB_Precursor_MZ                 NaN
GNPS_LIBA_Precursor_MZ                NaN
GNPS_LIBA_MassDiff                    NaN
PASSA_FDR_LibMZ                       NaN
SIR_MF_Zod_adduct                [M + H]+
SIR_MF_Zod_molecularFormula    C23H26N6O2
CSI_ionMass                        419.22
C