In [None]:
# This notebook records the code used for library preprocessing.
# The diagram below shows the relative paths of the input and output files and folders.
# Compressed archives of three folders (notebook, src, and msdb) are provided at https://zenodo.org/uploads/17065209
# Most input and output files are included, except for very large files (exceeding 20 GB) such as the pairwise similarity matrix of each in-silico library.
# Before running this code, make sure the notebook sits inside the MSanalyst folder cloned from git.
'''
MSanalyst/
    ├── notebook/
    ├── src/
    └── msdb/
        ├── GNPSLIBRARY/
        |   ├── ISspec_demo/ (input) A demo of generating in-silico spectra files using cfmid is given
        |   ├── ALL_GNPS_NO_PROPOGATED.mgf (input)
        |   ├── GNPS-LIBRARY.mgf (input)
        |   ├── edbMS1.csv (output)
        |   ├── edb_info.json (output)
        |   ├── isdbMS1.csv (output)
        |   ├── isdb_info.json (output)
        ├── GNPSLIBRARY_250514/
        |   └── ALL_GNPS_NO_PROPOGATED.mgf (input)
        ├── data/
        |   ├── hqtof/
        |   |   ├── idlist/
        |   |   |   └── H_qtof_non-redundant_CCMSIDs.npy (output)
        |   |   └── FS_hqtof.json (output)
        |   └── idlist/
        └── FS_isdb_e0.json (output) # Flash format for further
        └── FS_isdb_e1.json (output) # Flash format
        └── FS_isdb_e2.json (output) # Flash format
        └── isdb_info.json (input)
'''

# Import library

In [1]:
import sys, time,json,os,re,requests
sys.argv = ['jupyter']  # Clear the parameters passed to Jupyter
sys.path.append('../')
from rdkit import RDLogger
import numpy as np
import pandas as pd
from rdkit import Chem
from my_packages import functions_new, cheminfo_tools
from ms_entropy import FlashEntropySearch
from collections import Counter
from my_packages import functions_new
from spectral_entropy import calculate_entropy
from tqdm import tqdm, trange

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


# Create library for MSanalyst
## Experimental library

In [2]:
# Load the experimental library spectra from ALL_GNPS_NO_PROPOGATED.mgf.
# The MGF file is too large for GitHub; download it from https://external.gnps2.org/gnpslibrary
# The dataset is updated continuously, so filtering results may differ slightly between downloads.
DIR = '../msdb/GNPSLIBRARY/'
ALL_GNPS_NP_file = os.path.join(DIR, 'ALL_GNPS_NO_PROPOGATED.mgf')
ALL_GNPS_NP = functions_new.load_spectra_from_file(ALL_GNPS_NP_file)

In [None]:
# Collect the metadata and peak list of every positive-mode spectrum, keyed by CCMSID.
# Each entry stores: pepmass, charge, ionmode, smiles, inchi and ms2, where ms2 is a
# list of [m/z, intensity] pairs (a plain list so the dict can be dumped to JSON).
# NOTE(review): `.get` assumes SPECTRUM.metadata is a dict (as for matchms Spectrum
# objects) — confirm against functions_new.load_spectra_from_file.
GNPS_INFO = {}
for SPECTRUM in tqdm(ALL_GNPS_NP, total=len(ALL_GNPS_NP)):
    METADATA = SPECTRUM.metadata
    # Only positive-mode spectra are kept; spectra without an ion mode are skipped.
    if METADATA.get('ionmode') != 'positive':
        continue

    PEAKs = np.column_stack((SPECTRUM.mz, SPECTRUM.intensities))
    CCMSID = METADATA['spectrum_id']
    GNPS_INFO[CCMSID] = {
        # Required fields: a missing key raises KeyError so broken records are noticed.
        'pepmass': METADATA['precursor_mz'],
        'charge': METADATA['charge'],
        'ionmode': METADATA['ionmode'],
        # Structure annotations are optional; fall back to '' when absent.
        'smiles': METADATA.get('smiles', ''),
        'inchi': METADATA.get('inchi', ''),
        'ms2': PEAKs.tolist(),
    }

In [11]:
# Experimental MS1 library: one row per CCMSID with its precursor m/z and structure strings
data_list_comprehension = [
    {'CCMSID': ccmsid, **{field: info.get(field) for field in ('pepmass', 'smiles', 'inchi')}}
    for ccmsid, info in GNPS_INFO.items()
]

# Build the table and write it next to the input library (edbMS1.csv)
df_comprehension = pd.DataFrame(data_list_comprehension)
df_comprehension_FILE = os.path.join(DIR, 'edbMS1.csv')
df_comprehension.to_csv(df_comprehension_FILE)

In [12]:
# Experimental MS2 library: dump the complete GNPS_INFO dict (metadata + peaks) to edb_info.json
GNPS_INFO_FILE = os.path.join(DIR, 'edb_info.json')
with open(GNPS_INFO_FILE, "w") as f:
    json.dump(GNPS_INFO, fp=f)

## In-silico library

In [5]:
# The in-silico library was generated with the script "../src/ISDB_generate.py":
#   python ../src/ISDB_generate.py
# Afterwards, move the generated isdbMS1.csv and isdb_info.json to ../msdb/

In [None]:
# Load the in-silico library and convert it to the Flash format
IS_LIBRARY_FILE = '../msdb/isdb_info.json'
with open(IS_LIBRARY_FILE, 'r') as f:
    IS_INFO = json.load(f)

# Three in-silico libraries are generated, one per predicted energy level
# (keys energy0_ms2 / energy1_ms2 / energy2_ms2, predicted at 10, 20 and 40 eV).
ENERGY_LEVELS = (0, 1, 2)
FS_IS_LIBRARIES = {level: [] for level in ENERGY_LEVELS}
for key, values in tqdm(IS_INFO.items(), total=len(IS_INFO)):
    PM = float(values['pepmass'])
    for level in ENERGY_LEVELS:
        SPEC_STR = values[f'energy{level}_ms2']
        FS_IS_LIBRARIES[level].append({
            "id": key,
            "precursor_mz": PM,
            "peaks": functions_new.spec_str2array(SPEC_STR, PM).tolist(),
            "smile": values['smiles'],
            "charge": 1,
            "ion_mode": 'Positive'
        })

# Original per-energy names are kept for any later cell that refers to them
FS_IS_LIBRARY_e0, FS_IS_LIBRARY_e1, FS_IS_LIBRARY_e2 = (
    FS_IS_LIBRARIES[level] for level in ENERGY_LEVELS
)
FS_GNPS_LIBRARY_OUTPUT0, FS_GNPS_LIBRARY_OUTPUT1, FS_GNPS_LIBRARY_OUTPUT2 = (
    f'../msdb/FS_isdb_e{level}.json' for level in ENERGY_LEVELS
)

# Write one JSON file per energy level: FS_isdb_e0.json, FS_isdb_e1.json, FS_isdb_e2.json
for level, OUTPUT_FILE in zip(
    ENERGY_LEVELS,
    (FS_GNPS_LIBRARY_OUTPUT0, FS_GNPS_LIBRARY_OUTPUT1, FS_GNPS_LIBRARY_OUTPUT2),
):
    with open(OUTPUT_FILE, "w") as f:
        json.dump(FS_IS_LIBRARIES[level], f)

# Create data for comparative analysis of spectral similarity distribution
* Data for Figure 3H-I
* Generate as '.npy' format
  
Because the spectra have different adduct types, originate from redundant structures, and were acquired on different instrument platforms,
it is necessary to remove these influences before evaluating the spectral similarity matrices.

In [3]:
# Load GNPS-LIBRARY.mgf (downloaded from GNPS).
# The MGF file is too large for GitHub; download it from https://external.gnps2.org/gnpslibrary
# The dataset is updated continuously, so filtering results may differ slightly between downloads.
DIR = '../msdb/GNPSLIBRARY/'
GNPS_LIBRARY_file = os.path.join(DIR, 'GNPS-LIBRARY.mgf')
GNPS_LIBRARY = functions_new.load_spectra_from_file(GNPS_LIBRARY_file)

In [None]:
t = time.time()
# Filtering CCMSIDs by instrument and adduct type:
# keep spectra whose compound name contains 'M+H' and whose instrument type ends with 'qtof'.
# filtered_IDs and SMILES are parallel lists (same index = same spectrum).
filtered_IDs, ADDUCTs, INSTRUMENTs, SMILES = [], [], [], []
PATTERN = re.compile(r'qtof$', re.IGNORECASE)
for SPECTRUM in tqdm(GNPS_LIBRARY, total=len(GNPS_LIBRARY)):
    METADATA = SPECTRUM.metadata
    NAME = METADATA["compound_name"]
    ADDUCTs.append(NAME)
    INSTRUMENT = METADATA["instrument_type"]
    INSTRUMENTs.append(INSTRUMENT)

    if 'M+H' in NAME and PATTERN.search(INSTRUMENT):
        CCMSID = METADATA.get('spectrum_id')
        if CCMSID is None:
            # No identifier to record for this spectrum
            continue
        # The per-spectrum value gets its own name (SMILE): rebinding SMILES here
        # would replace the list with a string and corrupt the de-duplication below.
        SMILE = METADATA.get('smiles', '')
        filtered_IDs.append(CCMSID)
        SMILES.append(SMILE)

# Remove duplicate structures: keep the first CCMSID seen for each canonical SMILES
nonredundant_SMILES = []
nonredundant_IDs = []
seen_SMILES = set()  # O(1) membership test; the lists above preserve first-seen order
for ID, SMILE in tqdm(zip(filtered_IDs, SMILES), total=len(filtered_IDs)):
    if not SMILE:
        # Spectrum without a structure annotation
        continue
    try:
        SMILE1 = Chem.CanonSmiles(SMILE)
    except Exception:
        # RDKit could not canonicalise this SMILES; skip the spectrum
        continue
    # Same round-trip validity check as before: the canonical form must parse again
    if Chem.MolFromSmiles(SMILE1) is None:
        continue
    if SMILE1 not in seen_SMILES:
        seen_SMILES.add(SMILE1)
        nonredundant_SMILES.append(SMILE1)
        nonredundant_IDs.append(ID)

# # Saving idx-CCMSIDs (left disabled as in the original notebook; enable to regenerate the ID list)
# np.save('../msdb/data/hqtof/idlist/H_qtof_non-redundant_CCMSIDs.npy',nonredundant_IDs)

# Loading CCMSIDs
CCMSIDs = np.load('../msdb/data/hqtof/idlist/H_qtof_non-redundant_CCMSIDs.npy').tolist()
print(len(CCMSIDs))

In [None]:
# Create hqtof spectral file: Flash-format records for the non-redundant [M+H] qTOF CCMSIDs
GNPS_SPECTRA = functions_new.load_spectra_from_file('../msdb/GNPSLIBRARY/GNPS-LIBRARY.mgf')
Hqtof_CCMSIDs = list(np.load('../msdb/data/hqtof/idlist/H_qtof_non-redundant_CCMSIDs.npy'))
# Set of plain-str IDs so the per-spectrum membership test is O(1) instead of a list scan
Hqtof_CCMSID_SET = {str(ccmsid) for ccmsid in Hqtof_CCMSIDs}

# NOTE(review): this cell reads GNPS_INFO[CCMSID]['CANONSMILES'] and
# GNPS_INFO[CCMSID]['np_classifier_superclass']. The GNPS_INFO dict built in the
# "Access information" cell only stores pepmass/charge/ionmode/smiles/inchi/ms2, so a
# GNPS_INFO carrying these keys must be loaded from elsewhere before running this cell
# (otherwise a KeyError is raised) — confirm the intended source.
FS_SPECTRA = []
for SPECTRUM in tqdm(GNPS_SPECTRA, total=len(GNPS_SPECTRA)):
    CCMSID = SPECTRUM.metadata['spectrum_id']
    if CCMSID not in Hqtof_CCMSID_SET:
        continue

    PM = SPECTRUM.metadata['precursor_mz']
    IONMODE = SPECTRUM.metadata['ionmode']
    try:
        SMILE = SPECTRUM.metadata['smiles']
    except KeyError:
        # Spectrum has no SMILES of its own: fall back to the annotation table
        SMILE = GNPS_INFO[CCMSID]['CANONSMILES']

    CHARGE = SPECTRUM.metadata['charge']
    SCLASS = GNPS_INFO[CCMSID]['np_classifier_superclass']

    peaks = np.column_stack((SPECTRUM.mz, SPECTRUM.intensities))
    FS_SPECTRA.append({
        "id": CCMSID,
        "precursor_mz": PM,
        "peaks": peaks.tolist(),
        "smile": SMILE,
        "charge": CHARGE,
        "ion_mode": IONMODE,
        'superclass': [SCLASS]
        })

# Save, ordered by ascending precursor m/z
sorted_list = sorted(FS_SPECTRA, key=lambda x: x['precursor_mz'])
FS_GNPS_LIBRARY_OUTPUT = '../msdb/data/hqtof/FS_hqtof.json'
with open(FS_GNPS_LIBRARY_OUTPUT, "w") as f:
    json.dump(sorted_list, f)

In [None]:
# Read the hqtof Flash-format spectra back from disk and index them for entropy search
HQTOF = functions_new.json_load('../msdb/data/hqtof/FS_hqtof.json')
hqtof_search = FlashEntropySearch()
# build_index returns the spectra as prepared by the search object
# (described in the original notebook as pre-cleaned and sorted)
FS_SPECTRA = hqtof_search.build_index(HQTOF)

# Statistics

## ALL_GNPS_NO_PROPOGATED

In [None]:
# Load the GNPS library file as a list of spectra and report how long loading took
# (note: from here on GNPS_INFO is a list of spectrum objects)
GNPS_FILE = '../msdb/GNPSLIBRARY/ALL_GNPS_NO_PROPOGATED.mgf'
t = time.time()
GNPS_INFO = list(functions_new.load_spectra_from_file(GNPS_FILE))
print(f'Finished in {(time.time() - t) / 60:.2f} min')

In [None]:
# Counts over all spectra:
#   PMs              - precursor m/z of every spectrum
#   N_compound_name  - spectra whose compound name contains 'genistein' (case-insensitive)
#   N_withoutSMILE   - spectra with a missing, empty or RDKit-unparsable SMILES
N_withoutSMILE, N_compound_name = 0, 0
PMs = []
for INFO in tqdm(GNPS_INFO, total=len(GNPS_INFO)):
    PM = INFO.metadata['precursor_mz']
    PMs.append(PM)

    NAME = INFO.metadata['compound_name']
    if 'genistein' in NAME.lower():
        N_compound_name += 1

    # Chem.MolFromSmiles returns None for an unparsable SMILES instead of raising,
    # so the result has to be checked explicitly; a try/except alone only catches
    # spectra where the 'smiles' key is absent.
    SMILE = INFO.metadata.get('smiles')
    if not SMILE or Chem.MolFromSmiles(SMILE) is None:
        N_withoutSMILE += 1

In [None]:
# Tally how often each precursor m/z occurs; most_common() lists (m/z, count)
# pairs from the most to the least frequent.
PM_dict = Counter(PMs)
sorted_pms = PM_dict.most_common()
top_pm, top_count = sorted_pms[0]
print(f'Total number of spectra: {len(GNPS_INFO)}')
print(f'Number of spectra with unique precursor mass: {len(PM_dict)}')
print(f'Number of spectra without SMILE: {N_withoutSMILE}')
print(f'There are {top_count} spectra with the same precursor m/z of {top_pm}')