# In [None]:
"""
Input file - please change for each case study

Valinomycin = ../data/valinomycin_MSMS_from_mestrenova.csv
Surfactin B = ../data/surfactinB_MSMS_from_mestrenova.csv
Neomycin B = ../data/neomycinB_MSMS_from_mestrenova.csv
"""

# Path of the CSV that is read with pd.read_csv further down; the output file name
# is derived from it by replacing the trailing ".csv" with "_processed.csv".
# NOTE(review): the paths listed in the string above live under ../data/, but this
# value has no directory part, so it resolves against the current working
# directory - confirm which is intended.
input_file = "surfactinB_MSMS_from_mestrenova.csv"
molecule = "surfactinB" # key into mol_dict: neomycinB, surfactinB or valinomycin

"""
Packages and functions
"""
import os
import json
import re
import glob
import numpy as np
import pandas as pd
from collections import Counter
from rdkit import Chem as Ch
from rdkit.Chem import rdMolDescriptors
import itertools
from scipy.stats import multinomial
import sys
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import concurrent.futures
import cairosvg
import random

"""
Fixed Data / Parameters
"""

# Lookup table of adduct types,
# in the form {'adduct type': [number of M, adduct monoisotopic mass, charge]}
# NOTE(review): not referenced by any code visible in this section; presumably used
# elsewhere to convert between neutral mass and precursor m/z - confirm.
adduct_reference = {'M+H': [1, 1.0072764167, 1],
 'M+Na': [1, 22.9892214167, 1],
 'M+2H': [1, 2.0145528334, 2],
 '2M+H': [2, 1.0072764167, 1],
 'M+NH4': [1, 18.0338254167, 1],
 'M+K': [1, 38.963159416699995, 1],
 'M+H+K': [1, 39.970435833399996, 2],
 'M+H+Na': [1, 23.9964978334, 2],
 'M-2H+3Na': [1, 66.95311141670001, 1],
 'M+2Na': [1, 45.9784428334, 2],
 'M+Fe': [1, 55.9338418334, 2],
 '3M+H': [3, 1.0072764167, 1],
 '2M+Na': [2, 22.9892214167, 1],
 'M+2K': [1, 77.92631883339999, 2],
 'M+2Na-H': [1, 44.9711664167, 1],
 'M+Li': [1, 7.0154564167, 1],
 'M-H+2K': [1, 76.9190424167, 1],
 'M+H+O': [1, 17.0021914167, 1],
 'M+H+H2O': [1, 19.0178414167, 1]}

# Scoring parameters used when ranking library candidates for each spectrum.
# The three weights below sum to 1.0.
mass_filter = 0.005  # precursor m/z window for selecting candidates; also the scale of the mass-defect score
ms_match_weight = 0.5  # weight of the MS2 cosine similarity in the combined score
iso_match_weight = 0.2  # weight of the isotope-distribution similarity in the combined score
mass_defect_match_weight = 0.3  # weight of the precursor m/z closeness in the combined score
remove_top = True  # forwarded to iso_cosine_similarity: drop the most intense peak before comparing

# SMILES strings of the case-study molecules; they are compared for equality
# against the 'SMILES' column of the reference library.
neomycinB = 'NCC1OC(OC2C(CO)OC(OC3C(O)C(N)CC(N)C3OC3OC(CN)C(O)C(O)C3N)C2O)C(N)C(O)C1O'
surfactinB = 'CC(C)CCCCCCCCC1CC(=O)NC(CCC(=O)O)C(=O)NC(CC(C)C)C(=O)NC(CC(C)C)C(=O)NC(C(C)C)C(=O)NC(CC(=O)O)C(=O)NC(CC(C)C)C(=O)NC(CC(C)C)C(=O)O1'
valinomycin = 'CC1OC(=O)C(C(C)C)NC(=O)C(C(C)C)OC(=O)C(C(C)C)NC(=O)C(C)OC(=O)C(C(C)C)NC(=O)C(C(C)C)OC(=O)C(C(C)C)NC(=O)C(C)OC(=O)C(C(C)C)NC(=O)C(C(C)C)OC(=O)C(C(C)C)NC1=O'

# Maps the value of `molecule` to its SMILES string.
mol_dict = {'neomycinB':neomycinB,'surfactinB':surfactinB,'valinomycin':valinomycin}

"""
Functions for MS2 comparison
"""

def roundms(x, rounding_factor=1):
    """Bin a spectrum by rounded m/z, sum the intensities per bin and rescale.

    Parameters
    ----------
    x : iterable of [m/z, intensity] pairs
        The spectrum to bin.
    rounding_factor : number, optional
        Width of the m/z bins; each m/z is rounded to the nearest multiple of
        this value. The default of 1 rounds to the nearest integer (using
        Python's ``round``, i.e. ties go to the even integer).

    Returns
    -------
    list of [rounded m/z, relative intensity]
        One entry per bin in first-seen order. Intensities are expressed as a
        percentage of the largest bin, rounded to 2 decimals; bins whose
        rounded percentage is not above 0 are dropped. An empty list is
        returned when ``x`` is empty or the largest summed intensity is 0.

    Raises
    ------
    ValueError
        If ``rounding_factor`` is not positive.
    """
    if rounding_factor <= 0:
        raise ValueError("rounding_factor must be positive")

    scale = 1 / rounding_factor

    # Sum the intensities of all peaks that fall into the same rounded m/z bin.
    binned = {}
    for peak in x:
        rounded_mz = round(peak[0] * scale) / scale
        binned[rounded_mz] = binned.get(rounded_mz, 0) + peak[1]

    # Nothing to scale: avoid max() on an empty sequence.
    if not binned:
        return []

    max_value = max(binned.values())
    # A spectrum without any signal cannot be normalised; avoid dividing by zero.
    if max_value == 0:
        return []

    scaled = []
    for rounded_mz, total in binned.items():
        relative = round((total * 100) / max_value, 2)
        if relative > 0:
            scaled.append([rounded_mz, relative])
    return scaled

def _mz_and_first_intensity(spectrum):
    """Return (list of m/z values, {m/z: intensity of its first occurrence})."""
    mz_values = []
    first_intensity = {}
    for peak in spectrum:
        mz_values.append(peak[0])
        # setdefault keeps the first intensity seen for a repeated m/z.
        first_intensity.setdefault(peak[0], peak[1])
    return mz_values, first_intensity


def ms_cosine_similarity(x,y):
    """Weighted cosine similarity between two spectra.

    Each peak contributes the weight ``m/z * intensity ** 0.5``. Peaks are
    paired by exact equality of their m/z values, so both spectra need to be
    on the same m/z grid (for example, rounded the same way) to match.

    Parameters
    ----------
    x, y : iterable of [m/z, intensity] pairs
        The two spectra to compare. If an m/z value occurs more than once in a
        spectrum, the intensity of its first occurrence is used for every
        occurrence.

    Returns
    -------
    number
        The similarity, or 0 when either spectrum has a zero norm (which
        includes an empty spectrum).
    """
    m1, i1_by_mz = _mz_and_first_intensity(x)
    m2, i2_by_mz = _mz_and_first_intensity(y)

    # Dictionary lookups replace repeated list.index() scans.
    common = set(m1).intersection(set(m2))

    # Numerator: product of the weights of the peaks present in both spectra.
    sumtop = 0
    for k in common:
        sumtop = sumtop + (k * (i1_by_mz[k] ** 0.5)) * (k * (i2_by_mz[k] ** 0.5))

    # Squared norms of the two weighted spectra.
    sumleft = 0
    for k in m1:
        sumleft = sumleft + (k * i1_by_mz[k] ** 0.5) ** 2
    sumright = 0
    for k in m2:
        sumright = sumright + (k * i2_by_mz[k] ** 0.5) ** 2

    if sumleft == 0 or sumright == 0:
        return 0
    return sumtop / ((sumleft * sumright) ** 0.5)

def iso_cosine_similarity(calc_I , exp_I , remove_top = True):
    """Cosine similarity between two index-aligned intensity distributions.

    Parameters
    ----------
    calc_I : sequence of numbers
        Calculated intensities; truncated to the length of ``exp_I``.
    exp_I : sequence of numbers
        Experimental intensities.
    remove_top : bool, optional
        When true, the largest entry of ``exp_I`` and the entry at the same
        index in ``calc_I`` are dropped before comparing.

    Returns
    -------
    number
        The cosine similarity as a float. 0 is returned when ``exp_I`` has at
        most one non-zero entry (this includes an empty ``exp_I``), when the
        computation fails (for example because the lengths differ after
        truncation) or when the result is NaN.
    """
    # Work on plain lists so that tuples and NumPy arrays are accepted as well;
    # slicing and "+" below must mean concatenation, not element-wise addition.
    exp_I = list(exp_I)
    calc_I = list(calc_I)[:len(exp_I)]

    # At least two non-zero experimental peaks are required. Checking this
    # first also keeps max() below from being called on an empty list.
    if sum(1 for num in exp_I if num != 0) <= 1:
        return 0

    # Removes the highest experimental peak and its corresponding index
    if remove_top:
        remove_index = exp_I.index(max(exp_I))
        exp_I = exp_I[:remove_index] + exp_I[remove_index+1:]
        calc_I = calc_I[:remove_index] + calc_I[remove_index+1:]

    try:
        # Rescale both distributions to a maximum of 1 before comparing.
        norm_exp_I = np.asarray(exp_I) / np.max(exp_I)
        norm_calc_I = np.asarray(calc_I) / np.max(calc_I)
        top = np.dot(norm_exp_I , norm_calc_I)
        bot = sum((norm_exp_I)**2) * sum((norm_calc_I)**2)
        result = float(top / bot**0.5)
    except (ValueError, TypeError, ArithmeticError):
        # Best effort: a comparison that cannot be computed scores 0.
        return 0

    if np.isnan(result):
        return 0

    return result

# In [None]:

"""
Loading MS/MS reference libraries
"""
# Reference library; the columns read below are 'PrecursorMZ',
# 'Mass_Spec_Rounded', 'Isotope_Prob' and 'SMILES'.
lib = pd.read_parquet('MSMS_reference_library.parquet')

# Per-spectrum records collected while looping over the input file.
all_rt = []
all_adduct = []
all_abundance = []
all_precursormz = []
all_ms = []
all_iso = []
all_predicted_smiles = []
all_predicted_ms = []
all_scores = []
all_ranks = []

# SMILES of the molecule whose score and rank are reported for every spectrum.
target_smiles = mol_dict[molecule]
# Wider precursor m/z window (Da) used when nothing falls within mass_filter.
fallback_mass_filter = 0.02

df = pd.read_csv(input_file)
for i, row in df.iterrows():
    retention_time = row['Retention Time (min)']
    precursor_mz = row['Precursor m/z']
    adduct = row['Adduct Type']
    # SECURITY NOTE: eval() executes whatever expression the CSV cell contains,
    # so only run this on trusted input files. If the cells are always plain
    # Python literals, ast.literal_eval would be a safe replacement - confirm.
    iso = eval(row['Expt Isotope Distribution'])
    ms = eval(row['Expt MS2 Spectra'])
    abundance = row['MS1 Abundance']

    #add new part to export list of metabolites from sample
    all_rt.append(retention_time)
    all_adduct.append(adduct)
    all_abundance.append(abundance)
    all_precursormz.append(precursor_mz)
    all_ms.append(ms)
    all_iso.append(iso)

    # Candidate library entries whose precursor m/z lies within the mass filter.
    mz_error = (lib['PrecursorMZ'] - precursor_mz).abs()
    df_temp = lib[mz_error <= mass_filter].copy(deep=True).reset_index(drop=True)

    #increase to 0.02 Da if no hits
    if df_temp.empty:
        df_temp = lib[mz_error <= fallback_mass_filter].copy(deep=True).reset_index(drop=True)

    if not df_temp.empty:
        #calculate the match scores and combined score
        df_temp['ms_match'] = df_temp['Mass_Spec_Rounded'].apply(lambda x: ms_cosine_similarity(ms, x))
        if len(iso) <= 2: #if insufficient isotopic distribution data then set all to 0
            df_temp['iso_match'] = 0
        else:
            # Only similarities above 0.999 score; that range is stretched to 0..1.
            # NOTE(review): the experimental distribution is passed as the first
            # argument (named calc_I) and the library one as the second (exp_I),
            # so the removed "top" peak is chosen from the library distribution -
            # confirm this is intended.
            df_temp['iso_match'] = df_temp['Isotope_Prob'].apply(
                lambda x: max((iso_cosine_similarity(iso, list(x)[:len(iso)],
                                                     remove_top=remove_top)
                               - 0.999)*1000, 0))
        # 1 for an exact precursor m/z match, falling linearly to 0 at mass_filter
        # (so every candidate found through the wider fallback window scores 0).
        df_temp['mass_defect_match'] = df_temp['PrecursorMZ'].apply(
            lambda x: max((mass_filter-abs(x-precursor_mz)),0)/mass_filter)
        # Weighted sum of the three scores; a missing score contributes 0.
        df_temp['combined_score'] = (
            df_temp['ms_match'].fillna(0)*ms_match_weight
            + df_temp['iso_match'].fillna(0)*iso_match_weight
            + df_temp['mass_defect_match'].fillna(0)*mass_defect_match_weight
        )
        df_temp['rank'] = df_temp['combined_score'].rank(ascending=False, method='min')
        df_temp = df_temp.sort_values(by='rank')

    # Rows of the target molecule among the candidates, best rank first.
    target_rows = df_temp[df_temp['SMILES'] == target_smiles]
    all_predicted_smiles.append(target_smiles)
    if target_rows.empty:
        # The target is not among the candidates for this spectrum: record
        # missing values instead of aborting the whole run.
        all_predicted_ms.append(None)
        all_scores.append(float('nan'))
        all_ranks.append(float('nan'))
    else:
        # If the library holds several entries for the target, report the
        # best-ranked one.
        all_predicted_ms.append(target_rows['Mass_Spec_Rounded'].iloc[0])
        all_scores.append(target_rows['combined_score'].iloc[0])
        all_ranks.append(target_rows['rank'].iloc[0])

    # un-comment to see top10 predicted structures for each MS/MS spectra
    #df_temp.to_json(str(row['sample']) + '_' + str(retention_time) + 'min_' + str(round(precursor_mz,4)) + 'mz_' + str(adduct) + '_Top10.json')

    del df_temp

df['Predicted SMILES'] = all_predicted_smiles
df['Predicted MS2 Spectra'] = all_predicted_ms
df['WISE Score'] = all_scores
df['Rank'] = all_ranks
# Written next to the input file: "<input name without extension>_processed.csv".
df.to_csv(os.path.splitext(input_file)[0] + '_processed.csv',index=False)