# Minimum dependencies

In [25]:
# All required imports
import numba
from typing import Tuple, List
from matchms import Spectrum
from matchms.typing import SpectrumType
import numpy as np
import pandas as pd

# Functions that are needed for the example

In [26]:
from matchms import Spectrum

from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def process_spectrum(spectrum):
    """Run the notebook's fixed matchms filter chain on one spectrum.

    The filters are applied in this order, each receiving the previous result:

    1. ``select_by_mz`` with ``mz_from=10.0`` and ``mz_to=1000.0``
    2. ``normalize_intensities``
    3. ``select_by_relative_intensity`` with ``intensity_from=0.001``
    4. ``reduce_to_number_of_peaks`` with ``n_max=1000``
    5. ``require_minimum_number_of_peaks`` with ``n_required=5``

    Returns the output of the last filter. Callers in this notebook compare the
    result against ``None`` -- presumably that is what matchms hands back for a
    spectrum that fails a requirement (confirm against the matchms docs).
    """
    filter_steps = (
        lambda s: select_by_mz(s, mz_from=10.0, mz_to=1000.0),
        normalize_intensities,
        lambda s: select_by_relative_intensity(s, intensity_from=0.001),
        lambda s: reduce_to_number_of_peaks(s, n_max=1000),
        lambda s: require_minimum_number_of_peaks(s, n_required=5),
    )
    for apply_filter in filter_steps:
        spectrum = apply_filter(spectrum)
    return spectrum

In [30]:
from pathlib import Path
import json

import os

# Location of the cached reference-spectra CSV. Set the REF_SPECTRA_CSV
# environment variable to point at your own copy; when it is unset the original
# author's local path is used, so existing behaviour is unchanged.
ref_spectra_df_path = Path(
    os.environ.get(
        "REF_SPECTRA_CSV",
        "/Users/yoanngloaguen/Desktop/example_dataset_tornike.csv",
    )
)
if ref_spectra_df_path.exists():
    ref_spectra_df = pd.read_csv(ref_spectra_df_path)
else:
    # NOTE(review): get_reference_spectra is not defined in the cells shown in
    # this notebook, so on a fresh kernel without the cached CSV this branch
    # raises NameError -- define/import it first, or provide the CSV.
    ref_spectra_df = get_reference_spectra(0)
    # Make sure the target folder exists before caching the dataframe.
    ref_spectra_df_path.parent.mkdir(parents=True, exist_ok=True)
    ref_spectra_df.to_csv(ref_spectra_df_path, index=False)

def get_ref_spectra_from_df(spectra_df):
    """Convert a dataframe of reference spectra into a list of matchms spectra.

    Parameters
    ----------
    spectra_df:
        Dataframe with one spectrum per row. The columns read are ``pbid``,
        ``precursor_mz``, ``pb_smiles``, ``pb_inchikey``, ``peaks_mz`` and
        ``peaks_intensities``. The two peak columns are parsed with
        ``json.loads``, i.e. they hold JSON-encoded lists of numbers.

    Returns
    -------
    list
        The spectra, in row order, after ``process_spectrum``. Rows for which
        ``process_spectrum`` returns ``None`` are skipped.
    """
    spectra = []
    for _, row in spectra_df.iterrows():
        # Force float arrays: a JSON list that holds only integers would
        # otherwise become an integer array, while matchms expects float
        # peak arrays (the hand-made examples in this notebook use "200." for
        # the same reason).
        mz_array = np.array(json.loads(row["peaks_mz"]), dtype=float)
        intensity_array = np.array(json.loads(row["peaks_intensities"]), dtype=float)
        sp = Spectrum(mz=mz_array, intensities=intensity_array,
                      metadata={'id': row["pbid"],
                                'precursor_mz': row["precursor_mz"],
                                'smiles': row["pb_smiles"],
                                'inchikey': row["pb_inchikey"]})
        sp = process_spectrum(sp)
        if sp is not None:
            spectra.append(sp)
    return spectra

In [None]:
# This is the "most" time-consuming part of the code.

@numba.njit(fastmath=True)
def score_best_matches(matching_pairs: np.ndarray, spec1: np.ndarray,
                       spec2: np.ndarray, mz_power: float = 0.0,
                       intensity_power: float = 1.0) -> Tuple[float, int]:
    """Greedy cosine-like score from a list of candidate peak pairs.

    Rows of ``matching_pairs`` are visited in the order given; a row is accepted
    only if neither of its two peak indices (columns 0 and 1) has been accepted
    before, and the value in column 2 of every accepted row is summed. The rows
    must therefore already be sorted by intensity product for the greedy
    selection to pick the best pairs.

    The sum is normalized by the product of the Euclidean norms of
    ``column0 ** mz_power * column1 ** intensity_power`` taken over all rows of
    ``spec1`` and of ``spec2``.

    Returns
    -------
    (score, used_matches)
        The normalized score and the number of accepted pairs.
    """
    total = float(0.0)
    n_used = int(0)
    taken1 = set()
    taken2 = set()
    for row in range(matching_pairs.shape[0]):
        peak1 = matching_pairs[row, 0]
        peak2 = matching_pairs[row, 1]
        # Every peak can only be paired once.
        if peak1 in taken1 or peak2 in taken2:
            continue
        total += matching_pairs[row, 2]
        taken1.add(peak1)
        taken2.add(peak2)
        n_used += 1

    # Normalization terms, one per spectrum.
    weights1 = spec1[:, 0] ** mz_power * spec1[:, 1] ** intensity_power
    weights2 = spec2[:, 0] ** mz_power * spec2[:, 1] ** intensity_power
    norm1 = np.sum(weights1 ** 2) ** 0.5
    norm2 = np.sum(weights2 ** 2) ** 0.5

    return total / (norm1 * norm2), n_used

In [None]:
@numba.njit
def collect_peak_pairs(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    # pylint: disable=too-many-arguments
    """Collect all candidate peak pairs between two spectra.

    Args
    ----
    spec1:
        First spectrum as a 2D numpy array; column 0 is passed to
        ``find_matches`` as the m/z values and column 1 is used as intensity.
    spec2:
        Second spectrum, same layout as ``spec1``.
    tolerance
        Peaks are considered a match when they are <= tolerance apart.
    shift
        Shift applied to the peaks of ``spec2`` before matching. Default is 0.
    mz_power:
        Power the m/z column is raised to. With the default of 0 the products
        do not depend on m/z.
    intensity_power:
        Power the intensity column is raised to. Default is 1.

    Returns
    -------
    matching_pairs : numpy array or None
        ``None`` when no peaks match. Otherwise one row per matching pair,
        ``[index in spec1, index in spec2, weight1 * weight2]``, in the order
        returned by ``find_matches`` (the rows are NOT sorted by product here).
    """
    matches = find_matches(spec1[:, 0], spec2[:, 0], tolerance, shift)
    if len(matches) == 0:
        return None

    rows = []
    for k in range(len(matches)):
        peak1_idx, peak2_idx = matches[k]
        weight1 = (spec1[peak1_idx, 0] ** mz_power) * (spec1[peak1_idx, 1] ** intensity_power)
        weight2 = (spec2[peak2_idx, 0] ** mz_power) * (spec2[peak2_idx, 1] ** intensity_power)
        rows.append([peak1_idx, peak2_idx, weight1 * weight2])
    return np.array(rows.copy())


@numba.njit
def find_matches(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    """Find all index pairs of peaks that lie within ``tolerance`` of each other.

    Both inputs must hold m/z values ordered from low to high. That ordering
    lets the scan over ``spec2_mz`` stop at the first peak above the window and
    lets later peaks of ``spec1_mz`` skip peaks already known to be too low.

    Parameters
    ----------
    spec1_mz:
        Ordered peak m/z values of the first spectrum (numpy array).
    spec2_mz:
        Ordered peak m/z values of the second spectrum (numpy array).
    tolerance
        Peaks are considered a match when they are <= tolerance apart.
    shift
        Value added to every peak of the second spectrum. Default is 0.

    Returns
    -------
    matches
        List of ``(idx1, idx2)`` tuples, ordered by ``idx1`` and then ``idx2``.

    """
    pairs = []
    first_candidate = 0
    n_peaks2 = spec2_mz.shape[0]
    for peak1_idx in range(spec1_mz.shape[0]):
        center = spec1_mz[peak1_idx]
        lower = center - tolerance
        upper = center + tolerance
        for peak2_idx in range(first_candidate, n_peaks2):
            shifted_mz = spec2_mz[peak2_idx] + shift
            if shifted_mz > upper:
                # Everything further right is even larger: stop scanning.
                break
            if shifted_mz < lower:
                # Too low for this peak; remember where the next scan may start.
                first_candidate = peak2_idx
                continue
            pairs.append((peak1_idx, peak2_idx))
    return pairs

# Example use case

Here I am intentionally creating different use cases that we might encounter in prod:

    - Spectra of different lengths

    - Spectra containing several very close m/z values that fall within the tolerance window (spectrum 4: m/z 100 and m/z 100.001)
    
    - This is not covered here, but we could also have a different number of spectra in references and queries. This will actually always happen in prod.

In [31]:
# Four tiny hand-made spectra (m/z values with matching intensities).
spectrum_1 = Spectrum(
    mz=np.array([100.0, 150.0, 200.0, 203.0, 234.0]),
    intensities=np.array([0.7, 0.2, 0.1, 0.1, 0.1]),
    metadata={'id': 'spectrum1'},
)
spectrum_2 = Spectrum(
    mz=np.array([100.0, 140.0, 190.0, 210.0]),
    intensities=np.array([0.4, 0.2, 0.1, 0.1]),
    metadata={'id': 'spectrum2'},
)
spectrum_3 = Spectrum(
    mz=np.array([110.0, 140.0, 195.0]),
    intensities=np.array([0.6, 0.2, 0.1]),
    metadata={'id': 'spectrum3'},
)
# Two peaks only 0.001 apart, so both can fall inside one tolerance window.
spectrum_4 = Spectrum(
    mz=np.array([100.0, 100.001, 150.0, 200.0]),
    intensities=np.array([0.6, 0.1, 0.3, 0.6]),
    metadata={'id': 'spectrum4'},
)

# Small example: two references scored against two queries.
references = [spectrum_1, spectrum_3]
queries = [spectrum_2, spectrum_4]

# Larger set built from the first 100000 rows of the reference dataframe.
large_references = get_ref_spectra_from_df(ref_spectra_df.head(100000))



# Running the pipeline

In [32]:
# In the matchms library, the CosineGreedy function and Score object would take care of this for loop
# There may be some extra data prep and filter as well but we can ignore that for now
# and reorganise the data to return it as one single output.
# Here is an example of how it would work by default:

# Start example ------------

# import numpy as np
# from matchms.similarity import CosineGreedy
# from matchms import calculate_scores

# similarity_measure = CosineGreedy()
# scores = calculate_scores(references, queries, similarity_measure)

# This is only for printing the results
# for (reference, query, score) in scores:
#     print(f"Cosine score between {reference.get('id')} and {query.get('id')}" +
#           f" is {score[0]:.2f} with {score[1]} matched peaks")

# End example --------------

# Here I just break it down so you see what is happening behind the scenes
# It is basically three function calls (the first two will collect the matching pairs, the third will score them - including normalization)
# The time bottleneck is on the last function call (score_best_matches), as it operates on single spectra pairs
# Storing the output of matching pairs in some dataframe (tensor) and then scoring them in batches would be much faster
# So it is really the last function that needs to be changed
# I managed to create a batch version of the first two functions but that did not hold any speedup
# Plus it only worked on specific case - multiple spectra in the reference against a single spectrum in the query (not multiple vs multiple)

# I am setting the mz_tolerance to 0.1, as this is probably the value we will use by default
# That being said, in this exercise it is just to cover all potential cases where we can have a peak from one spectrum matching several peaks in the other
# Only one (the first) is then selected in the scoring function 

# The loop variables are deliberately NOT named spectrum_1 / spectrum_2: reusing
# those names would overwrite the example spectra of the same name and silently
# change any later cell that builds a list from them.
for reference in references:
    spec1 = reference.peaks.to_numpy
    for query in queries:
        spec2 = query.peaks.to_numpy
        matching_pairs = collect_peak_pairs(spec1, spec2, 0.1,
                                            shift=0.0, mz_power=0.0,
                                            intensity_power=1.0)
        if matching_pairs is None:
            print("No matching pairs found")
            continue
        # score_best_matches requires the pairs sorted by intensity product
        # (column 2). Largest first, so the greedy selection keeps the strongest
        # pairing when one peak matches several peaks of the other spectrum.
        order = np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1]
        matching_pairs = matching_pairs[order, :]
        score = score_best_matches(matching_pairs, spec1, spec2,
                                   0.0, 1.0)
        print(score)


(0.7977240352174655, 1)
(0.7968798037362897, 3)
(0.13318543164240537, 1)
No matching pairs found


# Benchmarking runtime

Below is a small snippet to benchmark the runtime of the collect_peak_pairs and score_best_matches functions.
To do so, we split the approach into two parts: first collecting and storing the matching pairs, then running the score function.


In [33]:
# List to store the matching pairs along with the spectra
import time

# Very small example
references = [spectrum_1, spectrum_3]
queries = [spectrum_2, spectrum_4]

# Realistic example -- overrides the small example above; comment these two
# lines out to benchmark the small example instead.
references = large_references[1000:100000]
queries = large_references[0:10]

# --- Part 1: collect (and sort) the matching pairs -------------------------
# Loop variables are named reference / query rather than spectrum_1 /
# spectrum_2 so that the example spectra of the same name are not overwritten
# (which would corrupt the "very small example" when this cell is re-run).
start_collect_peaks = time.perf_counter()
pairs_to_score_list = []
for reference in references:
    # Build the peak array once per reference instead of once per pair.
    spec1 = reference.peaks.to_numpy
    for query in queries:
        spec2 = query.peaks.to_numpy
        matching_pairs = collect_peak_pairs(spec1, spec2, 0.1,
                                            shift=0.0, mz_power=0.0,
                                            intensity_power=1.0)
        if matching_pairs is not None:
            # score_best_matches requires the pairs sorted by intensity product
            # (column 2), largest first.
            order = np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1]
            # Store the spectra (not their arrays) to keep memory usage low.
            pairs_to_score_list.append([matching_pairs[order, :], reference, query])
end_collect_peaks = time.perf_counter()
print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

# --- Part 2: score the stored matching pairs -------------------------------
start_score = time.perf_counter()
for matching_pairs, reference, query in pairs_to_score_list:
    score = score_best_matches(matching_pairs, reference.peaks.to_numpy, query.peaks.to_numpy,
                               0.0, 1.0)

end_score = time.perf_counter()
print("Time to score matching pairs: ", end_score - start_score)




Time to collect matching pairs:  28.9017972946167
Time to score matching pairs:  10.086315155029297
