# Minimum dependencies

In [1]:
# All required imports
from joblib import Parallel, delayed
from tqdm import tqdm
import numba
from typing import Tuple, List
from matchms import Spectrum
from matchms.typing import SpectrumType
import numpy as np
import pandas as pd
from pathlib import Path
import json

# Functions that are needed for the example

In [2]:
from matchms import Spectrum

from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def process_spectrum(spectrum):
    spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    spectrum = normalize_intensities(spectrum)
    spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return spectrum

In [3]:
ref_spectra_df_path = Path("example_dataset_tornike.csv")
if ref_spectra_df_path.exists():
    ref_spectra_df = pd.read_csv(ref_spectra_df_path)
else:
    ref_spectra_df = get_reference_spectra(0)
    ref_spectra_df.to_csv(ref_spectra_df_path, index=False)

def get_ref_spectra_from_df(spectra_df):
    # This function will take a dataframe with spectra and return a list of matchms spectra
    # Argh, This function is annoyingly slow. Added simple parallelization.
    
    # for index, row in spectra_df.iterrows():
    def fn(index, row):
        pbid = row["pbid"]
        precursor_mz = row["precursor_mz"]
        smiles = row["pb_smiles"]
        inchikey = row["pb_inchikey"]
        mz_array = np.array(json.loads(row["peaks_mz"]))
        intensity_array = np.array(json.loads(row["peaks_intensities"]))
        sp = Spectrum(mz=mz_array, intensities=intensity_array,
                        metadata={'id': pbid, 
                                'precursor_mz': precursor_mz, 
                                'smiles': smiles, 
                                'inchikey': inchikey}) 
        sp = process_spectrum(sp)
        return sp
    
    spectra = Parallel(-2)(delayed(fn)(index, row) for index, row in tqdm(spectra_df.iterrows(), total=len(spectra_df)))
    spectra = [s for s in spectra if s is not None]
    return spectra

In [4]:
ref_spectra_df

Unnamed: 0,pbid,pb_smiles,pb_inchikey,precursor_mz,compound_name,peaks_mz,peaks_intensities
0,499410,Clc1ccccc1CNCCc1c[nH]c2ccccc12,ZSEPJTGBFYRCMJ-UHFFFAOYSA-N,285.1153,N-(2-Chlorobenzyl)-2-(1H-indol-3-yl)ethanamine,"[118.0654, 125.0155, 132.0811, 133.0844, 143.5...","[8.39, 29.17, 66.23, 1.9, 2.4, 999.0, 2.7, 1.1..."
1,499411,Clc1ccccc1CNCCc1c[nH]c2ccccc12,ZSEPJTGBFYRCMJ-UHFFFAOYSA-N,285.1153,N-(2-Chlorobenzyl)-2-(1H-indol-3-yl)ethanamine,"[118.0654, 125.0155, 126.0188, 132.081, 133.08...","[8.29, 54.65, 1.3, 59.94, 1.7, 3.1, 999.0, 3.2..."
2,499412,Clc1ccccc1CNCCc1c[nH]c2ccccc12,ZSEPJTGBFYRCMJ-UHFFFAOYSA-N,285.1153,N-(2-Chlorobenzyl)-2-(1H-indol-3-yl)ethanamine,"[117.0701, 118.0654, 125.0155, 126.0188, 132.0...","[1.8, 7.99, 145.75, 3.0, 53.15, 1.4, 1.6, 3.5,..."
3,499413,Clc1ccccc1CNCCc1c[nH]c2ccccc12,ZSEPJTGBFYRCMJ-UHFFFAOYSA-N,285.1153,N-(2-Chlorobenzyl)-2-(1H-indol-3-yl)ethanamine,"[117.0701, 118.0654, 125.0155, 126.0188, 132.0...","[9.39, 8.19, 289.41, 6.69, 49.35, 1.1, 5.39, 3..."
4,499414,Clc1ccccc1CNCCc1c[nH]c2ccccc12,ZSEPJTGBFYRCMJ-UHFFFAOYSA-N,285.1153,N-(2-Chlorobenzyl)-2-(1H-indol-3-yl)ethanamine,"[91.0544, 115.0545, 117.0576, 117.0702, 118.06...","[1.5, 4.4, 3.3, 41.56, 9.19, 1.2, 1.6, 422.98,..."
...,...,...,...,...,...,...,...
99996,1094297,O=c1[nH]nc(-c2ccccc2)c(=O)[nH]1,QDVLVVJHFWNNJK-UHFFFAOYSA-N,190.0611,5-Phenyl-6-azauracil,"[50.0152, 51.023, 51.0249, 52.0263, 53.0387, 5...","[24.28, 375.52, 3.9, 10.89, 17.88, 18.08, 6.29..."
99997,1094298,O=c1[nH]nc(-c2ccccc2)c(=O)[nH]1,QDVLVVJHFWNNJK-UHFFFAOYSA-N,190.0611,5-Phenyl-6-azauracil,"[50.0152, 51.023, 52.0264, 53.0386, 59.0241, 6...","[75.22, 934.36, 27.37, 15.88, 15.98, 18.98, 16..."
99998,1094299,O=c1[nH]nc(-c2ccccc2)c(=O)[nH]1,QDVLVVJHFWNNJK-UHFFFAOYSA-N,190.0611,5-Phenyl-6-azauracil,"[50.0152, 51.023, 52.0264, 52.0307, 53.0385, 5...","[113.59, 999.0, 29.17, 2.4, 8.09, 7.39, 1.7, 2..."
99999,1094300,O=c1[nH]nc(-c2ccccc2)c(=O)[nH]1,QDVLVVJHFWNNJK-UHFFFAOYSA-N,190.0611,5-Phenyl-6-azauracil,"[50.0152, 50.0164, 51.023, 52.0181, 52.0264, 5...","[191.51, 4.5, 999.0, 1.8, 30.47, 5.0, 2.2, 1.6..."


In [5]:
# This the "most" time consuming part of the code. 

@numba.njit(fastmath=True)
def score_best_matches(matching_pairs: np.ndarray, spec1: np.ndarray,
                       spec2: np.ndarray, mz_power: float = 0.0,
                       intensity_power: float = 1.0) -> Tuple[float, int]:
    """Calculate cosine-like score by multiplying matches. Does require a sorted
    list of matching peaks (sorted by intensity product)."""
    score = float(0.0)
    used_matches = int(0)
    used1 = set()
    used2 = set()
    for i in range(matching_pairs.shape[0]):
        if not matching_pairs[i, 0] in used1 and not matching_pairs[i, 1] in used2:
            score += matching_pairs[i, 2]
            used1.add(matching_pairs[i, 0])  # Every peak can only be paired once
            used2.add(matching_pairs[i, 1])  # Every peak can only be paired once
            used_matches += 1

    # Normalize score:
    spec1_power = spec1[:, 0] ** mz_power * spec1[:, 1] ** intensity_power
    spec2_power = spec2[:, 0] ** mz_power * spec2[:, 1] ** intensity_power


    score = score/(np.sum(spec1_power ** 2) ** 0.5 * np.sum(spec2_power ** 2) ** 0.5)
    return score, used_matches

In [6]:
@numba.njit
def collect_peak_pairs(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    # pylint: disable=too-many-arguments
    """Find matching pairs between two spectra.

    Args
    ----
    spec1:
        Spectrum peaks and intensities as numpy array.
    spec2:
        Spectrum peaks and intensities as numpy array.
    tolerance
        Peaks will be considered a match when <= tolerance apart.
    shift
        Shift spectra peaks by shift. The default is 0.
    mz_power:
        The power to raise mz to in the cosine function. The default is 0, in which
        case the peak intensity products will not depend on the m/z ratios.
    intensity_power:
        The power to raise intensity to in the cosine function. The default is 1.

    Returns
    -------
    matching_pairs : numpy array
        Array of found matching peaks.
    """
    matches = find_matches(spec1[:, 0], spec2[:, 0], tolerance, shift)
    idx1 = [x[0] for x in matches]
    idx2 = [x[1] for x in matches]
    if len(idx1) == 0:
        return None
    matching_pairs = []
    for i, idx in enumerate(idx1):
        power_prod_spec1 = (spec1[idx, 0] ** mz_power) * (spec1[idx, 1] ** intensity_power)
        power_prod_spec2 = (spec2[idx2[i], 0] ** mz_power) * (spec2[idx2[i], 1] ** intensity_power)
        matching_pairs.append([idx, idx2[i], power_prod_spec1 * power_prod_spec2])
    return np.array(matching_pairs.copy())

In [7]:



@numba.njit
def find_matches(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    """Faster search for matching peaks.
    Makes use of the fact that spec1 and spec2 contain ordered peak m/z (from
    low to high m/z).

    Parameters
    ----------
    spec1_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    spec2_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    tolerance
        Peaks will be considered a match when <= tolerance apart.
    shift
        Shift peaks of second spectra by shift. The default is 0.

    Returns
    -------
    matches
        List containing entries of type (idx1, idx2).

    """
    lowest_idx = 0
    matches = []
    for peak1_idx in range(spec1_mz.shape[0]):
        mz = spec1_mz[peak1_idx]
        low_bound = mz - tolerance
        high_bound = mz + tolerance
        for peak2_idx in range(lowest_idx, spec2_mz.shape[0]):
            mz2 = spec2_mz[peak2_idx] + shift
            if mz2 > high_bound:
                break
            if mz2 < low_bound:
                lowest_idx = peak2_idx
            else:
                matches.append((peak1_idx, peak2_idx))
    return matches

# Example use case

Here I am intentionally creating different use case that we might encounter in prod:

    - Spectra of different lengths

    - Spectra containing several very close m/z which will fall within the tolerance window (spectrum 4 mz:100 and mz:100.001)
    
    - This is not covered here but we could also have different number of spectra in references and queries - This will always happen in prod actually 

In [8]:
spectrum_1 = Spectrum(mz=np.array([100, 150, 200., 203, 234]),
                      intensities=np.array([0.7, 0.2, 0.1, 0.1, 0.1]),
                      metadata={'id': 'spectrum1'})
spectrum_2 = Spectrum(mz=np.array([100, 140, 190., 210]),
                      intensities=np.array([0.4, 0.2, 0.1, 0.1]),
                      metadata={'id': 'spectrum2'})
spectrum_3 = Spectrum(mz=np.array([110, 140, 195.]),
                      intensities=np.array([0.6, 0.2, 0.1]),
                      metadata={'id': 'spectrum3'})
spectrum_4 = Spectrum(mz=np.array([100, 100.001, 150, 200.]),
                      intensities=np.array([0.6, 0.1, 0.3, 0.6]),
                      metadata={'id': 'spectrum4'})
references = [spectrum_1, spectrum_3]
queries = [spectrum_2, spectrum_4]



In [9]:
large_references = get_ref_spectra_from_df(ref_spectra_df)

100%|██████████| 100001/100001 [00:23<00:00, 4342.67it/s]


In [10]:
from tqdm import tqdm
import itertools

references = large_references[1000:]
queries = large_references[0:100]
batch_size = 1000

s1p = [s.peaks.to_numpy.astype('float32') for s in references]
s1p = sorted(s1p, key=len)

s2p = [s.peaks.to_numpy.astype('float32') for s in queries]
s2p = sorted(s2p, key=len)

In [None]:

def chunks(lst, max_size = 1e9, fill_value=float):
    """Yield successive n-sized chunks from lst."""
    # for i in range(0, len(lst), n):
    i = 0
    max_size = np.maximum.accumulate(np.array(len(l) for l in chunk))
    while True:
        chunk = lst[i:i + n]
        batch = np.full((len(chunk), max_size, 2), fill_value=fill_value, dtype='float32')
        for i,c in enumerate(chunk):
            batch[i, :c.shape[0], :] = c
        yield batch
        
s1p_batched = list(chunks(s1p, n=batch_size, fill_value=1e6))
s2p_batched = list(chunks(s2p, n=batch_size, fill_value=-1e6))

for s1 in s1p_batched:
    for s2 in s2p_batched:
        R, N, _ = s1.shape
        Q, M, _ = s2.shape

(1000, 5, 2)
---> (1000, 5, 2)
1000 100
(1000, 5, 2)
---> (1000, 5, 2)
1000 100
(1000, 5, 2)
---> (1000, 5, 2)
1000 100
(1000, 6, 2)
---> (1000, 6, 2)
1000 100
(1000, 6, 2)
---> (1000, 6, 2)
1000 100
(1000, 6, 2)
---> (1000, 6, 2)
1000 100
(1000, 7, 2)
---> (1000, 7, 2)
1000 100
(1000, 7, 2)
---> (1000, 7, 2)
1000 100
(1000, 7, 2)
---> (1000, 7, 2)
1000 100
(1000, 7, 2)
---> (1000, 7, 2)
1000 100
(1000, 8, 2)
---> (1000, 8, 2)
1000 100
(1000, 8, 2)
---> (1000, 8, 2)
1000 100
(1000, 8, 2)
---> (1000, 8, 2)
1000 100
(1000, 9, 2)
---> (1000, 9, 2)
1000 100
(1000, 9, 2)
---> (1000, 9, 2)
1000 100
(1000, 9, 2)
---> (1000, 9, 2)
1000 100
(1000, 10, 2)
---> (1000, 10, 2)
1000 100
(1000, 10, 2)
---> (1000, 10, 2)
1000 100
(1000, 10, 2)
---> (1000, 10, 2)
1000 100
(1000, 11, 2)
---> (1000, 11, 2)
1000 100
(1000, 11, 2)
---> (1000, 11, 2)
1000 100
(1000, 11, 2)
---> (1000, 11, 2)
1000 100
(1000, 12, 2)
---> (1000, 12, 2)
1000 100
(1000, 12, 2)
---> (1000, 12, 2)
1000 100
(1000, 12, 2)
---> (1000

In [None]:

    # for spectrum_2 in queries:
        # spec1 = spectrum_1.peaks.to_numpy
        # spec2 = spectrum_2.peaks.to_numpy
        # ins1 = spectrum_1.peaks._intensities

        # mz2 = spectrum_2.peaks._mz
        # ins2 = spectrum_2.peaks._intensities
        # matching_pairs = collect_peak_pairs(spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy, 0.1,
        #             shift=0.0, mz_power=0.0,
        #             intensity_power=1.0)
        # if matching_pairs is not None:
        #     score = score_best_matches(matching_pairs, spec1, spec2, 0.0, 1.0)

# Running the pipeline

In [None]:
# In the matchms library, the cosineGreedy function and Score object would take care of the this for loop
# There may be some extra data prep and filter as well but we can ignore that for now
# and reorganise the data to return it as one single output.
# Here is an example of how it would work by default:

# Start example ------------

# import numpy as np
# from matchms.similarity import CosineGreedy
# from matchms import calculate_scores

# similarity_measure = CosineGreedy()
# scores = calculate_scores(references, queries, similarity_measure)

# This only for printing the results
# for (reference, query, score) in scores:
#     print(f"Cosine score between {reference.get('id')} and {query.get('id')}" +
#           f" is {score[0]:.2f} with {score[1]} matched peaks")

# End example --------------

# Here I just break it down so you see what is happening behind the scenes
# It is basically three function calls (the first two will collect the matching pairs, the third will score them - including normalization)
# The time bottleneck is on the last function call (score_best_matches), as it operates on single spectra pairs
# Storing the output of matching pairs in some dataframe (tensor) and then scoring them in batches would be much faster
# So it is really the last function that needs to be changed
# I managed to create a batch version of the first two functions but that did not hold any speedup
# Plus it only worked on specific case - multiple spectra in the reference against a single spectrum in the query (not multiple vs multiple)

# I am setting the mz_tolerance to 0.1, as this is probably the value we will use by default
# That being said, in this exercise it just to cover all potential cases where we can have a peak from one spectrum matching several peaks in the other
# Only one (the first) is then selected in the scoring function 

    # for spectrum_1 in references:
    #     for spectrum_2 in queries:
    #         spec1 = spectrum_1.peaks.to_numpy
    #         spec2 = spectrum_2.peaks.to_numpy
    #         matching_pairs = collect_peak_pairs(spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy, 0.1,
    #                     shift=0.0, mz_power=0.0,
    #                     intensity_power=1.0)
    #         if matching_pairs is not None:
    #             score = score_best_matches(matching_pairs, spec1, spec2,
    #                                     0.0, 1.0)
    #             print(score)
    #         else: print("No matching pairs found")


In [None]:

# @numba.njit
def collect_peak_pairs(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    # pylint: disable=too-many-arguments
    """Find matching pairs between two spectra.

    Args
    ----
    spec1:
        Spectrum peaks and intensities as numpy array.
    spec2:
        Spectrum peaks and intensities as numpy array.
    tolerance
        Peaks will be considered a match when <= tolerance apart.
    shift
        Shift spectra peaks by shift. The default is 0.
    mz_power:
        The power to raise mz to in the cosine function. The default is 0, in which
        case the peak intensity products will not depend on the m/z ratios.
    intensity_power:
        The power to raise intensity to in the cosine function. The default is 1.

    Returns
    -------
    matching_pairs : numpy array
        Array of found matching peaks.
    """
    matches = find_matches(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global a
    # a = matches
    # matches_op = find_matches_opt(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global b
    # b = matches_op
    # assert np.allclose(matches, matches_op)
    
    idx1 = [x[0] for x in matches]
    idx2 = [x[1] for x in matches]
    if len(idx1) == 0:
        return None
    matching_pairs = []
    for i, idx in enumerate(idx1):
        power_prod_spec1 = (spec1[idx, 0] ** mz_power) * (spec1[idx, 1] ** intensity_power)
        power_prod_spec2 = (spec2[idx2[i], 0] ** mz_power) * (spec2[idx2[i], 1] ** intensity_power)
        matching_pairs.append([idx, idx2[i], power_prod_spec1 * power_prod_spec2])
    return np.array(matching_pairs.copy())


# @numba.njit
def find_matches(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    """Faster search for matching peaks.
    Makes use of the fact that spec1 and spec2 contain ordered peak m/z (from
    low to high m/z).

    Parameters
    ----------
    spec1_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    spec2_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    tolerance
        Peaks will be considered a match when <= tolerance apart.
    shift
        Shift peaks of second spectra by shift. The default is 0.

    Returns
    -------
    matches
        List containing entries of type (idx1, idx2).

    """
    
    lowest_idx = 0
    matches = []
    for peak1_idx in range(spec1_mz.shape[0]):
        mz = spec1_mz[peak1_idx]
        low_bound = mz - tolerance
        high_bound = mz + tolerance
        for peak2_idx in range(lowest_idx, spec2_mz.shape[0]):
            mz2 = spec2_mz[peak2_idx] + shift
            if mz2 > high_bound:
                break
            if mz2 < low_bound:
                lowest_idx = peak2_idx
            else:
                matches.append((peak1_idx, peak2_idx))
    return matches

# Benchmarking runtime

Below is a small snippets to benchmark runtime of the collect_peak_pairs function and score_best_matches function.
To do so we split the approach in two parts, first collecting and storing the matching pairs, second running the score function


In [None]:

# for spectrum_1 in tqdm(references):
#     for spectrum_2 in queries:
#         spec1 = spectrum_1.peaks.to_numpy
#         spec2 = spectrum_2.peaks.to_numpy
        
#         matching_pairs = collect_peak_pairs_opt(
#                     spectrum_1.peaks.to_numpy, 
#                     spectrum_2.peaks.to_numpy, 
#                     0.1,
#                     shift=0.0, mz_power=0.0,
#                     intensity_power=1.0
#         )
#         if matching_pairs is not None:
#             pairs_to_score_list.append([ matching_pairs, spectrum_1, spectrum_2])  
# end_collect_peaks = time.time()
# print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

In [None]:

# start_collect_peaks = time.time()
# pairs_to_score_list = []
# def fn(spectrum_1, spectrum_2):
#     s1, s2 = spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy
#     matching_pairs = collect_peak_pairs(s1, s2, 0.1,
#                 shift=0.0, mz_power=0.0,
#                 intensity_power=1.0)
#     if matching_pairs is not None:
#         score = score_best_matches(matching_pairs, s1, s2, 0.0, 1.0)
#         if score is not None:
#             return [matching_pairs, spectrum_1, spectrum_2, score]
# pairs_to_score_list = Parallel(-1)(delayed(fn)(spectrum_1, spectrum_2) 
#                                    for spectrum_1, spectrum_2 in tqdm(product(references, queries), total=len(references) * len(queries)))
# pairs_to_score_list = [e for e in pairs_to_score_list if e is not None] # Make sure if matching_pairs is missing, remove entry


def find_matches_opt(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    a = tf.constant(spec1_mz)[...,None]
    b = tf.constant(spec2_mz)[None,...]
    a = tf.repeat(a, axis=1, repeats=b.shape[1])
    b = tf.repeat(b, axis=0, repeats=a.shape[0])
    c = tf.where(tf.abs(a - (b+shift)) <= tolerance)
    matches = c.numpy().tolist()
    return matches

def collect_peak_pairs_opt(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    spec1 = tf.constant(spec1)
    spec2 = tf.constant(spec2)

    a = tf.constant(spec1[:, 0])[...,None]
    b = tf.constant(spec2[:, 0])[None,...]

    a_mat = tf.repeat(a, axis=1, repeats=b.shape[1])
    b_mat = tf.repeat(b, axis=0, repeats=a.shape[0])
    match_idx = tf.where(tf.abs(a_mat - (b_mat+shift)) <= tolerance)
    
    power_prod_spec1 = tf.gather(spec1[:,0], match_idx[:,0]) ** mz_power * tf.gather(spec1[:,1], match_idx[:,0]) ** intensity_power
    power_prod_spec2 = tf.gather(spec2[:,0], match_idx[:,1]) ** mz_power * tf.gather(spec2[:,1], match_idx[:,1]) ** intensity_power

    matching_pairs = tf.concat([tf.cast(match_idx, tf.float64), power_prod_spec1[...,None] * power_prod_spec2[...,None]], 1)
    return matching_pairs.numpy()

# for spectrum_1 in tqdm(references):
#     for spectrum_2 in queries:
#         spec1 = spectrum_1.peaks.to_numpy
#         spec2 = spectrum_2.peaks.to_numpy
        
#         matching_pairs = collect_peak_pairs_opt(
#                     spectrum_1.peaks.to_numpy, 
#                     spectrum_2.peaks.to_numpy, 
#                     0.1,
#                     shift=0.0, mz_power=0.0,
#                     intensity_power=1.0
#         )
#         if matching_pairs is not None:
#             pairs_to_score_list.append([ matching_pairs, spectrum_1, spectrum_2])  
# end_collect_peaks = time.time()
# print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

In [None]:
# List to store the matching pairs along with the spectra
import time
from itertools import product

# Very small example
references = [spectrum_1, spectrum_3]
queries = [spectrum_2, spectrum_4]

# Realistic example
# references = large_references[1000:100000]
# queries = large_references[0:10]


In [None]:
def spectra_peaks_to_tensor(spectra: list, fill: float) -> tf.Tensor:
    sp_max_shape = max(len(s.peaks) for s in spectra)
    sp = np.full((len(spectra), sp_max_shape, 2), fill, 'float32')
    for i, s in enumerate(spectra):
        sp[i, :len(s.peaks)] = s.peaks.to_numpy
    sp = tf.constant(sp, tf.float32)
    return sp
import random
import gc
gc.collect()

mz_power = 0.0
intensity_power = 1.0

NUM_R = 100
NUM_Q = 10
references = random.sample(large_references,NUM_R) # 1e6
queries = random.sample(large_references,NUM_R) # 1e5

for i in tqdm(range(100)):
    ref_sp = spectra_peaks_to_tensor(references, np.inf)
    que_sp = spectra_peaks_to_tensor(queries, np.inf)
    shift = 0
    tolerance = .1
    R,M,T = ref_sp.shape
    Q,N,T = que_sp.shape

    a = tf.constant(ref_sp[..., 0])[...,None]
    b = tf.constant(que_sp[..., 0])[...,None]

    a_mat = tf.repeat(a, axis=-1, repeats=N)
    a_mat = a_mat[None,...]
    a_mat = tf.repeat(a_mat, axis=0, repeats=Q)
    a_mat = tf.transpose(a_mat, [1,0,2,3])

    b_mat = tf.repeat(b, axis=-1, repeats=M)
    b_mat = tf.transpose(b_mat, [0,2,1])
    b_mat = b_mat[None,...]
    b_mat = tf.repeat(b_mat, axis=0, repeats=R)
    b_mat = tf.add(b_mat, -shift)
    a_mat.shape, b_mat.shape

    match = tf.where(tf.abs(a_mat - b_mat) < tolerance)
    
    power_prod_spec1 = tf.gather(a_mat[:,0], match[:,0]) ** mz_power * tf.gather(a_mat[:,1], match[:,0]) ** intensity_power
    power_prod_spec2 = tf.gather(b_mat[:,0], match[:,1]) ** mz_power * tf.gather(b_mat[:,1], match[:,1]) ** intensity_power

    match = tf.cast(match, tf.float32)
    matching_pairs = tf.concat([match, power_prod_spec1[...,None] * power_prod_spec2[...,None]], 1)
    # print(i, tf.reduce_mean(z))

  0%|          | 0/100 [00:10<?, ?it/s]


ResourceExhaustedError: {{function_node __wrapped__Abs_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Abs]

In [None]:

# match_idx = tf.where(tf.abs(a_mat - (b_mat+shift)) <= tolerance)

# power_prod_spec1 = tf.gather(spec1[:,0], match_idx[:,0]) ** mz_power * tf.gather(spec1[:,1], match_idx[:,0]) ** intensity_power
# power_prod_spec2 = tf.gather(spec2[:,0], match_idx[:,1]) ** mz_power * tf.gather(spec2[:,1], match_idx[:,1]) ** intensity_power

# matching_pairs = tf.concat([tf.cast(match_idx, tf.float64), power_prod_spec1[...,None] * power_prod_spec2[...,None]], 1)

In [None]:

    # for i, idx in enumerate(idx1):
    #     power_prod_spec1 = (spec1[idx, 0] ** mz_power) * (spec1[idx, 1] ** intensity_power)
    #     power_prod_spec2 = (spec2[idx2[i], 0] ** mz_power) * (spec2[idx2[i], 1] ** intensity_power)
    #     matching_pairs.append([idx, idx2[i], power_prod_spec1 * power_prod_spec2])
    # return np.array(matching_pairs.copy())


# collect_peak_pairs(
#     spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy, 0.1,
#     shift=0.0, mz_power=0.0,
#     intensity_power=1.0
# ), collect_peak_pairs_opt(
#     spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy, 0.1,
#     shift=0.0, mz_power=0.0,
#     intensity_power=1.0
# )

# shift = 0
# tolerance = .1
# mz_power = 0.0
# intensity_power = 1.0

# spec1 = tf.constant(spec1)
# spec2 = tf.constant(spec2)

# a = tf.constant(spec1[:, 0])[...,None]
# b = tf.constant(spec2[:, 0])[None,...]

# a_mat = tf.repeat(a, axis=1, repeats=b.shape[1])
# b_mat = tf.repeat(b, axis=0, repeats=a.shape[0])
# match_idx = tf.where(tf.abs(a - (b+shift)) <= tolerance)

# power_prod_spec1 = tf.gather(spec1[:,0], match_idx[:,0]) ** mz_power * tf.gather(spec1[:,1], match_idx[:,0]) ** intensity_power
# power_prod_spec2 = tf.gather(spec2[:,0], match_idx[:,1]) ** mz_power * tf.gather(spec2[:,1], match_idx[:,1]) ** intensity_power

# tf.concat([tf.cast(match_idx, tf.float64), power_prod_spec1[...,None] * power_prod_spec2[...,None]], 1)

In [None]:
tolerance = 0.1
a = spectrum_1.peaks.to_numpy[:,0]
b = spectrum_2.peaks.to_numpy[:,0]
a = tf.constant(a)[None,...]
b = tf.constant(b)[...,None]
a = tf.repeat(a, axis=0, repeats=b.shape[0])
b = tf.repeat(b, axis=1, repeats=a.shape[1])
c = tf.where(tf.abs(a - b) <= tolerance)
c

<tf.Tensor: shape=(0, 2), dtype=int64, numpy=array([], shape=(0, 2), dtype=int64)>

InvalidArgumentError: {{function_node __wrapped__MatMul_device_/job:localhost/replica:0/task:0/device:CPU:0}} In[0] and In[1] ndims must be == 2: 1 [Op:MatMul]

In [None]:
# Now we can score the matching pairs
start_score = time.time()
# def fn(matching_pairs, spectrum_1, spectrum_2):
#     score = score_best_matches(matching_pairs, spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy,
#                                 0.0, 1.0)
#     return score

# scores = Parallel(-1)(delayed(fn)(matching_pairs, spectrum_1, spectrum_2) 
#                       for matching_pairs, spectrum_1, spectrum_2 in tqdm(pairs_to_score_list, total=len(pairs_to_score_list)))
scores = []
for matching_pairs, spectrum_1, spectrum_2 in tqdm(pairs_to_score_list):
    scores.append(score_best_matches(matching_pairs, spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy,
                                0.0, 1.0))

end_score = time.time()
print("Time to score matching pairs: ", end_score - start_score)

In [None]:
import pickle
# pickle.dump(pairs_to_score_list, Path('pairs_to_score_list.pickle').open('wb'))
# pickle.dump(scores, Path('scores.pickle').open('wb'))

In [None]:
scores_ld = pickle.load(Path('scores.pickle').open('rb'))

In [None]:
(np.array(scores_ld) == np.array(scores)).all()

True

Unoptimized
``` 
100%|██████████| 91698/91698 [00:56<00:00, 1627.05it/s]
Time to collect matching pairs:  56.94396376609802
100%|██████████| 635365/635365 [00:21<00:00, 29557.14it/s]
Time to score matching pairs:  21.498287439346313
```

Optimized (CPU parallelism, 12 CPUs)
``` 
100%|██████████| 916980/916980 [00:20<00:00, 44429.67it/s]
Time to collect matching pairs:  21.796888828277588
100%|██████████| 635365/635365 [00:20<00:00, 30734.33it/s]
Time to score matching pairs:  21.340625762939453
```

Optimized (CPU parallelism 12 CPUs)
```
100%|██████████| 916980/916980 [00:30<00:00, 29640.48it/s]
Time to collect matching pairs:  31.654369831085205
```