This notebook evaluates the performance gains of `CudaCosineGreedy` over `matchms`.

Performance depends heavily on the hardware used, as well as on how correlated the spectra are with each other (i.e., how many pairs of peaks the spectra have in common, on average).

This notebook evaluates performance using the arguments below (feel free to change them):

In [1]:
# We can use pickled version for much faster loading times
spectra_file = 'GNPS-LIBRARY-default-filter-nmax-2048.pickle'

# We take a random sample of spectra from said file

# Minimum size:
chunk_sizes_min = 32

# Maximum size
chunk_sizes_max = 5_000

# how many points to evaluate (in logspace) between min and max
num_evals = 15

# max number of peaks to retain in any spectra - larger numbers are marginally more accurate, but much slower
n_max_peaks = 1024

# Match limit
match_limit = 2048

# tolerance
tolerance = 0.1

# optimal batch size is hardware-dependent, but usually the best number is the largest the hardware can handle (without an OOM error)
batch_size = 2048

# Hardware matters! These results are only repeatable using this GPU (shown as an output)
! nvidia-smi -L
! echo Number of CPU cores $(nproc)

GPU 0: Tesla T4 (UUID: GPU-910df82c-0fd1-816b-c967-a08a64d0cda4)
Number of CPU cores 2


In [2]:
! pip install -q numpy==1.24
! pip uninstall cudams -q -y
! pip install -q --upgrade git+https://github.com/tornikeo/cudams@main

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.86 requires numpy>=1.24.1, but you have numpy 1.24.0 which is incompatible.
imageio 2.31.6 requires pillow<10.1.0,>=8.3.2, but you have pillow 10.2.0 which is incompatible.
numba 0.58.1 requires llvmlite<0.42,>=0.41.0dev0, but you have llvmlite 0.40.1 which is incompatible.
seaborn 0.13.1 requires numpy!=1.24.0,>=1.20, but you have numpy 1.24.0 which is incompatible.[0m[31m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for cudams (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take in

In [3]:
from cudams.utils import argbatch, Timer
from cudams.similarity.spectrum_similarity_functions import cosine_greedy_kernel
from pathlib import Path
from tqdm import tqdm
from numba import cuda
import numpy as np
import torch
import matplotlib.pyplot as plt
import pickle
from cudams.utils import download
from joblib import Parallel, delayed
from matchms.filtering import default_filters, normalize_intensities, reduce_to_number_of_peaks
from matchms.importing import load_from_mgf
from cudams.utils import mute_stdout

# Fetch the pickled spectra via `download`, which yields the path passed to
# `open`, and load them.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open(download(spectra_file), 'rb') as spectra_fh:
    spectra = pickle.load(spectra_fh)

  from scipy.sparse.sputils import get_index_dtype
Downloading data from 'https://github.com/tornikeo/cudams/releases/download/samples-0.1/GNPS-LIBRARY-default-filter-nmax-2048.pickle' to file '/root/.cache/pooch/2243cebc54a3035914aa78c57585f0d1-GNPS-LIBRARY-default-filter-nmax-2048.pickle'.
100%|█████████████████████████████████████| 86.7M/86.7M [00:00<00:00, 12.8GB/s]
SHA256 hash of downloaded file: 9797fa1068f59c6e2a2e005c44d7322f07b6cb08e125c9d5c2cc60bdfe3771e6
Use this value as the 'known_hash' argument of 'pooch.retrieve' to ensure that the file hasn't changed if it is downloaded again in the future.
  spectra = pickle.load(open(download(spectra_file),'rb'))


In [4]:
from cudams.similarity import CudaCosineGreedy

np.random.seed(42)

# Problem sizes: `num_evals` points spaced evenly in log-space between
# `chunk_sizes_min` and `chunk_sizes_max` (both inclusive), rounded to whole
# numbers of spectra.
chunk_sizes_cu = np.round(np.logspace(
    np.log2(chunk_sizes_min),
    np.log2(chunk_sizes_max),
    num=num_evals,
    base=2,
    endpoint=True)
).astype(int)

# Per-chunk wall-clock durations and the number of spectrum pairs processed.
times_cu = []
pairs_cu = []

# Kernel compilation might take a bit of time initially.
# `tolerance` and `match_limit` from the configuration cell are passed
# explicitly so that changing them there actually affects the benchmark.
# NOTE(review): assumes the installed cudams `CudaCosineGreedy` accepts the
# `tolerance` and `match_limit` keyword arguments - confirm against the
# installed version.
kernel = CudaCosineGreedy(tolerance=tolerance,
                          batch_size=batch_size,
                          n_max_peaks=n_max_peaks,
                          match_limit=match_limit)

# To force CUDA to load code to GPU, we need to do a warmup run that is
# excluded from the timings.
kernel.matrix(spectra[:64], spectra[:64])

# We avoid parallelism here, since we only have one GPU, after all.
for chunk_size in tqdm(chunk_sizes_cu):
    chunk_size = min(len(spectra), chunk_size) # We might run out of spectra
    # The first `chunk_size` spectra are compared against themselves.
    references = spectra[:chunk_size]
    queries = references # Pairwise
    # Only the similarity computation itself is timed.
    with Timer() as timer:
        kernel.matrix(references, queries)
    times_cu.append(timer.duration)
    pairs_cu.append(len(references) * len(queries)) # We've processed all pairs

  and should_run_async(code)
100%|██████████| 15/15 [01:25<00:00,  5.67s/it]


In [9]:
import json
import os

# Bundle the measurements together with the hardware they were taken on
# (GPU name and CPU count), then print the bundle as indented JSON.
benchmark_summary = {
    "pairs_cu": pairs_cu,
    "times_cu": times_cu,
    "device": torch.cuda.get_device_name(),
    "nproc": os.cpu_count(),
}
print(json.dumps(benchmark_summary, indent=1))

{
 "pairs_cu": [
  1024,
  2116,
  4356,
  8836,
  18496,
  37636,
  77841,
  160000,
  329476,
  677329,
  1394761,
  2869636,
  5904900,
  12152196,
  25000000
 ],
 "times_cu": [
  0.8951638820000198,
  0.887718360000008,
  0.8959380680000777,
  0.9086294239999688,
  0.9281062210000073,
  0.9271748569999545,
  0.9635137029999896,
  1.0036216500000137,
  1.2886957990000383,
  1.5533741749999308,
  2.4807445960000223,
  3.2844729860000825,
  9.68464170499999,
  19.980734250999944,
  39.32723661600005
 ],
 "device": "Tesla T4",
 "nproc": 2
}
