In [20]:
# Benchmark configuration.
#
# Spectra file on which pairwise-similarity performance is compared.
# A pickled version could be used instead for much faster loading times.
spectra_file = 'GNPS-random-10k.mgf'

# Each benchmark run draws a random sample of spectra from that file;
# the sample size is swept between the two bounds below.

# Smallest sample size
chunk_sizes_min = 32

# Largest sample size
chunk_sizes_max = 2048

# Number of sample sizes to evaluate, spaced logarithmically between the
# minimum and the maximum (both endpoints included)
num_evals = 15

# IPython shell escape: print the number of processing units reported by `nproc`
! echo Number of CPU cores $(nproc)

Number of CPU cores 128


In [21]:
from cudams.utils import Timer
from tqdm import tqdm
import numpy as np
import sys
import pandas as pd
import pickle
from cudams.utils import download
from joblib import Parallel, delayed
from matchms.similarity import CosineGreedy, ModifiedCosine
from matchms.importing import load_from_mgf
from matchms.filtering import require_precursor_mz
from cudams.utils import Timer

# Seed NumPy's global random state of this (parent) process.
np.random.seed(42)

# Fetch the configured MGF file and materialise every spectrum it yields.
raw_spectra = [*load_from_mgf(download(spectra_file))]

In [22]:
# Run every raw spectrum through `require_precursor_mz` and keep only the
# ones for which the filter does not return None.
spectra = [
    checked
    for checked in map(require_precursor_mz, raw_spectra)
    if checked is not None
]

In [23]:
# Number of spectra kept after the `require_precursor_mz` filter
len(spectra)

9997

In [24]:
# Sample sizes to benchmark: `num_evals` points spaced evenly on a log2 scale
# from `chunk_sizes_min` to `chunk_sizes_max` (both included), rounded to the
# nearest integer.
chunk_sizes = (
    np.logspace(
        start=np.log2(chunk_sizes_min),
        stop=np.log2(chunk_sizes_max),
        num=num_evals,
        endpoint=True,
        base=2,
    )
    .round()
    .astype(int)
)

def loop(chunk_size, kernel, seed=None):
    """Time one all-pairs similarity computation on a random sample of spectra.

    Two samples of `chunk_size` spectra each (references and queries) are
    drawn with replacement from the module-level `spectra` list, and
    `kernel.matrix(references, queries)` is timed with `Timer`.

    Parameters
    ----------
    chunk_size : int
        Number of spectra drawn for each of the two samples.
    kernel : object
        Similarity object exposing `.matrix(references, queries)`
        (here: `CosineGreedy()` or `ModifiedCosine()`).
    seed : int or None, optional
        Seed for a private `np.random.RandomState` used for sampling.
        With the default `None`, the global NumPy random state of the
        executing process is used, exactly as before this parameter existed.

    Returns
    -------
    tuple
        `(kernel class name, timer.duration, number of pairs compared)`.
        The unit of `timer.duration` is defined by `cudams.utils.Timer`
        (presumably seconds -- TODO confirm).
    """
    # This function runs inside joblib worker processes, where the parent's
    # `np.random.seed(...)` call has no effect; an explicit seed is the only
    # way to make the drawn sample reproducible.
    rng = np.random if seed is None else np.random.RandomState(seed)
    references = rng.choice(spectra, size=chunk_size)
    queries = rng.choice(spectra, size=chunk_size)

    # Allow warm-up: a tiny call first, so one-off start-up cost is not timed
    kernel.matrix(references[:4], queries[:4])

    with Timer() as timer:
        kernel.matrix(references, queries)
    return kernel.__class__.__name__, \
        timer.duration,\
        len(references) * len(queries) # All samples

# We have enough CPUs, right?
# Four tasks run concurrently. Every chunk size gets its own deterministic
# seed, shared by both kernels, so the two kernels are timed on the same
# sample and a re-run draws the same spectra.
# NOTE(review): concurrent tasks may compete for resources and inflate the
# measured durations -- confirm this is acceptable for the comparison.
data = Parallel(n_jobs=4)(
    delayed(loop)(chunk_size, kernel, seed=42 + i)
    for i, chunk_size in enumerate(tqdm(chunk_sizes))
    for kernel in [CosineGreedy(), ModifiedCosine()]
)
data = pd.DataFrame(data, columns=['kernel','time','pairs'])
data['label'] = 'MatchMS'






  0%|          | 0/15 [00:00<?, ?it/s][A[A[A[A[A




 13%|█▎        | 2/15 [00:00<00:02,  4.86it/s][A[A[A[A[A




 27%|██▋       | 4/15 [00:07<00:23,  2.12s/it][A[A[A[A[A




 40%|████      | 6/15 [00:09<00:15,  1.73s/it][A[A[A[A[A




 47%|████▋     | 7/15 [00:10<00:10,  1.34s/it][A[A[A[A[A




 53%|█████▎    | 8/15 [00:11<00:09,  1.30s/it][A[A[A[A[A




 67%|██████▋   | 10/15 [00:16<00:09,  1.83s/it][A[A[A[A[A




 80%|████████  | 12/15 [00:26<00:09,  3.05s/it][A[A[A[A[A




100%|██████████| 15/15 [00:57<00:00,  3.84s/it][A[A[A[A[A


In [25]:
# Write the timing table to the cell output as JSON
data.to_json(path_or_buf=sys.stdout)

{"kernel":{"0":"CosineGreedy","1":"ModifiedCosine","2":"CosineGreedy","3":"ModifiedCosine","4":"CosineGreedy","5":"ModifiedCosine","6":"CosineGreedy","7":"ModifiedCosine","8":"CosineGreedy","9":"ModifiedCosine","10":"CosineGreedy","11":"ModifiedCosine","12":"CosineGreedy","13":"ModifiedCosine","14":"CosineGreedy","15":"ModifiedCosine","16":"CosineGreedy","17":"ModifiedCosine","18":"CosineGreedy","19":"ModifiedCosine","20":"CosineGreedy","21":"ModifiedCosine","22":"CosineGreedy","23":"ModifiedCosine","24":"CosineGreedy","25":"ModifiedCosine","26":"CosineGreedy","27":"ModifiedCosine","28":"CosineGreedy","29":"ModifiedCosine"},"time":{"0":3.3452200247,"1":3.4825263112,"2":1.4242219273,"3":3.6208217419,"4":1.5034325663,"5":1.3939296789,"6":0.3249981231,"7":0.9768814989,"8":0.5354183698,"9":2.4722801382,"10":1.0703489119,"11":4.0897979671,"12":2.5273353779,"13":5.94818806,"14":3.3287949921,"15":10.5887021492,"16":6.0714132083,"17":19.0591123402,"18":11.0056099179,"19":34.3604118112,"20":19.