In [3]:
# Enable the autoreload extension in mode 2 so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show the working directory; the relative data paths defined further down
# are resolved against it.
%pwd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/tornikeo/Documents/work/scalexa/pangeaai/optimize-cosine'

In [4]:
from cudams.utils import argbatch, mkdir
from cudams.data import get_ref_spectra_from_df
from cudams.kernel import compile
from cudams.utils import name2idx
from cudams.cosine import similarity
import math
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from cudams.data import spectra_peaks_to_tensor
from cudams.processor import Config
from numba import cuda
from itertools import product
from time import perf_counter
from multiprocessing.pool import ThreadPool
from multiprocessing import shared_memory
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib

# The GPU comparison further down needs a working CUDA device; fail here with a
# readable message instead of a bare AssertionError.
assert cuda.is_available(), (
    "numba.cuda reports no usable CUDA device; this notebook needs a GPU"
)

In [17]:
## Scoring parameters shared by the implementations compared in this notebook
tolerance: float = 0.1
shift: float = 0
mz_power: float = 0
int_power: float = 1

## Batch size used to split references and queries. Has to be a power of 2.
# The best value is hardware specific: an RTX 2070 works best at around 1024 * 2,
# while a Colab T4 GPU might work best at 1024 * 4.
BATCH_SIZE = 512

# Upper bound on the number of peaks kept per spectrum.
MAX_PEAKS = 1024

# MATCH_LIMIT caps how many mz-mz pairs are considered for each
# reference/query (RQ) pair before they are sorted and filtered.
# For example, a value of 256 usually makes roughly 0.003% of RQ pairs "overflow".
# An overflown RQ score is always less than or equal to the exact score, and the
# mean absolute difference over all overflown pairs at 256 is on the order of 1e-3.
# Small values of MATCH_LIMIT (e.g. 128 or 64) speed processing up dramatically.
MATCH_LIMIT = 4 * 1024

## GPU-specific constants
THREADS_PER_BLOCK = (32, 32)
# Enough blocks along each axis to cover BATCH_SIZE items, rounding up.
BLOCKS_PER_GRID = tuple(
    math.ceil(BATCH_SIZE / threads) for threads in THREADS_PER_BLOCK
)
BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y = BLOCKS_PER_GRID

# Greedy cosine is numerically unstable: approximately equal mz values do not
# necessarily give approximately equal scores or match counts. fp64 is therefore
# used to keep the deviation as small as possible, even though float32 would be
# significantly faster.
dtype = 'float64'

# Input data. References and queries are read from the same CSV file.
reference_csv_file = Path("data/input/test_set_cosine.csv")
query_csv_file = Path("data/input/test_set_cosine.csv")

In [18]:
from cudams.processor import CudaCosineGreedy, CpuCosineGreedy
from collections import defaultdict
from matchms import calculate_scores
from tqdm import tqdm
from matchms.filtering import normalize_intensities, select_by_mz, select_by_relative_intensity, reduce_to_number_of_peaks, \
    require_minimum_number_of_peaks
from cudams.utils import mute_stdout

def process_spectrum(spectrum: np.ndarray) -> np.ndarray:
    """Pre-process a single spectrum before it is scored.

    The only active step is ``reduce_to_number_of_peaks`` with
    ``n_max=MAX_PEAKS``; its result is returned unchanged.

    Filters that were tried here and are currently disabled:
    ``select_by_mz(mz_from=10.0, mz_to=1000.0)``, ``normalize_intensities``,
    ``select_by_relative_intensity(intensity_from=0.001)``,
    ``reduce_to_number_of_peaks(n_max=1000)`` and
    ``require_minimum_number_of_peaks(n_required=5)``.

    NOTE(review): the parameter is annotated as ``np.ndarray`` but is handed
    straight to a matchms filter, so it is presumably a matchms spectrum
    object -- confirm and correct the annotation.
    """
    return reduce_to_number_of_peaks(spectrum, n_max=MAX_PEAKS)

def _load_spectra(csv_file):
    """Read ``csv_file`` and turn it into processed spectra.

    The CSV is loaded with pandas and passed to ``get_ref_spectra_from_df``
    together with ``process_spectrum`` and ``limit=BATCH_SIZE * 2``.

    Returns a ``(path, dataframe, spectra)`` triple, where ``spectra`` is
    whatever ``get_ref_spectra_from_df`` returned.
    """
    csv_path = Path(csv_file)
    frame = pd.read_csv(csv_path)
    spectra = get_ref_spectra_from_df(
        frame,
        spectrum_processor=process_spectrum,
        limit=BATCH_SIZE * 2,
    )
    return csv_path, frame, spectra


# References first, then queries, so progress bars appear in the same order.
ref_spectra_df_path, ref_spectra_df, references = _load_spectra(reference_csv_file)
query_spectra_df_path, query_spectra_df, queries = _load_spectra(query_csv_file)

100%|██████████| 1024/1024 [00:02<00:00, 386.19it/s]
100%|██████████| 1024/1024 [00:00<00:00, 3631.09it/s]


In [19]:
# Keep only the trailing BATCH_SIZE entries of each collection (everything, if
# fewer were loaded).
references, queries = references[-BATCH_SIZE:], queries[-BATCH_SIZE:]

In [21]:
from typing import Tuple
from matchms.typing import SpectrumType
from matchms.similarity import CosineGreedy as OriginalCosineGreedy
from cudams.processor import CudaCosineGreedy, CpuCosineGreedy
from collections import defaultdict
from matchms import calculate_scores
from tqdm import tqdm
from matchms.filtering import normalize_intensities, select_by_mz, select_by_relative_intensity, reduce_to_number_of_peaks, \
    require_minimum_number_of_peaks
from cudams.utils import mute_stdout

from matchms.typing import SpectrumType
from matchms.similarity.BaseSimilarity import BaseSimilarity
from matchms.similarity.spectrum_similarity_functions import (collect_peak_pairs,
                                            score_best_matches)


class CosineGreedy(OriginalCosineGreedy):
    """matchms ``CosineGreedy`` with ``pair`` re-implemented in the notebook.

    Construction is delegated unchanged to the parent class. ``pair`` is
    spelled out here so the candidate-pair ordering (a stable argsort on the
    third column, reversed) is explicit.
    """

    def __init__(self, tolerance: float = 0.1, mz_power: float = 0, intensity_power: float = 1):
        super().__init__(tolerance, mz_power, intensity_power)

    def pair(self, reference: SpectrumType, query: SpectrumType) -> Tuple[float, int]:
        """Calculate the cosine score between two spectra.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        Score
            Array of dtype ``self.score_datatype`` holding the cosine score and
            the number of matched peaks; ``(0.0, 0)`` when
            ``collect_peak_pairs`` returns ``None``.
        """
        ref_peaks = reference.peaks.to_numpy
        query_peaks = query.peaks.to_numpy

        # Candidate peak pairs within the tolerance; the shift is fixed at 0.0.
        candidates = collect_peak_pairs(ref_peaks, query_peaks, self.tolerance,
                                        shift=0.0, mz_power=self.mz_power,
                                        intensity_power=self.intensity_power)
        if candidates is None:
            return np.asarray((float(0), 0), dtype=self.score_datatype)

        # Order candidates by their third column, largest first. The argsort is
        # stable and then reversed.
        # TODO: Original doesn't have kind=stable!
        descending = np.argsort(candidates[:, 2], kind='stable')[::-1]
        ranked = candidates[descending, :]

        result = score_best_matches(ranked, ref_peaks, query_peaks,
                                    self.mz_power, self.intensity_power)
        return np.asarray(result, dtype=self.score_datatype)

# Score every reference against every query with the matchms-based CPU
# implementation. The powers come from the shared config constants (cast to
# float, matching the literals previously hard-coded here) so this baseline
# cannot drift from the other implementations if the config changes.
similarity_measure = CosineGreedy(
    tolerance=tolerance,
    mz_power=float(mz_power),
    intensity_power=float(int_power),
)
C_orig = calculate_scores(references, queries, similarity_measure, is_symmetric=False)

# `to_array()` is indexed by field name below. Repack it as one plain array
# whose last axis is (score, matches).
scores_structured = C_orig.to_array()
Cm = scores_structured['CosineGreedy_matches']
Cy = np.stack([scores_structured['CosineGreedy_score'], Cm], axis=-1)

100%|██████████| 1/1 [00:01<00:00,  1.93s/it]
Batch all references: 1it [00:00, 89.64it/s]
Batch all queries: 1it [00:00, 90.27it/s]
100%|██████████| 1/1 [00:01<00:00,  1.99s/it]

MOST IMPORTANT ====>  0.9990463256835938 250
Overflows 0.0 0
CPU orig vs GPU 0.9988288879394531 614
CPU opt vs GPU: 0.9993743896484375 328
CPU orig vs CPU opt 0.9993362426757812 348





count    262144.000000
mean          0.000076
std           0.034719
min          -2.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           4.000000
dtype: float64

In [30]:
from typing import Tuple

from matchms.typing import SpectrumType
from matchms.similarity.BaseSimilarity import BaseSimilarity
from matchms.similarity.spectrum_similarity_functions import (collect_peak_pairs,
                                            score_best_matches)

from matchms.typing import SpectrumType
from matchms.similarity import CosineGreedy as OriginalCosineGreedy
from cudams.processor import CudaCosineGreedy, CpuCosineGreedy
from collections import defaultdict
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from tqdm import tqdm
from matchms.filtering import normalize_intensities, select_by_mz, select_by_relative_intensity, reduce_to_number_of_peaks, \
    require_minimum_number_of_peaks
from cudams.utils import mute_stdout

# Peak arrays of every reference and query spectrum.
refs = [r.peaks.to_numpy for r in references]
ques = [q.peaks.to_numpy for q in queries]

# (start, end) index ranges that split each list into BATCH_SIZE-sized batches.
rlims = argbatch(refs, BATCH_SIZE)
qlims = argbatch(ques, BATCH_SIZE)
R = len(references)
Q = len(queries)

batches_rq = list(product(rlims, qlims))

# One (score, matches) slot per reference/query pair. Pairs for which
# `similarity` returns None keep the initial 0.
# The array covers all R x Q pairs and every batch writes into its own slice.
# Previously a fresh BATCH_SIZE x BATCH_SIZE buffer was allocated per batch and
# only the last one was copied into C, losing every earlier batch.
C = np.zeros((R, Q, 2), dtype='float32')

for (rstart, rend), (qstart, qend) in tqdm(batches_rq, total=len(batches_rq)):
    rspec = refs[rstart:rend]
    qspec = ques[qstart:qend]
    # Basic slicing returns a view, so writes to `out_true` land directly in C.
    out_true = C[rstart:rend, qstart:qend]
    for (i, spec1), (j, spec2) in product(enumerate(rspec), enumerate(qspec)):
        score = similarity(
            spec1,
            spec2,
            tolerance=tolerance,
            shift=shift,
            mz_power=mz_power,
            int_power=int_power,
        )
        if score is not None:
            out_true[i, j, 0] = score[0]
            out_true[i, j, 1] = score[1]

# GPU implementation, configured from the same constants as the CPU paths so
# the three results stay comparable. The powers and shift were previously
# hard-coded here to the same values.
cosine = CudaCosineGreedy(
    tolerance=tolerance,
    mz_power=mz_power,
    intensity_power=int_power,
    shift=shift,
    batch_size=BATCH_SIZE,
    match_limit=MATCH_LIMIT,
)
cosine.compile()

# G: GPU results, indexed below as [reference, query, channel].
# Ov: per-pair overflow indicator, summarised in the report that follows.
G, Ov = cosine.matrix(
    references=references,
    queries=queries,
    array_type="numpy",
)
# Restrict every comparison to the region covered by the matchms result.
R, Q, _ = Cy.shape

# Channel 1 of both arrays is compared first, as the headline check.
a = Cy[:R, :Q, 1]
b = G[:R, :Q, 1]
C_match = np.isclose(a, b)
corr = C_match.mean()
print("MOST IMPORTANT ====> ", corr, (~C_match).sum())

overflowed = Ov[:R, :Q]
print("Overflows", overflowed.mean(), overflowed.sum())

# Element-wise agreement over both channels for each pair of implementations:
# fraction of matching entries followed by the number of mismatching entries.
for label, left, right in (
    ("CPU orig vs GPU", Cy, G),
    ("CPU opt vs GPU:", C, G),
    ("CPU orig vs CPU opt", Cy, C),
):
    C_match = np.isclose(left[:R, :Q], right[:R, :Q])
    corr = C_match.mean()
    print(label, corr, (~C_match).sum())

# Distribution of the channel-1 difference (GPU minus matchms CPU).
pd.Series((b - a).ravel()).describe()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
Batch all references: 1it [00:00, 72.34it/s]
Batch all queries: 1it [00:00, 73.91it/s]
100%|██████████| 1/1 [00:02<00:00,  2.18s/it]

MOST IMPORTANT ====>  1.0 0
Overflows 0.0 0
CPU orig vs GPU 1.0 0
CPU opt vs GPU: 0.9993362426757812 348
CPU orig vs CPU opt 0.9993362426757812 348





count    262144.0
mean          0.0
std           0.0
min           0.0
25%           0.0
50%           0.0
75%           0.0
max           0.0
dtype: float64

In [10]:

# Same agreement report as the previous cell, without the final summary table.
a = Cy[:R, :Q, 1]
b = G[:R, :Q, 1]
C_match = np.isclose(a, b)
corr = C_match.mean()
print("MOST IMPORTANT ====> ", corr, (~C_match).sum())

overflowed = Ov[:R, :Q]
print("Overflows", overflowed.mean(), overflowed.sum())

# Fraction of matching entries and number of mismatching entries for each pair
# of implementations, over both channels.
for label, left, right in (
    ("CPU orig vs GPU", Cy, G),
    ("CPU opt vs GPU:", C, G),
    ("CPU orig vs CPU opt", Cy, C),
):
    C_match = np.isclose(left[:R, :Q], right[:R, :Q])
    corr = C_match.mean()
    print(label, corr, (~C_match).sum())

MOST IMPORTANT ====>  0.9996243990384616 5
Overflows 0.0 0
CPU orig vs GPU 0.9996243990384616 10
CPU opt vs GPU: 0.9998497596153846 4
CPU orig vs CPU opt 0.9997746394230769 6


In [8]:
# Find the (reference, query) index pairs whose channel-0 value differs between
# the matchms CPU result and the GPU result.
C_match = np.isclose(Cy[:R, :Q], G[:R, :Q])
channel0_mismatch = ~C_match[..., 0]
a, b = np.nonzero(channel0_mismatch)

In [9]:
# Show the reference indices (a) and query indices (b) of the mismatching pairs.
# To inspect matchms CPU vs. cudams CPU instead, recompute them from
# np.isclose(Cy[:R,:Q], C[:R,:Q]) before running this cell.
(a, b)

(array([ 6, 58, 58, 63]), array([58,  6, 63, 58]))

In [13]:
# For every mismatching (reference, query) pair, redo the tolerance-window scan
# by hand and report:
#   * the smallest distance between any visited query peak and a window edge
#     (presumably to see how close a peak sits to the tolerance boundary), and
#   * the number of candidate peak pairs inside the window.
# The early `break` and the advancing `lowest_idx` assume both m/z arrays are
# sorted in ascending order -- TODO confirm.
for i in range(len(a)):
    # Column 0 of the peak array holds the m/z values.
    r = references[a[i]].peaks.to_numpy[:, 0]
    q = queries[b[i]].peaks.to_numpy[:, 0]
    pairs = []
    diffs = []
    lowest_idx = 0
    for peak1_idx in tqdm(range(len(r))):
        mz = r[peak1_idx]
        # Use the configured tolerance instead of a hard-coded 0.1 so this
        # check stays in sync with the scoring runs.
        lo = mz - tolerance
        hi = mz + tolerance
        for peak2_idx in range(lowest_idx, len(q)):
            mz2 = q[peak2_idx]
            # abs() is symmetric, so one entry per window edge is enough.
            diffs.append(abs(mz2 - hi))
            diffs.append(abs(mz2 - lo))
            if mz2 > hi:
                # Past the window; later query peaks are further away still.
                break
            if mz2 < lo:
                # Below the window: the next reference peak can start scanning
                # after this query peak.
                lowest_idx = peak2_idx + 1
            else:
                pairs.append([peak1_idx, peak2_idx])

    # `default` keeps an empty spectrum from raising ValueError in min().
    print(min(diffs, default=float("nan")), len(pairs))

100%|██████████| 500/500 [00:00<00:00, 436542.88it/s]


0.0010010000000306718 78


100%|██████████| 185/185 [00:00<00:00, 207805.63it/s]


0.0010010000000306718 78


100%|██████████| 185/185 [00:00<00:00, 174213.35it/s]


0.004694000000029064 61


100%|██████████| 415/415 [00:00<00:00, 324261.58it/s]

0.004694000000029064 61



