In [1]:
%load_ext autoreload
%autoreload 2
%pwd

'/home/tornikeo/Documents/work/scalexa/pangeaai/optimize-cosine'

In [2]:
from cudams.utils import argbatch, mkdir, get_ref_spectra_from_df
import math
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from numba import cuda
from itertools import product
from time import perf_counter
from multiprocessing.pool import ThreadPool
from multiprocessing import shared_memory
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib

assert cuda.is_available()

In [4]:
## Define constants
tolerance: float = 0.1
shift: float = 0
mz_power: float = 0
int_power: float = 1

## How many pairs per batch. Has to be a power of 2.
# Hardware specific - An RTX2070 works best at around 1024 * 2
# But Colab T4 GPU might work best at 1024 * 4
BATCH_SIZE = 2048

# MAX NUMBER OF PEAKS during filtering. Due to nature of matrices, having large number of 
# peaks will increase memory requirements. After 1024, this has diminishing benefits, as 
# smaller and smaller (likely noisy) peaks are taken into consideration when running similarity.
MAX_PEAKS = 1024

# MATCH_LIMIT specifies max how many mz-mz pairs we could consider for each RQ pair, before we sort and filter. 
# E.g. a value of 256 usually causes around ~0.003% of RQ pairs to "overflow".
# The overflown RQ scores will be strictly less than or equal to perfectly accurate score.
# The mean absolute difference at 256, for all overflown pairs is on the order of ~1e-3
# Small values of MATCH_LIMIT (e.g. 128, 64,) cause a dramatic speedup in the processing speed.
MATCH_LIMIT = 1024

# Since Greedy cosine is an unstable algorithm, because approximate mz-mz values do not
# result in approximately the same scores and number of matches.
# So we need to use fp64 to minimize the deviation as much as possible.
# Using float32 causes a significant speedup in the processing speed.
dtype = 'float32'

# Data path
reference_csv_file = Path("data/input/example_dataset_tornike.csv")
query_csv_file = Path("data/input/example_dataset_tornike.csv")

In [5]:
from cudams.utils import get_spectra_batches

len_spectra = 1024
references, queries, batches_inputs = get_spectra_batches(
    batch_size=BATCH_SIZE,
    max_peaks=MAX_PEAKS,
    max_pairs=16 * (BATCH_SIZE ** 2), # 16 batches, give or take...
    padding=None,
)
TOTAL_BATCHES = len(batches_inputs)
batch_outputs = np.empty(shape=(TOTAL_BATCHES,4),dtype=object)

100%|██████████| 16384/16384 [00:05<00:00, 2982.35it/s]


In [7]:
import cupy as cp

In [8]:
from cudams.similarity.kernels import cosine_greedy_kernel

kernel = compile_cuda_cosine_greedy_kernel(
    tolerance=tolerance,
    shift=shift,
    mz_power=mz_power,
    int_power=int_power,
    match_limit=MATCH_LIMIT,
    batch_size=BATCH_SIZE
)

R, Q = BATCH_SIZE, BATCH_SIZE
THREADS_PER_BLOCK = (32, 32)
BLOCKS_PER_GRID_X = math.ceil(R / THREADS_PER_BLOCK[0])
BLOCKS_PER_GRID_Y = math.ceil(Q / THREADS_PER_BLOCK[1])
BLOCKS_PER_GRID = (BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y)

for batch_i in tqdm(range(TOTAL_BATCHES)):
        # We get our batch and lengths (lengths are different for different spectra)
    (rspec, rlen, rstart, rend), (qspec, qlen, qstart, qend) = batches_inputs[
        batch_i
    ]
    
    lens = cp.zeros((2, BATCH_SIZE), "int32")
    lens[0, :len(rlen)] = rlen
    lens[1, :len(qlen)] = qlen
    
    
    # We order empty space for results on GPU RAM
    # scores = cp.zeros(
    #     (BATCH_SIZE, BATCH_SIZE), dtype="float32"
    # )
    # used_matches = cp.zeros(
    #     (BATCH_SIZE, BATCH_SIZE), dtype="int32"
    # )
    # overflow = cp.zeros(
    #     (BATCH_SIZE, BATCH_SIZE), dtype="uint8"
    # )
    out = cp.zeros(
        (BATCH_SIZE, BATCH_SIZE, 2), dtype=dtype
    )
    overflow = cp.zeros(
        (BATCH_SIZE, BATCH_SIZE, 1), dtype="uint8"
    )

    # rmz = cp.asarray(rspec[0])
    # rint = cp.asarray(rspec[1])
    
    # qmz = cp.asarray(qspec[0])
    # qint = cp.asarray(qspec[1])
    
    # rlen = cp.asarray(lens[0])
    # qlen = cp.asarray(lens[1])
    
    # rnorm = ((rmz ** mz_power) * (rint ** int_power)).sum()
    
    kernel[BLOCKS_PER_GRID, THREADS_PER_BLOCK](
            rspec,
            qspec,            
            lens,
            out,
            overflow,
    )
    out.get()
    
    # break
# plt.imshow(scores.get())

  0%|          | 0/16 [00:00<?, ?it/s]


ValueError: non-scalar numpy.ndarray cannot be used for fill

In [10]:
len(rlen)

2048