In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from joblib import Parallel, delayed
import os
from multiprocessing import shared_memory
from multiprocessing.pool import ThreadPool
from tqdm import tqdm
import numba
from typing import Tuple, List, Optional
from matchms import Spectrum
from matchms.typing import SpectrumType
import numpy as np
import pandas as pd
from pathlib import Path
import json
from numba import cuda
from numba.cuda.cudadrv.devicearray import DeviceNDArray
from numba import types
import math
import warnings
from numba.core.errors import NumbaPerformanceWarning
import time
from time import perf_counter
from itertools import product
import matplotlib.pyplot as plt
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

# Fail fast when no CUDA device is usable: the kernels in this notebook need a GPU.
assert cuda.is_available()
# Project helper; presumably silences NumbaPerformanceWarning for the small
# launches used here -- TODO confirm against cudams.utils.
from cudams.utils import ignore_performance_warnings
ignore_performance_warnings()

In [3]:
import numpy as np
from numba import cuda, int32

# log2 of the CUDA block size; the block scan kernel runs exactly this many passes.
BSP2 = 9
# Threads per block (512); also the length of the shared scan buffer.
BLOCK_SIZE = 1 << BSP2

# CUDA kernel: inclusive prefix sum of every BLOCK_SIZE-wide slice of the input
@cuda.jit('void(int32[:], int32[:], int32[:], int32, int32)')
def prefix_sum_nzmask_block(a, b, s, nzm, length):
    """Scan one block-sized slice of ``a`` into ``b`` and store the slice total in ``s``.

    a      -- input values (int32)
    b      -- output; receives the inclusive prefix sum computed within each block
    s      -- output; ``s[blockIdx.x]`` receives the value held by the block's last thread
    nzm    -- 1 scans the mask ``a != 0``; any other value scans ``a`` itself
    length -- number of valid elements in ``a`` / ``b``

    The loop performs BSP2 doubling passes over a BLOCK_SIZE-long shared buffer,
    so the launch is expected to use BLOCK_SIZE threads per block.
    """
    lane = cuda.threadIdx.x
    gid = cuda.blockIdx.x * cuda.blockDim.x + lane

    scratch = cuda.shared.array(shape=(BLOCK_SIZE), dtype=int32)

    # Slots that map past the end of the input stay 0 and do not affect the sums.
    scratch[lane] = 0
    if gid < length:
        if nzm == 1:
            scratch[lane] = int32(a[gid] != 0)  # scan the non-zero mask
        else:
            scratch[lane] = int32(a[gid])  # scan the raw values

    for step in range(0, BSP2):
        offset = 2**step
        takes_part = offset <= lane
        # Read phase: wait until the previous pass has been fully written.
        cuda.syncthreads()
        if takes_part:
            acc = scratch[lane] + scratch[lane - offset]
        # Write phase: wait until every thread has read its operands.
        cuda.syncthreads()
        if takes_part:
            scratch[lane] = acc

    if gid < length:
        b[gid] = scratch[lane]  # publish the per-block scan

    if lane == cuda.blockDim.x - 1:
        # The last thread holds the total of the whole slice.
        s[cuda.blockIdx.x] = scratch[lane]

# CUDA kernel: add the accumulated totals of the preceding blocks to each later block
@cuda.jit('void(int32[:], int32[:], int32)')
def pref_sum_update(b, s, length):
    """Add ``s[k]`` to every element of block-sized slice ``k + 1`` of ``b``.

    b      -- per-block prefix sums, updated in place
    s      -- value to add for each launched block (indexed by ``blockIdx.x``)
    length -- number of valid elements in ``b``
    """
    # Launched block k patches slice k + 1; slice 0 needs no correction.
    gid = (cuda.blockIdx.x + 1) * cuda.blockDim.x + cuda.threadIdx.x
    if gid >= length:
        return
    b[gid] += s[cuda.blockIdx.x]


# CUDA kernel: scatter every non-zero input to its compacted output position
@cuda.jit('void(int32[:], int32[:], int32[:], int32)')
def map_non_zeros(a, prefix_sum, nz, length):
    """Copy the non-zero entries of ``a`` into ``nz`` using ``prefix_sum`` as the index map.

    a          -- input values (int32)
    prefix_sum -- for each position, the 1-based output slot of a non-zero entry
    nz         -- output array receiving only the non-zero values
    length     -- number of valid elements in ``a``
    """
    gid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if gid >= length:
        return

    value = a[gid]
    if value != 0:
        # prefix_sum is 1-based for non-zero entries, hence the -1.
        nz[prefix_sum[gid] - 1] = value



# Inclusive prefix sum over a whole device array (multi-block scan)
def pref_sum(a, asum, nzm):
    """Write the inclusive prefix sum of ``a`` (or of its non-zero mask) into ``asum``.

    a    -- int32 device array to scan
    asum -- int32 device array of the same length; receives the result
    nzm  -- 1 to scan the mask ``a != 0``, any other value to scan ``a`` itself

    Each block is scanned independently; when more than one block is needed the
    per-block totals are scanned recursively and added back onto the later blocks.
    An empty input is a no-op.
    """
    length = a.shape[0]
    if length == 0:
        # Nothing to scan; this also avoids launching a kernel with an empty grid.
        return

    block = BLOCK_SIZE
    # Integer ceil-division: number of blocks needed to cover the input.
    grid = (length + block - 1) // block

    # Auxiliary array holding the total of each block.
    bs = cuda.device_array(shape=(grid), dtype=np.int32)

    # Scan each block on its own and record the block totals in ``bs``.
    prefix_sum_nzmask_block[grid, block](a, asum, bs, nzm, length)
    if grid > 1:
        # Scan the block totals themselves (raw values, hence nzm=0) ...
        bssum = cuda.device_array(shape=(grid), dtype=np.int32)
        pref_sum(bs, bssum, 0)
        # ... and add the running totals onto every block after the first.
        pref_sum_update[grid - 1, block](asum, bssum, length)

def get_non_zeros(a):
    """Return the non-zero entries of ``a`` in their original order, compacted on the GPU.

    a -- 1-D host array; the kernels are compiled for int32 data.

    Returns a host ``np.int32`` array containing only the non-zero values.
    An empty input, or an input without non-zero values, yields an empty array.
    """
    length = a.shape[0]
    if length == 0:
        # Nothing to compact; skip the device round trip entirely.
        return np.empty(0, dtype=np.int32)

    # Copy the input from host to device.
    ad = cuda.to_device(a)

    # Output buffer for the prefix sum of the non-zero mask.
    bd = cuda.device_array_like(ad)
    pref_sum(ad, bd, int(1))

    # The last element of the inclusive prefix sum is the number of non-zero entries.
    non_zero_count = int(bd[length - 1])
    if non_zero_count == 0:
        return np.empty(0, dtype=np.int32)

    # Device output array holding ONLY the non-zero entries.
    non_zeros = cuda.device_array(shape=(non_zero_count), dtype=np.int32)

    block = BLOCK_SIZE
    # Integer ceil-division: number of blocks needed to cover the input.
    grid = (length + block - 1) // block
    map_non_zeros[grid, block](ad, bd, non_zeros, length)

    # Return the compacted values to the host.
    return non_zeros.copy_to_host()

# arr = np.zeros(50000000, dtype=np.int32)
# for i in range(32,65000, 1024):
#     arr[i] = i
# nz = get_non_zeros(arr)
# print(arr)

In [4]:
# 1-D ramp 0..1023 as float32.
# (2-D variant: np.arange(1024 * 1024).reshape(1024, 1024).astype('float32'))
arr = np.arange(1024).astype(np.float32)
print(arr)

[0.000e+00 1.000e+00 2.000e+00 ... 1.021e+03 1.022e+03 1.023e+03]


In [35]:
batch_size = 2048
arr = np.random.uniform(size=(batch_size, batch_size)).astype('float32')
threads_per_block = 32,32
# Ceil-divide each array extent by the matching block extent so the grid covers
# the whole array. The grid is derived from arr.shape, not from the array values.
blocks_per_grid = tuple(
    (extent + tpb - 1) // tpb
    for extent, tpb in zip(arr.shape, threads_per_block)
)

In [41]:
# Host-side reference: row/column indices and values of all entries above 0.7.
r, q = (arr > 0.7).nonzero()
v = arr[r, q]

In [11]:
# Input for the reduction demo: the integers 0..1023 as int32.
arr = np.arange(1024).astype(np.int32)
print(arr)


# 1-D launch geometry; the block count is the ceiling of elements / threads.
# (A 2-D variant would compute one such count per axis.)
THREADS_PER_BLOCK = 8
BLOCKS_PER_GRID = -(-arr.shape[0] // THREADS_PER_BLOCK)
# Element count; also used as the shared-buffer length inside array_sum.
nelem = arr.shape[0]

@cuda.jit
def array_sum(data):
    """Tree-reduce each block's slice of ``data`` in shared memory, in place.

    data -- 1-D device array; it is overwritten with the contents of the
            shared buffer after the reduction.

    After the loop the first thread's slot holds the sum of the block's slice,
    so ``data[first index of the block]`` ends up holding that sum. All other
    positions receive intermediate partial sums, because every thread writes
    its slot back (the ``tid == 0`` restriction is intentionally left disabled).

    Every thread of the block reaches every barrier, including threads that
    have no element to load; those contribute 0 to the reduction.
    """
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    size = len(data)

    # Shared buffer; its length is the notebook-level ``nelem`` captured at compile time.
    shr = cuda.shared.array(nelem, int32)

    # A thread owns a shared slot only if it fits in the buffer, and it owns an
    # input element only if its global index is inside ``data``.
    in_buffer = tid < nelem
    has_item = in_buffer and i < size

    if has_item:
        shr[tid] = data[i]
    elif in_buffer:
        # Pad so partner reads during the reduction see a neutral value.
        shr[tid] = 0

    # Ensure writes to shared memory are visible to all threads before
    # reducing. The barrier is outside any data-dependent branch.
    cuda.syncthreads()

    s = 1
    while s < cuda.blockDim.x:
        partner = tid + s
        # Stride by `s` and add, but never read past the block or the buffer.
        if tid % (2 * s) == 0 and partner < cuda.blockDim.x and partner < nelem:
            shr[tid] += shr[partner]
        s *= 2
        cuda.syncthreads()

    # After the loop, the zeroth element contains the sum
    # if tid == 0:
    if has_item:
        data[i] = shr[tid]

# Stage the input on the GPU. ``out_cu`` is allocated but is not passed to the kernel.
arr_cu = cuda.to_device(arr)
out_cu = cuda.device_array_like(arr)

# One block of ``nelem`` threads; array_sum overwrites ``arr_cu`` in place.
array_sum[1, nelem](arr_cu)

# Slot 0 of the reduced buffer should equal the host-side sum.
out = arr_cu.copy_to_host()
out[0], arr.sum()

[   0    1    2 ... 1021 1022 1023]


(523776, 523776)

In [10]:
out

array([8128,    1,    5, ..., 1021, 2045, 1023], dtype=int32)