In [1]:
# Expose only GPU index 5 to this kernel (machine-specific; adjust to an available device).
# NOTE(review): this should run before any CUDA context is created for it to take effect.
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_VISIBLE_DEVICES=5


In [2]:
import numpy as np
import cupy as cp

In [3]:
# Array backend used by the compute cells. cp (CuPy) selects the GPU;
# presumably np can be substituted here for a CPU run.
xp = cp

# Shape of the synthetic volume: an X-by-Y grid holding one length-Z vector per position.
X = 1500
Y = 1500
Z = 20
# Neighbourhood half-width: the window used later is (2*K + 1) cells wide.
K = 3
# Fixed seed so a fresh "Restart & Run All" always benchmarks the same input.
SEED = 42
# Uniform [0, 1) samples generated directly as float32, avoiding a float64
# temporary that would then have to be cast down.
data = np.random.default_rng(SEED).random((X, Y, Z), dtype=np.float32)

# Compare every vector with its neighbours (CuPy backend)

In [4]:
%%time
# Side length of the square neighbourhood window: K cells on each side of the centre.
kernel_size = K*2 + 1
# Half-width of the window (equals K); used as the pad width and centre index in run().
k = kernel_size // 2

# Transfer the host array to the backend selected by xp, rather than always to
# CuPy, so that the xp switch governs the whole pipeline.
data_n = xp.asarray(data)
# Extents of the first two axes; run() uses them to size each shifted view.
i_range, x_range = data.shape[:2]

CPU times: user 103 ms, sys: 332 ms, total: 435 ms
Wall time: 4.44 s


In [5]:
def vectors_comparison(vector1, vector2):
    """Return the sum of squared element-wise products along axis 2.

    Both arguments are multiplied element-wise with the active array
    module ``xp``, each product is squared, and the squares are summed
    over axis 2, so the result has that axis removed. NaN entries in
    either input propagate to the corresponding output element.

    Parameters
    ----------
    vector1, vector2 : array-like accepted by ``xp.multiply``
        Arrays with at least three dimensions and broadcast-compatible shapes.

    Returns
    -------
    Array of the active backend holding the reduced values.
    """
    squared_products = xp.power(xp.multiply(vector1, vector2), 2)
    return xp.sum(squared_products, axis=2)

In [6]:
def run():
    """Compare every vector of ``data_n`` with each neighbour in a square window.

    Reads the notebook-level names ``data_n``, ``k``, ``kernel_size``,
    ``i_range`` and ``x_range``. The first two axes of ``data_n`` are padded
    with NaN by ``k`` cells on each side; for every window offset except the
    centre one, a view of the padded array shifted by that offset is passed to
    ``vectors_comparison`` together with ``data_n``.

    Returns
    -------
    list
        ``kernel_size**2 - 1`` arrays, one per non-centre offset, in row-major
        offset order. Positions whose neighbour falls in the padding are NaN.
    """
    # NaN padding lets every shifted view keep the shape of data_n while
    # marking neighbours that lie outside the original grid.
    padded_data = xp.pad(data_n, ((k, k), (k, k), (0, 0)), constant_values=xp.nan)

    res5 = []
    for i in range(kernel_size):
        for j in range(kernel_size):
            if i == j == k:
                # Centre offset: the vector would be compared with itself.
                continue

            shifted_data = padded_data[i:i+i_range, j:j+x_range]
            res5.append(vectors_comparison(data_n, shifted_data))

    if xp is cp:
        # Block until the current CUDA stream has finished so that callers
        # timing run() measure the GPU work itself. Skipped for other
        # backends, which have no stream to wait on.
        cp.cuda.stream.get_current_stream().synchronize()
    return res5

In [7]:
%%time
# First call: the recorded wall time (2.19 s) is far above the repeats below,
# presumably one-off CuPy kernel compilation / warm-up — TODO confirm.
res5 = run()

CPU times: user 547 ms, sys: 54.2 ms, total: 601 ms
Wall time: 2.19 s


In [8]:
%%time
# Repeat call: measures run() again after the slower first invocation.
res5 = run()

CPU times: user 62 ms, sys: 88.3 ms, total: 150 ms
Wall time: 149 ms


In [9]:
%%time
# Second repeat, to check that the timing of the previous cell is stable.
res5 = run()

CPU times: user 95.9 ms, sys: 51.1 ms, total: 147 ms
Wall time: 146 ms


In [10]:
%%timeit
# Averaged timing over many loops; run() synchronizes the CUDA stream before
# returning, so each loop includes the GPU work.
res5 = run()

146 ms ± 261 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
