In [1]:
# Restrict this kernel to a single physical GPU.
# NOTE(review): index 6 is specific to the original host — adjust or remove elsewhere.
# Presumably this has to run before CuPy first touches CUDA to take effect; confirm.
%env CUDA_VISIBLE_DEVICES=6

In [2]:
import numpy as np
import cupy as cp

In [3]:
# Array module used by the comparison functions in this notebook (CuPy here).
xp = cp

# Shape of the synthetic input volume: `data` has shape (X, Y, Z).
X = 1500
Y = 1500
Z = 20
# Neighbourhood radius: comparisons cover offsets from -K to +K along the first two axes.
K = 3

# Seeded generator so that every run of the notebook benchmarks the same data.
SEED = 0
rng = np.random.default_rng(SEED)
# Draw float32 samples directly instead of creating float64 values and casting them.
data = rng.random((X, Y, Z), dtype=np.float32)

# NumPy-API implementation (runs on CuPy here because `xp = cp`)

In [4]:
def vectors_comparison(vector1, vector2):
    """Sum of squared element-wise products along axis 2.

    Both inputs are multiplied element-wise (with broadcasting), the product
    is squared, and the result is summed over axis 2, so the inputs need at
    least three dimensions. The array module is the module-level ``xp``.
    NaN entries are not skipped: they propagate into the sum.
    """
    squared_products = xp.power(xp.multiply(vector1, vector2), 2)
    return xp.sum(squared_products, axis = 2)

def vectors_comparison1(matrix1, matrix2):
    """Operator-based variant of ``vectors_comparison``.

    Multiplies the inputs element-wise, squares the product with ``xp.power``
    and sums over axis 2.
    """
    elementwise = matrix1 * matrix2
    squared = xp.power(elementwise, 2)
    return squared.sum(axis=2)

def compare_matrices(a_pad, data, xk, yk, K, comp_function):
    """Compare ``data`` with one shifted window of ``a_pad`` and derive the mirrored result.

    Args:
        a_pad: Padded array the shifted window is sliced from. Assumes the
            layout built in ``get_comparison`` (K padded columns on each side
            of axis 1, padding only at the end of axis 0) — confirm for other callers.
        data: Unpadded array; its first two axis lengths set the window size.
        xk: Non-negative offset along axis 0 (also used as a pad width).
        yk: Offset along axis 1; may be negative.
        K: Padding width along axis 1.
        comp_function: Callable ``(data, window) -> 2-D array``.

    Returns:
        tuple: ``(comparison, res)``. ``comparison`` is the output of
        ``comp_function``. ``res`` is ``comparison`` moved by ``(xk, yk)``,
        with NaN where no source element exists.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]

    # Window of the padded array displaced by (xk, yk) relative to `data`.
    col_start = K + yk
    window = a_pad[xk:xk + n_rows, col_start:col_start + n_cols, :]
    comparison = comp_function(data, window)

    # Pad with NaN and re-slice so that res[i, j] == comparison[i - xk, j - yk].
    padded = xp.pad(comparison, ((xk, 0), (K, K)), constant_values=xp.nan)
    mirror_start = K - yk
    res = padded[0:n_rows, mirror_start:mirror_start + n_cols]
    return comparison, res

def get_comparison(data, X, Y, Z, K, comp_function):
    """Compare ``data`` with its shifted copies for all offsets within radius ``K``.

    Only offsets whose first component is non-negative are evaluated. Each
    ``compare_matrices`` call returns two arrays, the second standing in for
    the opposite offset (equivalent only if ``comp_function`` is symmetric in
    its arguments). The list therefore holds ``(2*K + 1)**2 - 1`` arrays.

    Args:
        data: 3-D array handled by the module-level ``xp``.
        X, Y, Z: Not used by the function; kept for existing call sites.
        K: Largest offset (inclusive) along each of the first two axes.
        comp_function: Callable forwarded to ``compare_matrices``.

    Returns:
        list: The arrays returned by ``compare_matrices``, in iteration order.
    """
    # Offsets run over 0..K inclusive; the same K + 1 is used as the padding width.
    reach = K + 1
    a_pad = xp.pad(data, ((0, reach), (reach, reach), (0, 0)), constant_values=xp.nan)

    result = []
    for xk in range(reach):
        for yk in range(reach):
            if xk == 0 and yk == 0:
                # Zero offset would compare the data with itself.
                continue
            result += compare_matrices(a_pad, data, xk, yk, reach, comp_function)
            if xk != 0 and yk != 0:
                # Off-axis offsets also need the version with the axis-1 shift negated.
                result += compare_matrices(a_pad, data, xk, -yk, reach, comp_function)

    # Wait for the current CuPy stream before returning.
    stream = cp.cuda.stream.get_current_stream()
    stream.synchronize()
    return result

In [5]:
%%time
# Convert the NumPy array `data` into a CuPy array for the comparison functions.
data_n = cp.asarray(data)

CPU times: user 94.1 ms, sys: 281 ms, total: 375 ms
Wall time: 432 ms


In [6]:
%%time
# First call. NOTE(review): the recorded wall time is several times that of the
# repeats that follow — presumably one-time CuPy kernel compilation; confirm.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 408 ms, sys: 40.5 ms, total: 449 ms
Wall time: 452 ms


In [7]:
%%time
# Second call with identical arguments, to time a repeat run.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 64.6 ms, sys: 26.1 ms, total: 90.7 ms
Wall time: 89.8 ms


In [8]:
%%time
# Third call with identical arguments, to check that the repeat timing is stable.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 45.7 ms, sys: 30.7 ms, total: 76.4 ms
Wall time: 75.4 ms


In [9]:
%%timeit
# Averaged measurement of the same call. get_comparison synchronises the current
# stream before returning, so each timed run waits for that stream.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

74.9 ms ± 76.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Compare with: direct computation over the full (2K+1)×(2K+1) window

In [10]:
%%time
# Side length of the full window for radius K, i.e. offsets -K..K.
kernel_size = K*2 + 1
# Half-width of the window; equals K because kernel_size is 2*K + 1.
k = kernel_size // 2

# CuPy copy of `data` read by run().
# NOTE(review): this repeats the conversion already done earlier in the notebook.
data_n = cp.asarray(data)
# Lengths of the first two axes of `data`; run() uses them as the extent of each shifted window.
i_range, x_range = data.shape[:2]

# Unused alternative comparison, kept commented out (same body as vectors_comparison1):
#def function(matrix1, matrix2):
#    return (xp.power(matrix1 * matrix2, 2)).sum(axis=2)

CPU times: user 36.8 ms, sys: 1.15 ms, total: 37.9 ms
Wall time: 36.4 ms


In [11]:
def run():
    """Direct version: compare ``data_n`` with every shifted window except the centre.

    Reads the module-level ``data_n``, ``k``, ``kernel_size``, ``i_range`` and
    ``x_range``. The data is NaN-padded by ``k`` on both sides of the first two
    axes, and ``vectors_comparison`` is applied once per window position.

    Returns:
        list: ``kernel_size**2 - 1`` arrays, ordered by ``(i, j)`` with ``j``
        varying fastest.
    """
    padded_data = xp.pad(data_n, ((k, k), (k, k), (0, 0)), constant_values=xp.nan)

    res5 = [
        vectors_comparison(data_n, padded_data[i:i + i_range, j:j + x_range])
        for i in range(kernel_size)
        for j in range(kernel_size)
        # Skip the centre position, i.e. zero offset.
        if not (i == k and j == k)
    ]

    # Wait for the current CuPy stream before returning.
    cp.cuda.stream.get_current_stream().synchronize()
    return res5

In [12]:
%%time
# First timed call of the direct full-window version.
res5 = run()

CPU times: user 92 ms, sys: 56.2 ms, total: 148 ms
Wall time: 148 ms


In [13]:
%%time
# Second call, to time a repeat run.
res5 = run()

CPU times: user 75 ms, sys: 74.7 ms, total: 150 ms
Wall time: 147 ms


In [14]:
%%time
# Third call, to check that the timing is stable.
res5 = run()

CPU times: user 96 ms, sys: 52.1 ms, total: 148 ms
Wall time: 147 ms


In [15]:
%%timeit
# Averaged measurement of run(), for comparison with the get_comparison timing.
res5 = run()

148 ms ± 220 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
