In [1]:
# Expose only GPU index 5 to this kernel. This runs before the cupy import in the next
# cell; the index is machine-specific, so adjust it for the host running the notebook.
%env CUDA_VISIBLE_DEVICES=5

In [2]:
import numpy as np
import cupy as cp

In [3]:
# Array module that the comparison functions below call through.
xp = cp

# Shape of the benchmark volume: X x Y grid cells, each holding a vector of length Z.
X = 1500
Y = 1500
Z = 20
# Largest offset (in cells) compared along each of the first two axes.
K = 3
# Fixed seed so every run of the notebook benchmarks the same input.
SEED = 0
# Draw float32 directly instead of building a float64 array and casting it afterwards.
data = np.random.default_rng(SEED).random((X, Y, Z), dtype=np.float32)

# Comparison functions (they run on the array module bound to `xp`: NumPy or CuPy)

In [4]:
def vectors_comparison(vector1, vector2):
    """Return the sum over axis 2 of the squared element-wise product.

    Both inputs are multiplied element-wise, every product is squared, and the
    squares are summed along axis 2, so axis 2 is removed from the result.
    All array operations go through the module-level ``xp`` array module.
    """
    squared_products = xp.power(xp.multiply(vector1, vector2), 2)
    return xp.sum(squared_products, axis=2)

def vectors_comparison1(matrix1, matrix2):
    """Operator-based variant: sum over axis 2 of the squared element-wise product.

    The product uses the ``*`` operator and the reduction uses the array's own
    ``sum`` method; only the squaring goes through the module-level ``xp``.
    """
    elementwise = matrix1 * matrix2
    squared = xp.power(elementwise, 2)
    return squared.sum(axis=2)

def compare_matrices(a_pad, data, xk, yk, K, comp_function):
    """Compare ``data`` with the window of ``a_pad`` displaced by ``(xk, yk)``.

    Parameters
    ----------
    a_pad : array
        Padded array the displaced window is cut from. The column slice starts
        at ``K + yk``, so it is presumably ``data`` padded by ``K`` columns on
        each side (as ``get_comparison`` builds it) -- verify for other callers.
    data : array
        Reference array; its first two dimensions set the window size.
    xk, yk : int
        Row and column displacement. ``xk`` is also used as a pad width and
        therefore has to be non-negative.
    K : int
        Column padding width used for both ``a_pad`` and the shifted result.
    comp_function : callable
        Called as ``comp_function(data, window)``; its result is padded as a
        2-D array.

    Returns
    -------
    tuple
        ``(comparison, res)``: the raw output of ``comp_function`` and the same
        values moved by ``xk`` rows and ``yk`` columns, with NaN filling the
        cells that have no source value.
    """
    n_rows = data.shape[0]
    n_cols = data.shape[1]

    # Window of the padded array displaced by (xk, yk) relative to data.
    first_col = K + yk
    window = a_pad[xk:xk + n_rows, first_col:first_col + n_cols, :]
    comparison = comp_function(data, window)

    # Pad with NaN, then cut a window moved the opposite way so each value
    # lands on the other cell of the compared pair.
    padded = xp.pad(comparison, ((xk, 0), (K, K)), constant_values=xp.nan)
    back_col = K - yk
    res = padded[0:n_rows, back_col:back_col + n_cols]
    return comparison, res

def get_comparison(data, X, Y, Z, K, comp_function):
    """Compare ``data`` with displaced copies of itself for all offsets up to ``K``.

    ``data`` is padded with NaN (``K + 1`` rows at the end of axis 0 and
    ``K + 1`` columns on both sides of axis 1). ``compare_matrices`` is then
    called for every offset ``(xk, yk)`` with ``0 <= xk, yk <= K`` except
    ``(0, 0)``, and also for the mirrored column offset ``(xk, -yk)`` when both
    components are non-zero.

    Parameters
    ----------
    data : array
        3-D array on the backend bound to the module-level ``xp``.
    X, Y, Z : int
        Unused; kept so existing calls keep working.
    K : int
        Largest offset to compare along each of the first two axes.
    comp_function : callable
        Forwarded to ``compare_matrices``.

    Returns
    -------
    list
        Flat list that alternates the two arrays returned by
        ``compare_matrices`` for each visited offset, in visiting order.
    """
    # Pad one cell more than the largest offset. A new name is used so the
    # caller-visible meaning of K is not silently changed.
    reach = K + 1
    a_pad = xp.pad(data, ((0, reach), (reach, reach), (0, 0)), constant_values=xp.nan)

    result = []
    for xk in range(reach):
        for yk in range(reach):
            if xk == 0 and yk == 0:
                continue  # zero displacement: nothing to compare against
            result.extend(compare_matrices(a_pad, data, xk, yk, reach, comp_function))
            if xk != 0 and yk != 0:
                # Mirrored column offset covers the other diagonal direction.
                result.extend(compare_matrices(a_pad, data, xk, -yk, reach, comp_function))

    # Wait for queued GPU work only when the GPU backend is in use, so timings
    # stay honest on CuPy and the NumPy backend never touches CUDA.
    if xp is cp:
        cp.cuda.get_current_stream().synchronize()
    return result

In [5]:
%%time
data_n = cp.asarray(data)

CPU times: user 99.4 ms, sys: 367 ms, total: 466 ms
Wall time: 5.87 s


In [6]:
%%time
# First call: the wall time recorded here is far above the repeats that follow,
# presumably one-off warm-up cost (e.g. CuPy compiling its kernels on first use).
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 507 ms, sys: 29.9 ms, total: 537 ms
Wall time: 2.14 s


In [7]:
%%time
# Same call repeated to time it once the first-call overhead is out of the way.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 62.9 ms, sys: 27.9 ms, total: 90.8 ms
Wall time: 89.8 ms


In [8]:
%%time
# Another repeat of the same call, to check that the warm timing is stable.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 42.6 ms, sys: 44.6 ms, total: 87.2 ms
Wall time: 86.1 ms


In [9]:
%%timeit
# Averaged over many loops for a steadier figure than the single %%time runs above.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

74.3 ms ± 87.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
