In [1]:
import numpy as np
import cupy as cp

In [2]:
# Array backend used by every function below; `cp` runs the work on the GPU.
xp = cp

# Benchmark input: an X x Y grid with a Z-long vector at each cell.
X = 1500
Y = 1500
Z = 20
# Neighbourhood half-width: offsets from -K to K are compared along each grid axis.
K = 3

# Fixed seed so the benchmark input is identical after Restart & Run All.
SEED = 0
rng = np.random.default_rng(SEED)
# Draw float32 directly instead of generating float64 and down-casting,
# which avoids a temporary array twice the size of `data`.
data = rng.random((X, Y, Z), dtype=np.float32)

# Try 4: NumPy-style array API on the GPU (CuPy, via `xp = cp`)

In [3]:
def vectors_comparison(vector1, vector2):
    """Return the sum over axis 2 of the squared element-wise product.

    Both arguments are multiplied element-wise with the active `xp` backend,
    the product is squared, and the squares are summed along axis 2, so the
    inputs must have at least three dimensions.
    """
    squared_products = xp.power(xp.multiply(vector1, vector2), 2)
    return xp.sum(squared_products, axis=2)

def vectors_comparison1(matrix1, matrix2):
    """Operator-based variant: sum over axis 2 of the squared element-wise product.

    Multiplies the inputs with `*`, squares the product with `xp.power`, and
    reduces along axis 2 using the array's own `sum` method.
    """
    product = matrix1 * matrix2
    squared = xp.power(product, 2)
    return squared.sum(axis=2)

def compare_matrices(a_pad, data, xk, yk, K, comp_function):
    """Compare `data` with a shifted window of `a_pad` and also return the shifted-back result.

    Parameters:
        a_pad: padded array from which the shifted window is sliced; the column
            slice starts at ``K + yk`` and the row slice at ``xk``.
        data: reference array; its first two axis lengths set the window size.
        xk: row offset of the window (also the number of NaN rows prepended
            when shifting the result).
        yk: column offset of the window; may be negative.
        K: column padding width used for both the window slice and the result shift.
        comp_function: callable ``(data, window) -> 2-D array``.

    Returns:
        ``(comparison, res)`` where ``comparison`` is the output of
        ``comp_function`` and ``res[i, j] == comparison[i - xk, j - yk]`` with
        NaN wherever that index falls outside ``comparison``.
        NOTE(review): presumably `res` stands in for the comparison at offset
        ``(-xk, -yk)``, which requires `comp_function` to be symmetric in its
        arguments -- confirm against callers.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]

    # Window of the padded array displaced by (xk, yk) relative to `data`.
    col_start = K + yk
    neighbour = a_pad[xk:xk + n_rows, col_start:col_start + n_cols, :]
    comparison = comp_function(data, neighbour)

    # Frame the result with NaN, then cut a window displaced the opposite way.
    framed = xp.pad(comparison, ((xk, 0), (K, K)), constant_values=xp.nan)
    mirror_start = K - yk
    res = framed[0:n_rows, mirror_start:mirror_start + n_cols]
    return comparison, res

def get_comparison(data, X, Y, Z, K, comp_function):
    """Compare `data` with its shifted copies for every non-zero offset up to K.

    Only offsets with a non-negative row shift are evaluated directly; each
    call to `compare_matrices` yields two arrays (the comparison and its
    shifted-back copy), and both are appended to the output in that order.

    Parameters:
        data: 3-D array; the first two axes are padded with NaN and shifted.
        X, Y, Z: unused; kept so existing callers keep working.
        K: largest offset (inclusive) along each of the first two axes.
        comp_function: callable forwarded to `compare_matrices`.

    Returns:
        A flat list of 2-D arrays, two per evaluated offset.
    """
    # Offsets run over 0..K inclusive; the same value is used as the padding width.
    reach = K + 1
    a_pad = xp.pad(data, ((0, reach), (reach, reach), (0, 0)), constant_values=xp.nan)

    result = []
    for xk in range(reach):
        for yk in range(reach):
            if xk == 0 and yk == 0:
                # Zero offset would compare data with itself; skipped.
                continue
            result.extend(compare_matrices(a_pad, data, xk, yk, reach, comp_function))
            if xk != 0 and yk != 0:
                # Also evaluate the offset with the column shift negated.
                result.extend(compare_matrices(a_pad, data, xk, -yk, reach, comp_function))

    # Wait for queued GPU work so callers time completed computation. Only
    # meaningful for the CuPy backend; skipped when `xp` is another module.
    if xp is cp:
        cp.cuda.stream.get_current_stream().synchronize()
    return result

In [4]:
# Hand the host array `data` to CuPy so the timed calls below operate on a CuPy array.
data_n = cp.asarray(data)

In [5]:
%%time
# First timed call of get_comparison; its recorded wall time is much higher than
# the identical repeat calls below (presumably one-time warm-up cost -- TODO confirm).
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 462 ms, sys: 46.9 ms, total: 509 ms
Wall time: 511 ms


In [6]:
%%time
# Second timed call with identical arguments.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 39.4 ms, sys: 42.7 ms, total: 82 ms
Wall time: 81 ms


In [7]:
%%time
# Third timed call with identical arguments.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

CPU times: user 46.3 ms, sys: 33.7 ms, total: 80 ms
Wall time: 79 ms


In [8]:
%%timeit
# Averaged timing over repeated loops of the same get_comparison call.
res2 = get_comparison(data_n, X, Y, Z, K, vectors_comparison)

74.3 ms ± 32.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Compare with: a direct loop over every offset in the (2K+1) x (2K+1) window

In [9]:
%%time
# Full window width: offsets -K..K along each of the first two axes.
kernel_size = K*2 + 1
# Half-width of the window; equals K because kernel_size is 2*K + 1.
k = kernel_size // 2

# Rebinds data_n (already created in an earlier cell) to a fresh CuPy copy of `data`.
data_n = cp.asarray(data)
# Lengths of the first two axes of `data`; run() uses them to slice shifted windows.
i_range, x_range = data.shape[:2]

# Unused alternative kernel, same expression as vectors_comparison1; kept for reference.
#def function(matrix1, matrix2):
#    return (xp.power(matrix1 * matrix2, 2)).sum(axis=2)

CPU times: user 47.4 ms, sys: 0 ns, total: 47.4 ms
Wall time: 46 ms


In [10]:
def run():
    """Baseline: compare `data_n` with every shifted window of its NaN-padded copy.

    Reads the notebook globals `data_n`, `k`, `kernel_size`, `i_range` and
    `x_range`. The centre offset (both indices equal to `k`) is skipped, so
    the returned list holds ``kernel_size ** 2 - 1`` arrays, in row-major
    offset order. The current CuPy stream is synchronised before returning.
    """
    nan_framed = xp.pad(data_n, ((k, k), (k, k), (0, 0)), constant_values=xp.nan)

    # Every (row, column) start index in the padded array except the centre one.
    offsets = [
        (row_off, col_off)
        for row_off in range(kernel_size)
        for col_off in range(kernel_size)
        if not (row_off == k and col_off == k)
    ]

    outputs = []
    for row_off, col_off in offsets:
        window = nan_framed[row_off:row_off + i_range, col_off:col_off + x_range]
        outputs.append(vectors_comparison(data_n, window))

    cp.cuda.stream.get_current_stream().synchronize()
    return outputs

In [11]:
%%time
# First timed call of the all-offsets baseline run().
res5 = run()

CPU times: user 83.5 ms, sys: 63.5 ms, total: 147 ms
Wall time: 148 ms


In [12]:
%%time
# Second timed call of run().
res5 = run()

CPU times: user 85.2 ms, sys: 61.9 ms, total: 147 ms
Wall time: 146 ms


In [13]:
%%time
# Third timed call of run().
res5 = run()

CPU times: user 94.5 ms, sys: 52.5 ms, total: 147 ms
Wall time: 146 ms


In [14]:
%%timeit
# Averaged timing over repeated loops of run().
res5 = run()

147 ms ± 190 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
