### CUDA with CuPy

In [1]:
import numpy as np
import cupy as cp

Create a NumPy-like array on the device

In [2]:
# Build a small 1-D array with NumPy; NumPy arrays live in host (CPU) memory.
x_host = np.asarray([1, 2, 3])
# Bare last expression so the notebook displays the array's type.
type(x_host)

numpy.ndarray

In [3]:
# Same data built through CuPy instead of NumPy, which places the array
# in device (GPU) memory.
x_device = cp.asarray([1, 2, 3])
# Bare last expression so the notebook displays the array's type.
type(x_device)

cupy.ndarray

In [4]:
%%timeit
np.linalg.norm(x_host)

2.7 µs ± 36.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [5]:
%%timeit
cp.linalg.norm(x_device)

188 µs ± 61.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Device(0) used as a context manager makes GPU 0 the current device inside
# the block, so the array below is created on that GPU.
with cp.cuda.Device(0):
    x_on_device_0 = cp.asarray([1, 2, 3, 4, 5])

Transfer data between Host and Device

In [7]:
# Host-side test data: a 2000 x 2000 array of random integers drawn from
# [0, 255) -- the upper bound is exclusive, so the largest possible value is 254.
x_host = np.random.randint(low=0, high=255, size=(2000, 2000))

In [8]:
# Host -> device transfer: cp.asarray copies the NumPy array held in x_host
# into GPU memory and returns it as a cupy.ndarray.
x_device = cp.asarray(x_host)

In [9]:
# Device -> host transfer: copy the CuPy array back into a NumPy array.
x_host_1 = cp.asnumpy(x_device)

Compute FFT

In [10]:
from scipy.fft import fftn

In [11]:
%%timeit
fftn(x_host)

70.9 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# cupyx.scipy mirrors parts of the SciPy API (e.g. scipy.fft) with CUDA-backed implementations
import cupyx

In [13]:
%%timeit
cupyx.scipy.fft.fftn(x_device)

12.5 ms ± 38.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
