In [1]:
import numba as nb
import numpy as np

In [2]:
# Print Numba's system information report for this environment.
!numba -s

# These two versions must match:
# CUDA Driver Version                           : 12.0
# CUDA Runtime Version                          : 12.0

System info:
--------------------------------------------------------------------------------
__Time Stamp__
Report started (local time)                   : 2023-11-22 18:56:36.720954
UTC start time                                : 2023-11-22 15:56:36.720964
Running time (s)                              : 0.606791

__Hardware Information__
Machine                                       : x86_64
CPU Name                                      : alderlake
CPU Count                                     : 20
Number of accessible CPUs                     : 20
List of accessible CPUs cores                 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
CFS Restrictions (CPUs worth of runtime)      : None

CPU Features                                  : 64bit adx aes avx avx2 avxvnni bmi
                                                bmi2 clflushopt clwb cmov crc32
                                                cx16 cx8 f16c fma fsgsbase fxsr
                              

# isin(a, b)

In [3]:
# Benchmark inputs for isin: every integer in [0, 100 million) and the even
# integers in the same range.
a = np.arange(100_000_000)
b = np.arange(0, 100_000_000, 2)
# Show shape and contents of each array, `a` first and then `b`.
display(a.shape, a, b.shape, b)

(100000000,)

array([       0,        1,        2, ..., 99999997, 99999998, 99999999])

(50000000,)

array([       0,        2,        4, ..., 99999994, 99999996, 99999998])

In [4]:
def isin(a, b):
    """Pure-Python baseline: test every element of ``a`` for membership in ``b``.

    Parameters
    ----------
    a : 1-D numpy.ndarray
        Values to look up.
    b : iterable of hashable values
        Reference values; converted to a ``set`` once so each lookup is a
        hash probe.

    Returns
    -------
    numpy.ndarray of bool, shape ``(a.shape[0],)``
        ``out[i]`` is True when ``a[i]`` occurs in ``b``.
    """
    out = np.empty(a.shape[0], dtype=np.bool_)
    lookup = set(b)
    # Plain ``range``: outside a Numba-compiled function ``nb.prange`` is just
    # ``range``, so spelling it ``prange`` here only suggested parallelism that
    # never happens in the interpreter.
    for i in range(a.shape[0]):
        out[i] = a[i] in lookup
    return out

# Time the pure-Python baseline (2 runs of 1 loop each), then display the
# result as a sanity check.
%timeit -n 1 -r 2 isin(a, b)
display(isin(a, b))

18.8 s ± 3.16 s per loop (mean ± std. dev. of 2 runs, 1 loop each)


array([ True, False,  True, ..., False,  True, False])

In [6]:
@nb.njit(boundscheck=False, looplift=True, nogil=True)
def isin(a, b):
    """Numba-compiled membership test of each element of ``a`` in ``b``.

    ``b`` is turned into a set once; the returned boolean array has one entry
    per element of ``a`` (True when that element occurs in ``b``).
    """
    n = len(a)
    lookup = set(b)
    mask = np.empty(n, dtype=np.bool_)
    # No ``parallel=True`` on the decorator, so this ``prange`` runs as an
    # ordinary sequential loop.
    for idx in nb.prange(n):
        mask[idx] = a[idx] in lookup
    return mask

%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

1.15 s ± 84.9 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [7]:
@nb.njit(boundscheck=False, looplift=True, nogil=True, parallel=True)
def isin(a, b):
    """Parallel Numba membership test of each element of ``a`` in ``b``.

    The set built from ``b`` is only read inside the loop; every iteration
    writes its own slot of the result, so iterations are independent.
    Returns a boolean array with ``a.shape[0]`` entries.
    """
    size = a.shape[0]
    members = set(b)
    result = np.empty(size, dtype=np.bool_)
    for pos in nb.prange(size):
        result[pos] = a[pos] in members
    return result

%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

904 ms ± 33.6 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [8]:
@nb.njit(boundscheck=False, looplift=True, nogil=True, parallel=True)
def isin(a, b, clusters_count):
    """Parallel membership test that hands each worker one contiguous chunk.

    Parameters
    ----------
    a : 1-D array
        Values to look up.
    b : 1-D array
        Reference values; converted to a set once.
    clusters_count : int
        Number of contiguous chunks ``a`` is split into; the outer ``prange``
        iterates over chunks. Zero raises ZeroDivisionError.

    Returns
    -------
    Boolean array with ``a.shape[0]`` entries; entry ``j`` is True when
    ``a[j]`` occurs in ``b``.
    """
    n = a.shape[0]
    out = np.empty(n, dtype=np.bool_)
    lookup = set(b)
    # Integer ceil-division keeps the chunk size an int (np.ceil returned a
    # float that then flowed into range()).
    cluster_size = (n + clusters_count - 1) // clusters_count
    for i in nb.prange(clusters_count):
        start = i * cluster_size
        # Clamp the chunk end: when n is not a multiple of clusters_count the
        # last chunk would otherwise run past the arrays, and with
        # boundscheck=False that is a silent out-of-bounds read/write.
        stop = min(start + cluster_size, n)
        for j in range(start, stop):
            out[j] = a[j] in lookup
    return out

%timeit -n 10 -r 2 isin(a, b, 10)
display(isin(a, b, 10))

881 ms ± 17.3 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [11]:
@nb.guvectorize(
    [(nb.int64[:], nb.int64[:], nb.bool_[:])],
    '(n),(m)->(n)',
    nopython=True,
)
def isin(a, b, out):
    """Generalized-ufunc membership test: ``out[i] = a[i] in b``.

    Core signature ``(n),(m)->(n)``: ``a`` and ``out`` share length ``n`` while
    ``b`` may have a different length ``m``. The result is written into ``out``.
    """
    n = a.shape[0]
    lookup = set(b)
    for idx in nb.prange(n):
        out[idx] = a[idx] in lookup

# Time the gufunc version (2 runs of 10 loops each), then display the result
# as a sanity check.
%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

1.09 s ± 27.6 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [12]:
@nb.guvectorize(
    [(nb.int64[:], nb.int64[:], nb.bool_[:])],
    '(n),(m)->(n)',
    nopython=True,
    target='parallel',
)
def isin(a, b, out):
    """Generalized-ufunc membership test built for the ``parallel`` target.

    Core signature ``(n),(m)->(n)``; writes ``a[i] in b`` into ``out[i]``.
    """
    # NOTE(review): with 1-D inputs there are no loop dimensions beyond the
    # core one, so the parallel target presumably has nothing to distribute -
    # the recorded timing matches the non-parallel gufunc. Confirm.
    members = set(b)
    count = a.shape[0]
    for pos in nb.prange(count):
        out[pos] = a[pos] in members

# Time the parallel-target gufunc (2 runs of 10 loops each), then display the
# result as a sanity check.
%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

1.11 s ± 32 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

# sin(x^2) + cos(y + a)

In [13]:
# Seeded generator so the random inputs (and every value displayed below) are
# reproducible across kernel restarts.
rng = np.random.default_rng(0)
# One million integer-valued floats in [0, 100) for each operand.
x = rng.integers(low=0, high=100, size=1_000_000).astype(np.float64)
y = rng.integers(low=0, high=100, size=1_000_000).astype(np.float64)
display(x.shape, x)
display(y.shape, y)

(1000000,)

array([36., 82.,  5., ..., 32., 66., 62.])

(1000000,)

array([32., 20., 87., ..., 61., 71., 66.])

In [14]:
def my_math_func(x, y, a):
    """Pure-Python baseline for ``sin(x**2) + cos(y + a)``, element by element.

    Parameters
    ----------
    x, y : 1-D numpy.ndarray
        Operands; ``y`` must have at least ``x.shape[0]`` elements.
    a : scalar
        Offset added to every element of ``y`` before taking the cosine.

    Returns
    -------
    numpy.ndarray of float64, shape ``(x.shape[0],)``
    """
    out = np.empty(x.shape[0], dtype=np.float64)
    # Plain ``range``: outside a Numba-compiled function ``nb.prange`` is just
    # ``range``, so spelling it ``prange`` here only suggested parallelism that
    # never happens in the interpreter.
    for i in range(x.shape[0]):
        out[i] = np.sin(x[i]**2) + np.cos(y[i] + a)
    return out

# Time the pure-Python loop (2 runs of 10 loops each), then display the result
# as a sanity check.
%timeit -n 10 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

1.17 s ± 6.31 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [15]:
@nb.njit
def my_math_func(x, y, a):
    """Numba-compiled loop computing ``sin(x[i]**2) + cos(y[i] + a)``.

    Returns a new float64 array with ``x.shape[0]`` entries.
    """
    n = x.shape[0]
    result = np.empty(n, dtype=np.float64)
    # No ``parallel=True`` on the decorator, so this ``prange`` runs as an
    # ordinary sequential loop.
    for k in nb.prange(n):
        sine_part = np.sin(x[k] ** 2)
        cosine_part = np.cos(y[k] + a)
        result[k] = sine_part + cosine_part
    return result

%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

30.7 ms ± 1.08 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [16]:
# NOTE(review): this definition is identical to the one in the preceding cell;
# confirm whether a different decorator option was meant to be benchmarked here.
@nb.njit
def my_math_func(x, y, a):
    """Numba-compiled element-wise ``sin(x**2) + cos(y + a)`` over 1-D inputs.

    Allocates and returns a float64 array of length ``x.shape[0]``.
    """
    length = len(x)
    values = np.empty(length, dtype=np.float64)
    for pos in nb.prange(length):
        squared = x[pos] ** 2
        shifted = y[pos] + a
        values[pos] = np.sin(squared) + np.cos(shifted)
    return values

%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

30.5 ms ± 436 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [17]:
@nb.njit(parallel=True)
def my_math_func(x, y, a):
    """Parallel Numba loop computing ``sin(x[i]**2) + cos(y[i] + a)``.

    Each iteration writes only its own output slot, so the ``prange``
    iterations are independent. Returns a float64 array of length
    ``x.shape[0]``.
    """
    count = x.shape[0]
    res = np.empty(count, dtype=np.float64)
    for idx in nb.prange(count):
        sin_term = np.sin(x[idx] ** 2)
        cos_term = np.cos(y[idx] + a)
        res[idx] = sin_term + cos_term
    return res

%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

11.9 ms ± 3.45 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [18]:
@nb.guvectorize(
    [(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])],
    '(n),(n),()->(n)',
    nopython=True,
)
def my_math_func(x, y, a, out):
    """Generalized ufunc writing ``sin(x[i]**2) + cos(y[i] + a)`` into ``out``.

    Core signature ``(n),(n),()->(n)``: ``x``, ``y`` and ``out`` share length
    ``n``; ``a`` is a scalar.
    """
    n = x.shape[0]
    for k in nb.prange(n):
        sine_part = np.sin(x[k] ** 2)
        cosine_part = np.cos(y[k] + a)
        out[k] = sine_part + cosine_part

# Time the gufunc version (2 runs of 100 loops each), then display the result
# as a sanity check.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

33.6 ms ± 2.17 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [19]:
@nb.guvectorize(
    [(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])],
    '(n),(n),()->(n)',
    nopython=True,
    target='parallel',
)
def my_math_func(x, y, a, out):
    """Parallel-target gufunc writing ``sin(x[i]**2) + cos(y[i] + a)`` into ``out``.

    Core signature ``(n),(n),()->(n)``; ``a`` is a scalar.
    """
    # NOTE(review): with 1-D inputs there are no loop dimensions beyond the
    # core one, so the parallel target presumably has nothing to distribute -
    # the recorded timing matches the non-parallel gufunc. Confirm.
    length = x.shape[0]
    for pos in nb.prange(length):
        squared = x[pos] ** 2
        shifted = y[pos] + a
        out[pos] = np.sin(squared) + np.cos(shifted)

# Time the parallel-target gufunc (2 runs of 100 loops each), then display the
# result as a sanity check.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

38.6 ms ± 743 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [20]:
def my_math_func(x, y, a):
    """Vectorized NumPy baseline: ``sin(x**2) + cos(y + a)`` over whole arrays.

    ``x`` and ``y`` must be broadcast-compatible; ``a`` is added to ``y``
    before the cosine. Returns the element-wise sum of the two terms.
    """
    sine_term = np.sin(x ** 2)
    cosine_term = np.cos(y + a)
    return sine_term + cosine_term

# Time the plain NumPy expression (2 runs of 100 loops each), then display the
# result as a sanity check.
%timeit -n 100 -r 2 my_math_func(x, y, 10)
display(my_math_func(x, y, 10))

33.1 ms ± 710 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [21]:
@nb.njit
def my_math_func(x, y, a):
    """Numba-compiled whole-array expression ``sin(x**2) + cos(y + a)``.

    ``a`` is added to ``y`` before the cosine; the two terms are summed
    element-wise and the resulting array is returned.
    """
    # Kept as a single array expression: that form is what this cell benchmarks.
    return np.sin(x**2) + np.cos(y + a)

%timeit -n 100 -r 2 my_math_func(x, y, 10)
display(my_math_func(x, y, 10))

32.5 ms ± 667 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [22]:
@nb.njit(parallel=True)
def my_math_func(x, y, a):
    """Whole-array ``sin(x**2) + cos(y + a)`` compiled with ``parallel=True``.

    ``a`` is added to ``y`` before the cosine; the two terms are summed
    element-wise and the resulting array is returned.
    """
    # Kept as a single array expression: that form is what this cell benchmarks.
    return np.sin(x**2) + np.cos(y + a)

%timeit -n 100 -r 2 my_math_func(x, y, 10)
display(my_math_func(x, y, 10))

8.73 ms ± 1.76 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.59569048,  0.99122066, -1.05749929, ..., -0.46755611,
        1.7602084 , -0.1417008 ])

In [23]:
# CUDA-target gufunc for sin(x[i]**2) + cos(y[i] + a), core signature
# (n),(n),()->(n): x, y and out share length n, a is a scalar.
# NOTE(review): the recorded run of this cell ended in CudaSupportError (CUDA
# driver library not found), so this kernel was never exercised here.
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])], '(n),(n),()->(n)', nopython=True, target='cuda')
def my_math_func(x, y, a, out):
    # NOTE(review): confirm that nb.prange and the np.sin/np.cos calls are
    # accepted by the CUDA target - TODO verify on a machine with a GPU.
    for i in nb.prange(x.shape[0]):
        out[i] = np.sin(x[i]**2) + np.cos(y[i] + a)

# Time the CUDA gufunc (2 runs of 10 loops each), then display its result.
# NOTE(review): in the recorded run this cell stopped with CudaSupportError,
# so no timing was produced.
%timeit -n 10 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:

In [None]:
# CUDA-target gufunc for sin(x[i]**2) + cos(y[i] + a), core signature
# (n),(n),()->(n): x, y and out share length n, a is a scalar.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it appears never to have been run.
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])], '(n),(n),()->(n)', nopython=True, target='cuda')
def my_math_func(x, y, a, out):
    # NOTE(review): confirm that nb.prange and the np.sin/np.cos calls are
    # accepted by the CUDA target - TODO verify on a machine with a GPU.
    for i in nb.prange(x.shape[0]):
        out[i] = np.sin(x[i]**2) + np.cos(y[i] + a)

# Reshape the flat inputs so the trailing axis (the gufunc core dimension n)
# is inferred by -1 and the three leading axes of length 100 become loop
# dimensions. With the 1,000,000-element x and y created earlier this gives
# shape (100, 100, 100, 1).
x_ = x.reshape(100, 100, 100, -1)
y_ = y.reshape(100, 100, 100, -1)

# Time the gufunc on the reshaped inputs, then display the flattened result.
%timeit -n 100 -r 2 my_math_func(x_, y_, 10.)
display(my_math_func(x_, y_, 10.).reshape(-1))

# add_scalars(x, y)

In [2]:
# Seeded generator so the random inputs (and every value displayed below) are
# reproducible across kernel restarts.
rng = np.random.default_rng(0)
# One hundred million integer-valued floats in [0, 100) for each operand.
x = rng.integers(low=0, high=100, size=100_000_000).astype(np.float64)
y = rng.integers(low=0, high=100, size=100_000_000).astype(np.float64)
display(x.shape, x)
display(y.shape, y)

(100000000,)

array([65., 50.,  5., ..., 70., 57., 53.])

(100000000,)

array([40., 50., 23., ..., 18., 56., 95.])

In [3]:
@nb.njit
def add_scalars(x, y):
    """Numba-compiled element-wise sum of two 1-D arrays.

    Returns a new float64 array with ``x.shape[0]`` entries where entry ``i``
    is ``x[i] + y[i]``.
    """
    n = x.shape[0]
    total = np.empty(n, dtype=np.float64)
    # No ``parallel=True`` on the decorator, so this ``prange`` runs as an
    # ordinary sequential loop.
    for k in nb.prange(n):
        total[k] = x[k] + y[k]
    return total

%timeit -n 10 -r 2 add_scalars(x, y)
display(add_scalars(x, y))

331 ms ± 1.21 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([105., 100.,  28., ...,  88., 113., 148.])

In [4]:
@nb.njit(parallel=True)
def add_scalars(x, y):
    """Parallel Numba element-wise sum of two 1-D arrays.

    Each ``prange`` iteration writes only its own output slot. Returns a new
    float64 array with ``x.shape[0]`` entries.
    """
    length = len(x)
    sums = np.empty(length, dtype=np.float64)
    for pos in nb.prange(length):
        sums[pos] = x[pos] + y[pos]
    return sums

%timeit -n 10 -r 2 add_scalars(x, y)
display(add_scalars(x, y))

105 ms ± 13.8 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([105., 100.,  28., ...,  88., 113., 148.])

In [5]:
# CUDA-target gufunc for element-wise x[i] + y[i], core signature
# (n),(n)->(n): x, y and out share length n.
# NOTE(review): the recorded run of this cell ended in CudaSupportError (CUDA
# driver library not found), so this kernel was never exercised here.
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64[:])], '(n),(n)->(n)', nopython=True, target='cuda')
def add_scalars(x, y, out):
    # NOTE(review): confirm that nb.prange is accepted by the CUDA target -
    # TODO verify on a machine with a GPU.
    for i in nb.prange(x.shape[0]):
        out[i] = x[i] + y[i]

# Reshape the flat inputs so the trailing axis (the gufunc core dimension n)
# is inferred by -1 and the four leading axes of length 100 become loop
# dimensions. With the 100,000,000-element x and y created earlier this gives
# shape (100, 100, 100, 100, 1).
x_ = x.reshape(100, 100, 100, 100, -1)
y_ = y.reshape(100, 100, 100, 100, -1)

# Time the gufunc on the reshaped inputs, then display the flattened result.
# NOTE(review): in the recorded run this cell stopped with CudaSupportError,
# so no timing was produced.
%timeit -n 10 -r 2 add_scalars(x_, y_)
display(add_scalars(x_, y_).reshape(-1))

CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
: