In [9]:
import numba as nb
import numpy as np

In [2]:
!numba -s

# These two versions should match:
# CUDA Driver Version                           : 12.0
# CUDA Runtime Version                          : 12.0

System info:

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
--------------------------------------------------------------------------------
__Time Stamp__
Report started (local time)                   : 2023-11-20 15:38:09.679719
UTC start time                                : 2023-11-20 12:38:09.679726
Running time (s)                              : 0.439456

__Hardware Information__
Machine                                       : x86_64
CPU Name                                      : skylake
CPU Count                                     : 20
Number of accessible CPUs                     : 20
List of accessible CPUs cores                 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
CFS Restrictions (CPUs worth of runtime)      : None

CPU Features          

# isin(a, b)

In [3]:
# Haystack `a`: every integer in [0, 100_000_000); needles `b`: the even ones.
a = np.arange(100_000_000)
b = np.arange(0, 100_000_000, 2)
display(a.shape, a, b.shape, b)

(100000000,)

array([       0,        1,        2, ..., 99999997, 99999998, 99999999])

(50000000,)

array([       0,        2,        4, ..., 99999994, 99999996, 99999998])

In [90]:
def isin(a, b):
    """Pure-Python baseline: flag which elements of ``a`` occur in ``b``.

    Parameters
    ----------
    a : 1-D numpy array
        Values to test; only ``a.shape[0]`` and integer indexing are used.
    b : iterable of hashable values
        Values to test against; converted to a ``set`` once up front.

    Returns
    -------
    numpy.ndarray of ``np.bool_`` with ``a.shape[0]`` elements, where
    ``out[i]`` is ``a[i] in set(b)``.
    """
    n = a.shape[0]
    out = np.empty(n, dtype=np.bool_)
    lookup = set(b)
    # Plain ``range``: this function is not compiled, so ``nb.prange`` would
    # only stand in for ``range`` while suggesting parallelism that is not there.
    for i in range(n):
        out[i] = a[i] in lookup
    return out

# Time the uncompiled baseline (2 repeats of 1 loop each), then show its result.
%timeit -n 1 -r 2 isin(a, b)
display(isin(a, b))

17.3 s ± 39.1 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)


array([ True, False,  True, ..., False,  True, False])

In [91]:
@nb.njit(boundscheck=False, looplift=True, nogil=True)
def isin(a, b):
    """Compiled membership test: ``mask[i]`` is True when ``a[i]`` is in ``b``.

    ``parallel=True`` is not passed to the decorator, so this is the
    single-threaded compiled variant. Returns a boolean array with
    ``a.shape[0]`` elements.
    """
    n = a.shape[0]
    lookup = set(b)
    mask = np.empty(n, dtype=np.bool_)
    for idx in nb.prange(n):
        mask[idx] = a[idx] in lookup
    return mask

# Time the compiled version (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

1.02 s ± 14.6 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [92]:
@nb.njit(boundscheck=False, looplift=True, nogil=True, parallel=True)
def isin(a, b):
    """Parallel compiled membership test of each ``a[i]`` against ``b``.

    The set of ``b`` is built once before the ``prange`` loop; the loop body
    only reads it and writes a distinct output slot per iteration.
    Returns a boolean array with ``a.shape[0]`` elements.
    """
    total = a.shape[0]
    members = set(b)
    found = np.empty(total, dtype=np.bool_)
    for k in nb.prange(total):
        found[k] = a[k] in members
    return found

# Time the parallel=True version (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

874 ms ± 22.6 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [93]:
@nb.njit(boundscheck=False, looplift=True, nogil=True, parallel=True)
def isin(a, b, clusters_count):
    """Parallel membership test that splits ``a`` into contiguous chunks.

    Parameters
    ----------
    a : 1-D array
        Values to test.
    b : 1-D array
        Values to test against; converted to a set once up front.
    clusters_count : int
        Number of chunks handed to the ``prange`` loop (must be >= 1).

    Returns
    -------
    Boolean array with ``a.shape[0]`` elements; ``out[j]`` is True when
    ``a[j]`` occurs in ``b``.
    """
    n = a.shape[0]
    out = np.empty(n, dtype=np.bool_)
    lookup = set(b)
    # Integer ceil-division: np.ceil() would give a float chunk size, and the
    # range() bounds below must be integers.
    cluster_size = (n + clusters_count - 1) // clusters_count
    for i in nb.prange(clusters_count):
        # Cast defensively so the chunk arithmetic stays in signed integers
        # whatever integer type the prange index is given.
        start = np.int64(i) * cluster_size
        # Clamp the last chunk: when n is not a multiple of clusters_count the
        # unclamped end would run past the arrays (boundscheck is disabled).
        stop = min(start + cluster_size, n)
        for j in range(start, stop):
            out[j] = a[j] in lookup
    return out

# Time the chunked version with 10 chunks (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 isin(a, b, 10)
display(isin(a, b, 10))

878 ms ± 21.8 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [94]:
@nb.guvectorize([(nb.int64[:], nb.int64[:], nb.bool_[:])], '(n),(m)->(n)', nopython=True)
def isin(a, b, out):
    """Generalised-ufunc membership test: writes ``a[i] in b`` into ``out[i]``.

    Per the ``'(n),(m)->(n)'`` layout, ``a`` and ``out`` share core length
    ``n`` and ``b`` has an independent core length ``m``. Results are written
    into ``out``; the kernel itself returns nothing.
    """
    lookup = set(b)
    n = a.shape[0]
    for idx in nb.prange(n):
        out[idx] = a[idx] in lookup

%timeit -n 10 -r 2 isin1(a, b)
display(isin(a, b))

1.03 s ± 10 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

In [96]:
@nb.guvectorize([(nb.int64[:], nb.int64[:], nb.bool_[:])], '(n),(m)->(n)', nopython=True, target='parallel')
def isin(a, b, out):
    """Membership-test gufunc built with ``target='parallel'``.

    Writes ``a[i] in b`` into ``out[i]`` for every index of the core
    dimension ``n``; nothing is returned.

    NOTE(review): with 1-D inputs the ``'(n),(m)->(n)'`` layout leaves no loop
    dimension, so the parallel target presumably has nothing to distribute.
    The recorded timing (1.07 s vs 1.03 s for the default target) is
    consistent with that; confirm.
    """
    members = set(b)
    count = a.shape[0]
    for pos in nb.prange(count):
        out[pos] = a[pos] in members

# Time the target='parallel' gufunc (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 isin(a, b)
display(isin(a, b))

1.07 s ± 8.98 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ True, False,  True, ..., False,  True, False])

# sin(x^2) + cos(y + a)

In [3]:
# Fixed seed so the sample, and every result displayed for it, is reproducible
# when this cell is re-run or the kernel is restarted.
np.random.seed(0)
x = np.random.randint(low=0, high=100, size=1000000).astype(np.float64)
y = np.random.randint(low=0, high=100, size=1000000).astype(np.float64)
display(x.shape, x)
display(y.shape, y)

(1000000,)

array([18., 14., 25., ..., 78., 16., 25.])

(1000000,)

array([65., 47., 69., ..., 76., 74., 86.])

In [14]:
def my_math_func(x, y, a):
    """Pure-Python baseline for ``sin(x**2) + cos(y + a)``, element by element.

    Parameters
    ----------
    x, y : 1-D numpy arrays
        Inputs; ``y`` is indexed over ``range(x.shape[0])``.
    a : scalar
        Offset added to every ``y[i]`` before taking the cosine.

    Returns
    -------
    numpy.ndarray of float64 with ``x.shape[0]`` elements.
    """
    n = x.shape[0]
    out = np.empty(n, dtype=np.float64)
    # Plain ``range``: this function is not compiled, so ``nb.prange`` would
    # only stand in for ``range`` while suggesting parallelism that is not there.
    for i in range(n):
        out[i] = np.sin(x[i]**2) + np.cos(y[i] + a)
    return out

# Time the uncompiled baseline (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

1.33 s ± 19.9 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [15]:
@nb.njit
def my_math_func(x, y, a):
    """Compiled element-wise ``sin(x**2) + cos(y + a)``.

    ``parallel=True`` is not passed to the decorator, so this is the
    single-threaded compiled variant. Returns a float64 array with
    ``x.shape[0]`` elements.
    """
    n = x.shape[0]
    result = np.empty(n, dtype=np.float64)
    for idx in nb.prange(n):
        result[idx] = np.sin(x[idx]**2) + np.cos(y[idx] + a)
    return result

# Time the compiled version (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

30.4 ms ± 2.65 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [16]:
@nb.njit
def my_math_func(x, y, a):
    """Compiled element-wise ``sin(x**2) + cos(y + a)`` (single-threaded).

    NOTE(review): this cell repeats the preceding ``@nb.njit`` definition
    verbatim; presumably it was re-run to get a second timing. Confirm, and
    drop one of the two.
    """
    length = x.shape[0]
    values = np.empty(length, dtype=np.float64)
    for k in nb.prange(length):
        values[k] = np.sin(x[k]**2) + np.cos(y[k] + a)
    return values

# Time the compiled version again (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

29 ms ± 879 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [17]:
@nb.njit(parallel=True)
def my_math_func(x, y, a):
    """Parallel compiled element-wise ``sin(x**2) + cos(y + a)``.

    Each ``prange`` iteration writes its own output slot, so iterations are
    independent. Returns a float64 array with ``x.shape[0]`` elements.
    """
    n = x.shape[0]
    result = np.empty(n, dtype=np.float64)
    for idx in nb.prange(n):
        sin_part = np.sin(x[idx]**2)
        cos_part = np.cos(y[idx] + a)
        result[idx] = sin_part + cos_part
    return result

# Time the parallel=True version (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

5.85 ms ± 3.35 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [18]:
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])], '(n),(n),()->(n)', nopython=True)
def my_math_func(x, y, a, out):
    """Gufunc kernel writing ``sin(x[i]**2) + cos(y[i] + a)`` into ``out[i]``.

    Per the ``'(n),(n),()->(n)'`` layout, ``x``, ``y`` and ``out`` share the
    core length ``n`` and ``a`` is a scalar. Nothing is returned.
    """
    n = x.shape[0]
    for idx in nb.prange(n):
        out[idx] = np.sin(x[idx]**2) + np.cos(y[idx] + a)

# Time the gufunc (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

28.2 ms ± 563 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [20]:
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])], '(n),(n),()->(n)', nopython=True, target='parallel')
def my_math_func(x, y, a, out):
    """Gufunc kernel for ``sin(x**2) + cos(y + a)`` built with ``target='parallel'``.

    Writes one result per index of the core dimension into ``out``; nothing
    is returned.

    NOTE(review): with 1-D inputs the ``'(n),(n),()->(n)'`` layout leaves no
    loop dimension, so the parallel target presumably has nothing to
    distribute. The recorded timing (33.4 ms vs 28.2 ms for the default
    target) is consistent with that; confirm.
    """
    count = x.shape[0]
    for pos in nb.prange(count):
        out[pos] = np.sin(x[pos]**2) + np.cos(y[pos] + a)

# Time the target='parallel' gufunc (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))

33.4 ms ± 89.1 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [21]:
def my_math_func(x, y, a):
    """Vectorised NumPy form of ``sin(x**2) + cos(y + a)``.

    ``x`` and ``y`` are combined element-wise and ``a`` is added to ``y``
    before the cosine; the sum of the two terms is returned.
    """
    sin_term = np.sin(x ** 2)
    cos_term = np.cos(y + a)
    return sin_term + cos_term

# Time the plain NumPy version (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10)
display(my_math_func(x, y, 10))

30.5 ms ± 703 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [22]:
@nb.njit
def my_math_func(x, y, a):
    """Compiled whole-array form of ``sin(x**2) + cos(y + a)``.

    Uses array expressions rather than an explicit loop and returns the
    element-wise sum of the two terms.
    """
    sin_term = np.sin(x**2)
    cos_term = np.cos(y + a)
    return sin_term + cos_term

# Time the compiled array-expression version (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10)
display(my_math_func(x, y, 10))

29.6 ms ± 1.37 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [24]:
@nb.njit(parallel=True)
def my_math_func(x, y, a):
    """Compiled whole-array ``sin(x**2) + cos(y + a)`` with ``parallel=True``.

    There is no explicit loop here: the function consists only of array
    expressions, and their element-wise sum is returned.
    """
    squared = x**2
    shifted = y + a
    return np.sin(squared) + np.cos(shifted)

# Time the parallel=True array-expression version (2 repeats of 100 loops each), then show its result.
%timeit -n 100 -r 2 my_math_func(x, y, 10)
display(my_math_func(x, y, 10))

5.42 ms ± 2.7 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([-1.15184262,  1.78032651,  0.39947694, ...,  1.88980814,
       -0.21386096, -0.99776154])

In [4]:
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])], '(n),(n),()->(n)', nopython=True, target='cuda')
def my_math_func(x, y, a, out):
    """CUDA-target gufunc kernel for ``sin(x[i]**2) + cos(y[i] + a)``.

    Loops over the core dimension ``n`` and writes each result into
    ``out``; nothing is returned.

    NOTE(review): when called with 1-D arrays there is no loop dimension, so
    the whole core loop presumably runs in a single kernel invocation. The
    recorded 1.12 s per call is consistent with that; confirm.
    """
    n = x.shape[0]
    for idx in nb.prange(n):
        out[idx] = np.sin(x[idx]**2) + np.cos(y[idx] + a)

# Time the CUDA gufunc on the flat 1-D inputs (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 my_math_func(x, y, 10.)
display(my_math_func(x, y, 10.))



1.12 s ± 8.13 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ 0.51768605,  1.83939688, -0.71995467, ...,  0.56990004,
       -1.67923153, -0.00441418])

In [5]:
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64, nb.float64[:])], '(n),(n),()->(n)', nopython=True, target='cuda')
def my_math_func(x, y, a, out):
    """CUDA-target gufunc kernel for ``sin(x[i]**2) + cos(y[i] + a)``.

    Iterates over the core dimension ``n`` (the last axis of ``x``, ``y``
    and ``out`` per the ``'(n),(n),()->(n)'`` layout) and stores each result
    in ``out``; nothing is returned.
    """
    count = x.shape[0]
    for pos in nb.prange(count):
        out[pos] = np.sin(x[pos]**2) + np.cos(y[pos] + a)

# Reshape the 1_000_000-element inputs to (100, 100, 100, 1): the last axis is
# the gufunc core dimension, so each core loop now covers a single element.
# Presumably this lets the CUDA target spread the leading axes across threads
# (4.48 ms recorded here vs 1.12 s for the flat call) - confirm.
x_ = x.reshape(100, 100, 100, -1)
y_ = y.reshape(100, 100, 100, -1)

# Time the CUDA gufunc on the reshaped inputs (2 repeats of 100 loops each),
# then show the result flattened back to 1-D.
%timeit -n 100 -r 2 my_math_func(x_, y_, 10.)
display(my_math_func(x_, y_, 10.).reshape(-1))

4.48 ms ± 346 µs per loop (mean ± std. dev. of 2 runs, 100 loops each)


array([ 0.51768605,  1.83939688, -0.71995467, ...,  0.56990004,
       -1.67923153, -0.00441418])

# add_scalars(x, y)

In [6]:
# Fixed seed so the sample, and every sum displayed for it, is reproducible
# when this cell is re-run or the kernel is restarted.
np.random.seed(0)
x = np.random.randint(low=0, high=100, size=100000000).astype(np.float64)
y = np.random.randint(low=0, high=100, size=100000000).astype(np.float64)
display(x.shape, x)
display(y.shape, y)

(100000000,)

array([48., 50., 41., ..., 91., 47., 44.])

(100000000,)

array([75., 54., 71., ..., 35., 38., 45.])

In [6]:
@nb.njit
def add_scalars(x, y):
    """Compiled element-wise sum ``x[i] + y[i]``.

    ``parallel=True`` is not passed to the decorator, so this is the
    single-threaded compiled variant. Returns a float64 array with
    ``x.shape[0]`` elements.
    """
    n = x.shape[0]
    total = np.empty(n, dtype=np.float64)
    for idx in nb.prange(n):
        total[idx] = x[idx] + y[idx]
    return total

# Time the compiled version (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 add_scalars(x, y)
display(add_scalars(x, y))

261 ms ± 16.9 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ 72., 110., 131., ..., 150., 108., 120.])

In [40]:
@nb.njit(parallel=True)
def add_scalars(x, y):
    """Parallel compiled element-wise sum ``x[i] + y[i]``.

    Each ``prange`` iteration writes its own output slot. Returns a float64
    array with ``x.shape[0]`` elements.
    """
    length = x.shape[0]
    sums = np.empty(length, dtype=np.float64)
    for k in nb.prange(length):
        sums[k] = x[k] + y[k]
    return sums

# Time the parallel=True version (2 repeats of 10 loops each), then show its result.
%timeit -n 10 -r 2 add_scalars(x, y)
display(add_scalars(x, y))

129 ms ± 13.8 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([ 72., 110., 131., ..., 150., 108., 120.])

In [7]:
@nb.guvectorize([(nb.float64[:], nb.float64[:], nb.float64[:])], '(n),(n)->(n)', nopython=True, target='cuda')
def add_scalars(x, y, out):
    """CUDA-target gufunc kernel writing ``x[i] + y[i]`` into ``out[i]``.

    Per the ``'(n),(n)->(n)'`` layout all three arrays share the core length
    ``n``; nothing is returned.
    """
    n = x.shape[0]
    for idx in nb.prange(n):
        out[idx] = x[idx] + y[idx]

# Reshape the 100_000_000-element inputs to (100, 100, 100, 100, 1): the last
# axis is the gufunc core dimension, so each core loop covers a single element.
x_ = x.reshape(100, 100, 100, 100, -1)
y_ = y.reshape(100, 100, 100, 100, -1)

# Time the CUDA gufunc on the reshaped inputs (2 repeats of 10 loops each),
# then show the result flattened back to 1-D.
%timeit -n 10 -r 2 add_scalars(x_, y_)
display(add_scalars(x_, y_).reshape(-1))

329 ms ± 1.77 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


array([123., 104., 112., ..., 126.,  85.,  89.])