# Pylops - numba

### Author: M.Ravasi

In this notebook I will investigate the benefit of adding numba to various operators.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops

from scipy.sparse import csr_matrix, vstack, rand
from scipy.linalg import lstsq, solve, pinv
from scipy.sparse.linalg import LinearOperator as spLinearOperator
from scipy.sparse.linalg import LinearOperator, cg, lsqr

from pylops.utils                      import dottest
from pylops.utils.wavelets             import *
from pylops.utils.seismicevents        import *
from pylops.basicoperators             import *
from pylops.signalprocessing             import *
from pylops.waveeqprocessing.mdd       import *
from pylops.optimization.leastsquares  import *
from pylops.optimization.sparsity  import IRLS as IRLSpylops
from pylops.optimization.sparsity  import FISTA

from numba import jit

In [2]:
# Request 10 threads for OpenMP, MKL and numba, and switch numba developer mode off.
# NOTE(review): these variables are assigned after numpy, scipy, pylops and numba
# have already been imported in the first cell. Libraries that read their thread
# settings at import time may ignore them here - confirm, and consider setting
# them before the imports.
os.environ['OMP_NUM_THREADS'] = '10'
os.environ['MKL_NUM_THREADS'] = '10'
os.environ['NUMBA_NUM_THREADS'] = '10'
os.environ['NUMBA_DEVELOPER_MODE'] = '0'

## Slant stack 2d

Initialization

In [3]:
# Sizes and Radon options; `par` is reused by the 3D section further down.
par = {'nt': 11, 'nhx': 21, 'nhy': 10, 'npx':31, 'npy':21, 'pxmax':1e-2,
       'centeredh': True, 'kind': 'linear'} # linear, centered, linear interp

# Sampling steps and the three axes handed to Radon2D (t, h, px).
dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt
h = np.arange(par['nhx']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
# Model of shape (npx, nt) with a single unit sample at row 2, middle time index.
x = np.zeros((par['npx'], par['nt']))
x[2, par['nt']//2] = 1
 
# The timings below measure operator construction only (nothing is applied).
# numpy
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numpy', dtype='float64')
# numba
# Untimed call with the same arguments before the timed one; presumably a warm-up
# so that one-off JIT compilation is not included in the measurement - confirm.
Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

12.1 ms ± 4.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
343 µs ± 4.38 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with pre-computed table

In [4]:
# Radon2D with engine='numpy', interp=False and no `onthefly` argument
# (the "pre-computed table" case of this section).
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy',
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

10.8 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
10.2 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Radon2D with engine='numba', interp=False and no `onthefly` argument
# (the "pre-computed table" case of this section).
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the numpy-engine result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

90.9 µs ± 3.76 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
115 µs ± 35.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Norm of the numpy-vs-numba difference for the forward (y) and adjoint (xadj) results.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [7]:
# Radon2D with engine='numpy', interp=True and no `onthefly` argument
# (the "pre-computed table" case of this section).
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy',
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

16.3 ms ± 155 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
18.1 ms ± 480 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Radon2D with engine='numba', interp=True and no `onthefly` argument
# (the "pre-computed table" case of this section).
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the numpy-engine result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

40.5 µs ± 1.24 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
46.1 µs ± 9.87 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# Norm of the numpy-vs-numba difference for the forward (y) and adjoint (xadj) results.
# The recorded adjoint difference is of order 1e-15 rather than exactly zero.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.821487820240936e-15


Numpy vs numba with on-the-fly computation

In [10]:
# Radon2D with engine='numpy', interp=False and onthefly=True.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

16.4 ms ± 206 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.6 ms ± 797 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Radon2D with engine='numba', interp=False and onthefly=True.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the numpy-engine result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

253 µs ± 1.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
787 µs ± 20.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Norm of the numpy-vs-numba difference for the forward (y) and adjoint (xadj) results.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [13]:
# Radon2D with engine='numpy', interp=True and onthefly=True.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

24.9 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
27.3 ms ± 309 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Radon2D with engine='numba', interp=True and onthefly=True.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test with sizes nhx*nt and npx*nt.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the numpy-engine result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

258 µs ± 1.92 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
779 µs ± 11.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Norm of the numpy-vs-numba difference for the forward (y) and adjoint (xadj) results.
# The recorded adjoint difference is of order 1e-15 rather than exactly zero.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.932377424477932e-15


## Slant stack 3d

Initialization

In [16]:
# Axes for the 3D case; sizes come from the `par` dict defined in the 2D section.
dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt 
hx = np.arange(par['nhx']) * dh
hy = np.arange(par['nhy']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
# py reuses par['pxmax'] as its upper bound; `par` has no separate y entry.
py = np.linspace(0, par['pxmax'], par['npy'])
# Model of shape (npy, npx, nt) with a single unit sample at [2, 2, nt//2].
# This rebinds `x` (and `t`, `px`) used by the 2D cells above.
x = np.zeros((par['npy'], par['npx'], par['nt']))
x[2, 2, par['nt']//2] = 1
 
# The timings below measure operator construction only (nothing is applied).
# numpy
# NOTE(review): no `engine` argument is passed on this line, so it relies on
# Radon3D's default engine - presumably 'numpy'; confirm.
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], dtype='float64')
# numba
# Untimed call with the same arguments before the timed one; presumably a warm-up
# so that one-off JIT compilation is not included in the measurement - confirm.
Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

212 ms ± 921 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
109 ms ± 5.14 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with pre-computed table

In [18]:
# Radon3D with interp=False and no `onthefly` argument ("pre-computed table" case).
# No `engine` is passed; this is the run compared against engine='numba' below,
# so it presumably relies on the default engine being 'numpy' - confirm.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'],
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

238 ms ± 491 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
240 ms ± 267 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# Radon3D with engine='numba', interp=False and no `onthefly` argument
# ("pre-computed table" case).
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the reference-run result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

2.42 ms ± 21.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 5.10 times longer than the fastest. This could mean that an intermediate result is being cached.
796 µs ± 447 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Norm of the reference-vs-numba difference for the forward (y) and adjoint (xadj) results.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [21]:
# Radon3D with interp=True and no `onthefly` argument ("pre-computed table" case).
# No `engine` is passed; this is the run compared against engine='numba' below,
# so it presumably relies on the default engine being 'numpy' - confirm.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'],
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

401 ms ± 348 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
422 ms ± 355 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
# Radon3D with engine='numba', interp=True and no `onthefly` argument
# ("pre-computed table" case).
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the reference-run result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

3.4 ms ± 17.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
393 µs ± 35.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
# Norm of the reference-vs-numba difference for the forward (y) and adjoint (xadj) results.
# The recorded adjoint difference is of order 1e-13 rather than exactly zero.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.146737221823692e-13


Numpy vs numba with on-the-fly computation

In [24]:
# Radon3D with interp=False and onthefly=True.
# No `engine` is passed; this is the run compared against engine='numba' below,
# so it presumably relies on the default engine being 'numpy' - confirm.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True,
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

462 ms ± 251 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
478 ms ± 656 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
# Radon3D with engine='numba', interp=False and onthefly=True.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the reference-run result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

103 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
419 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# Norm of the reference-vs-numba difference for the forward (y) and adjoint (xadj) results.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [27]:
# Radon3D with interp=True and onthefly=True.
# No `engine` is passed; this is the run compared against engine='numba' below,
# so it presumably relies on the default engine being 'numpy' - confirm.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], onthefly=True,
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Forward on the flattened model, then the adjoint (Rop.H) on that result.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

651 ms ± 557 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
694 ms ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
# Radon3D with engine='numba', interp=True and onthefly=True.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test with sizes nhy*nhx*nt and npy*npx*nt.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
# NOTE(review): the adjoint is applied to `y` (the reference-run result), not `ynb`;
# presumably so both engines' adjoints receive the same input - confirm.
xadjnb = Rop1 * y.flatten()

# Time the forward and adjoint applications separately.
%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

115 ms ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
217 ms ± 24.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
# Norm of the reference-vs-numba difference for the forward (y) and adjoint (xadj) results.
# The recorded adjoint difference is of order 1e-13 rather than exactly zero.
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
3.9911132851821105e-13


Other stuff, work in progress

In [30]:
import numpy as np

from numba import vectorize, float64

@jit(nopython=True, nogil=True)
def fdouble(x):
    """Return ``x`` raised to the power of two.

    NOTE(review): despite the name, this squares its input; it does not
    double it.
    """
    squared = x**2
    return squared
@jit(nopython=True, nogil=True)
def passfunction(a, f, out):
    """Call ``f`` on ``a`` from inside jitted code and return the result.

    Parameters
    ----------
    a : input forwarded unchanged to ``f``
    f : callable invoked as ``f(a)``
    out : accepted for signature compatibility only

    Returns
    -------
    Whatever ``f(a)`` returns.

    NOTE(review): the array passed as ``out`` is never written to; the value
    returned is the fresh result of ``f(a)``. If an in-place fill was
    intended, that is not what this function does.
    """
    return f(a)

# Integer input and a separate float buffer handed in as `out`.
arr = np.arange(10)
out = np.zeros(10)
# The recorded output is an integer array, i.e. the value returned by
# fdouble(arr); the float `out` buffer created above is not what gets printed.
print(passfunction(arr, fdouble, out))

[ 0  1  4  9 16 25 36 49 64 81]


In [31]:
from numba import vectorize, guvectorize, float64, int64, float32, int32

def _hyperbolic(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)],
           nopython=True, target='cpu')
def _hyperbolic_numba(x, t, px):
    """Scalar kernel ``sqrt(t**2 + (x/px)**2)`` compiled with numba ``vectorize``.

    Built with ``target='cpu'`` for the float64/float32 signatures listed in
    the decorator (``t`` may also be int64 or int32).
    """
    scaled = x / px
    return np.sqrt(t**2 + scaled**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)],
           nopython=True, target='parallel')
def _hyperbolic_numba1(x, t, px):
    """Scalar kernel ``sqrt(t**2 + (x/px)**2)`` compiled with numba ``vectorize``.

    Same signatures and formula as ``_hyperbolic_numba``; the only difference
    is ``target='parallel'`` in the decorator.
    """
    scaled = x / px
    return np.sqrt(t**2 + scaled**2)

# Inputs for the comparison: 1000 integer samples, scalar t0 and px.
# NOTE(review): `x` and `px` rebind the names used by the Radon cells above, so
# those cells need their initialisation cell re-run if executed after this one.
x=np.arange(1000)
t0=10
px=0.1

# Plain numpy vs vectorize(target='cpu') vs vectorize(target='parallel').
%timeit -n 10 _hyperbolic(x, t0, px)
%timeit -n 10 _hyperbolic_numba(x, t0, px)
%timeit -n 10 _hyperbolic_numba1(x, t0, px)

19.6 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 6.45 times longer than the fastest. This could mean that an intermediate result is being cached.
23.9 µs ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 4.80 times longer than the fastest. This could mean that an intermediate result is being cached.
26.8 µs ± 22.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [32]:
@guvectorize([(float64[:], float64, float64[:])], '(n),()->(n)')
def g(x, y, res):
    """Write ``x[i] + y`` into ``res[i]`` for every element of ``x``.

    Declared with layout ``(n),()->(n)``: a 1-D float64 array and a float64
    scalar in, a 1-D float64 array of the same length out. The result is
    stored in ``res``; nothing is returned from the kernel body.
    """
    n = x.shape[0]
    for idx in range(n):
        res[idx] = x[idx] + y
        
# Five float64 ones plus the scalar 2.2; the recorded result is an array of 3.2.
a = np.ones(5, np.float64)
g(a, 2.2)

array([3.2, 3.2, 3.2, 3.2, 3.2])

In [33]:
from numba import stencil

def kernel(a):
    """Half the difference between columns two apart, along the second axis.

    For a 2-D input with ``n`` columns the result has ``n - 2`` columns:
    entry ``[:, j]`` equals ``0.5 * (a[:, j + 2] - a[:, j])``.
    """
    ahead = a[:, 2:]
    behind = a[:, :-2]
    return 0.5 * (ahead - behind)

# numba stencil counterpart of `kernel`: half the difference between the
# elements at offsets [0, 1] and [0, -1].
# NOTE(review): inside a @stencil kernel the indices are presumably relative to
# the current element (numba stencil convention) - confirm against numba docs.
@stencil
def kernel1(a):
    return 0.5 * (a[0, 1] - a[0, -1])

# 10x10 integer test array.
input_arr = np.arange(100).reshape(10, 10)
# Time the sliced-numpy version against the @stencil version (2 loops per run).
# Note that `kernel` returns only the 8 interior columns of its input.
%timeit -n 2 kernel(input_arr)
%timeit -n 2 kernel1(input_arr)

13.5 µs ± 5.71 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
177 ms ± 1.89 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)
