# Pylops - numba

### Author: M.Ravasi

In this notebook I will investigate the benefit of adding numba to various operators.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops

from scipy.sparse import csr_matrix, vstack, rand
from scipy.linalg import lstsq, solve, pinv
from scipy.sparse.linalg import LinearOperator as spLinearOperator
from scipy.sparse.linalg import LinearOperator, cg, lsqr

from pylops.utils                      import dottest
from pylops.utils.wavelets             import *
from pylops.utils.seismicevents        import *
from pylops.basicoperators             import *
from pylops.signalprocessing             import *
from pylops.waveeqprocessing.mdd       import *
from pylops.optimization.leastsquares  import *
from pylops.optimization.sparsity  import IRLS as IRLSpylops
from pylops.optimization.sparsity  import FISTA

from numba import jit

In [2]:
# Numba settings for this session: thread count of 4 and developer mode set to '1'.
# NOTE(review): numba has already been imported by the first cell; numba appears
# to read NUMBA_* variables when it is imported, so these values may not take
# effect unless they are set before the imports - TODO confirm.
os.environ.update({
    'NUMBA_NUM_THREADS': '4',
    'NUMBA_DEVELOPER_MODE': '1',
})

## Slant stack 2d

Initialization

In [3]:
par = {'nt': 11, 'nhx': 21, 'nhy': 10, 'npx':31, 'npy':21, 'pxmax':1e-2,
       'centeredh': True, 'kind': 'linear'} # linear, centered, linear interp

dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt
h = np.arange(par['nhx']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
x = np.zeros((par['npx'], par['nt']))
x[2, par['nt']//2] = 1
 
# numpy
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numpy', dtype='float64')
# numba
Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

7.3 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
347 µs ± 3.66 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with pre-computed table

In [4]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

11.1 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
10.2 ms ± 49.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

91 µs ± 3.77 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
95.6 µs ± 4.37 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

0.0
0.0


In [7]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

16.4 ms ± 23.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
18.2 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

40.3 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
47.4 µs ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

0.0
4.821487820240936e-15


Numpy vs numba with on-the-fly computation

In [32]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

AttributeError: 'float' object has no attribute 'size'

In [None]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

In [None]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
# (this cell has no recorded execution count or output)
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

In [12]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

24.9 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
27.4 ms ± 548 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

254 µs ± 1.66 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
786 µs ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
# NOTE(review): the recorded output of this cell (about 3.64 and 13.90) is far
# from zero, unlike the pre-computed-table comparisons - the two engines appear
# to disagree for interp=True with onthefly=True; needs investigation.
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

3.6392917503883147
13.895252586565116


## Slant stack 3d

Initialization

In [15]:
hx = np.arange(par['nhx']) * dh
hy = np.arange(par['nhy']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
py = np.linspace(0, par['pxmax'], par['npy'])
x = np.zeros((par['npy'], par['npx'], par['nt']))
x[2, 2, par['nt']//2] = 1
 
# numpy
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], dtype='float64')
# numba
Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

220 ms ± 3.59 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
86.3 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with pre-computed table

In [16]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'],
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

225 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
226 ms ± 2.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

2.44 ms ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
742 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

0.0
0.0


In [19]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'],
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

406 ms ± 564 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
428 ms ± 4.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

3.42 ms ± 69.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.09 ms ± 12 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

0.0
4.146737221823692e-13


Numpy vs numba with on-the-fly computation

In [22]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

471 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
476 ms ± 814 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

80.7 ms ± 1.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
67.5 ms ± 7.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

0.0
0.0


In [25]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

616 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
642 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

80 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
98.3 ms ± 6.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
# Norm of the numpy-vs-numba difference: forward result first, then adjoint
# NOTE(review): the recorded output of this cell (about 11.33 and 280.23) is far
# from zero, unlike the other 3D comparisons - the two engines appear to
# disagree for interp=True with onthefly=True; needs investigation.
print(np.linalg.norm(y - ynb),
      np.linalg.norm(xadj - xadjnb), sep='\n')

11.327449453037127
280.2292252363112


Other stuff, work in progress

In [28]:
import numpy as np

from numba import vectorize, float64

@jit(nopython=True, nogil=True)
def fdouble(x):
    """Return ``x**2``.

    Note that, despite its name, this squares its argument rather than
    doubling it.
    """
    return x**2

@jit(nopython=True, nogil=True)
def passfunction(a, f, out):
    """Apply the jitted callable ``f`` to ``a`` and store the result in ``out``.

    Parameters
    ----------
    a : array
        Input passed to ``f``.
    f : callable
        Jitted function called as ``f(a)``.
    out : array
        Pre-allocated buffer that receives ``f(a)``.

    Returns
    -------
    out : array
        The same buffer that was passed in, now filled with ``f(a)``.
    """
    # Write into the caller's buffer; a plain `out = f(a)` would only rebind
    # the local name and leave the array that was passed in untouched.
    out[:] = f(a)
    return out

arr = np.arange(10)
# Allocate the output with the input's dtype so the stored values keep the
# dtype of `arr` instead of being cast to float.
out = np.zeros_like(arr)
print(passfunction(arr, fdouble, out))

[ 0  1  4  9 16 25 36 49 64 81]


In [29]:
from numba import vectorize, guvectorize, float64, int64, float32, int32

def _hyperbolic(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
           nopython=True, target='cpu')
def _hyperbolic_numba(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
           nopython=True, target='parallel')
def _hyperbolic_numba1(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

x=np.arange(1000)
t0=10
px=0.1

%timeit -n 10 _hyperbolic(x, t0, px)
%timeit -n 10 _hyperbolic_numba(x, t0, px)
%timeit -n 10 _hyperbolic_numba1(x, t0, px)

18.4 µs ± 1.57 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.2 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
21.3 µs ± 7.34 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
# Generalized ufunc with layout '(n),()->(n)': adds the scalar `y` to every
# element of the 1-D array `x`, writing the sums into the output array `res`
@guvectorize([(float64[:], float64, float64[:])], '(n),()->(n)')
def g(x, y, res):
    n = x.shape[0]
    for k in range(n):
        res[k] = x[k] + y

a = np.ones(5, dtype=np.float64)
g(a, 2.2)

array([3.2, 3.2, 3.2, 3.2, 3.2])

In [31]:
from numba import stencil

def kernel(a):
    return 0.5 * (a[:, 2:] - a[:, 0:-2])

@stencil
def kernel1(a):
    return 0.5 * (a[0, 1] - a[0, -1])

input_arr = np.arange(100).reshape(10, 10)
%timeit -n 2 kernel(input_arr)
%timeit -n 2 kernel1(input_arr)

10.2 µs ± 3.92 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
172 ms ± 6.66 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)
