# Pylops - numba

### Author: M.Ravasi

In this notebook I will investigate the benefit of adding numba to various operators.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops

from scipy.sparse import csr_matrix, vstack, rand
from scipy.linalg import lstsq, solve, pinv
from scipy.sparse.linalg import LinearOperator as spLinearOperator
from scipy.sparse.linalg import LinearOperator, cg, lsqr

from pylops.utils                      import dottest
from pylops.utils.wavelets             import *
from pylops.utils.seismicevents        import *
from pylops.basicoperators             import *
from pylops.signalprocessing             import *
from pylops.waveeqprocessing.mdd       import *
from pylops.optimization.leastsquares  import *
from pylops.optimization.sparsity  import IRLS as IRLSpylops
from pylops.optimization.sparsity  import FISTA

from numba import jit

In [2]:
# Numba runtime settings, applied through the process environment:
# thread-pool size and developer mode.
# NOTE(review): numba (and pylops) are already imported by the cell above;
# confirm numba still picks these values up when they are set after import.
os.environ.update({
    'NUMBA_NUM_THREADS': '4',
    'NUMBA_DEVELOPER_MODE': '1',
})

## Slant stack 2d

Initialization

In [3]:
# Shared benchmark parameters: axis sizes (nt, nhx, nhy, npx, npy), the maximum
# slowness used for the p axes, and the Radon options reused by every cell below.
par = {'nt': 11, 'nhx': 21, 'nhy': 10, 'npx':31, 'npy':21, 'pxmax':1e-2,
       'centeredh': True, 'kind': 'linear'} # linear, centered, linear interp

# Sampling steps and the time / offset / slowness axes built from them.
dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt
h = np.arange(par['nhx']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
# Model of shape (npx, nt) holding a single unit spike at the middle time sample.
x = np.zeros((par['npx'], par['nt']))
x[2, par['nt']//2] = 1
 
# numpy: time the construction of the operator with the numpy engine
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numpy', dtype='float64')
# numba: one un-timed construction first (presumably to keep JIT compilation
# out of the measurement -- confirm), then time the construction
Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

7.94 ms ± 3.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
345 µs ± 4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with pre-computed table

In [4]:
# 2D Radon, numpy engine, interp=False; onthefly is not passed, so the
# operator's default is used.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy',
              dtype='float64')
# Dot test on the forward/adjoint pair before timing (data size nhx*nt,
# model size npx*nt).
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built here, outside
# %timeit, so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

9.7 ms ± 24.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
10.2 ms ± 351 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# 2D Radon, numba engine, interp=False; onthefly is not passed, so the
# operator's default is used.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

89.4 µs ± 3.23 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
112 µs ± 31.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# 2D Radon, numpy engine, interp=True; onthefly is not passed, so the
# operator's default is used.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy',
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built outside
# %timeit so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

15.9 ms ± 44 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
18 ms ± 811 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# 2D Radon, numba engine, interp=True; onthefly is not passed, so the
# operator's default is used.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

40.2 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
50.3 µs ± 3.49 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with on-the-fly computation

In [8]:
# 2D Radon, numpy engine, interp=False, with onthefly=True.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built outside
# %timeit so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

16 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17 ms ± 507 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# 2D Radon, numba engine, interp=False, with onthefly=True.
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

254 µs ± 1.46 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
802 µs ± 24.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)


y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

24.2 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
28.1 ms ± 1.9 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [11]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

258 µs ± 6.15 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
841 µs ± 50.6 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)


## Slant stack 3d

Initialization

In [12]:
# Offset axes in x and y and the two slowness axes for the 3D benchmark.
# Both slowness axes use par['pxmax'] as their upper bound.
# NOTE: px and x are re-assigned here, replacing the 2D versions defined earlier.
hx = np.arange(par['nhx']) * dh
hy = np.arange(par['nhy']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
py = np.linspace(0, par['pxmax'], par['npy'])
# Model of shape (npy, npx, nt) holding a single unit spike at the middle time sample.
x = np.zeros((par['npy'], par['npx'], par['nt']))
x[2, 2, par['nt']//2] = 1
 
# numpy: engine is not passed, so Radon3D's default engine is timed
# (presumably numpy -- confirm)
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], dtype='float64')
# numba: one un-timed construction first (presumably to keep JIT compilation
# out of the measurement -- confirm), then time the construction
Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

215 ms ± 702 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
94.9 ms ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with pre-computed table

In [13]:
# 3D Radon, interp=False; engine and onthefly are not passed, so the operator's
# defaults are used (presumably the numpy baseline of this comparison -- confirm).
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'],
              dtype='float64')
# Dot test on the forward/adjoint pair before timing (data size nhy*nhx*nt,
# model size npy*npx*nt).
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built outside
# %timeit so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

233 ms ± 440 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
225 ms ± 6.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# 3D Radon, numba engine, interp=False; onthefly is not passed, so the
# operator's default is used.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

2.42 ms ± 14.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
751 µs ± 17 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# 3D Radon, interp=True; engine and onthefly are not passed, so the operator's
# defaults are used (presumably the numpy baseline of this comparison -- confirm).
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'],
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built outside
# %timeit so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

394 ms ± 969 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
419 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# 3D Radon, numba engine, interp=True; onthefly is not passed, so the
# operator's default is used.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

3.37 ms ± 14.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
852 µs ± 20.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with on-the-fly computation

In [17]:
# 3D Radon, interp=False, onthefly=True; engine is not passed, so the operator's
# default is used (presumably the numpy baseline of this comparison -- confirm).
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True,
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built outside
# %timeit so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

457 ms ± 327 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
467 ms ± 347 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# 3D Radon, numba engine, interp=False, with onthefly=True.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

72.5 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
80 ms ± 342 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# 3D Radon, interp=True, onthefly=True; engine is not passed, so the operator's
# default is used (presumably the numpy baseline of this comparison -- confirm).
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], onthefly=True,
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# Apply forward and adjoint once; the adjoint operator is built outside
# %timeit so only the products are measured below.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

607 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
634 ms ± 2.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# 3D Radon, numba engine, interp=True, with onthefly=True.
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
# Dot test on the forward/adjoint pair before timing.
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

# One un-timed forward and adjoint application ahead of %timeit (presumably this
# also absorbs any JIT compilation cost -- confirm). The adjoint operator is
# built once here and reused in the timing.
y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

84.9 ms ± 1.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
80.3 ms ± 5.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Other stuff, work in progress

In [21]:
import numpy as np

from numba import vectorize, float64

@jit(nopython=True, nogil=True)
def fdouble(x):
    """Return ``x`` raised to the power of two.

    Note: despite its name this function squares its input rather than
    doubling it; the name is kept so existing references keep working.
    """
    return x**2

@jit(nopython=True, nogil=True)
def passfunction(a, f, out):
    """Apply the jitted function ``f`` to ``a`` and store the result in ``out``.

    Parameters
    ----------
    a : array
        Input passed to ``f``.
    f : jitted function
        Function called as ``f(a)``.
    out : array
        Pre-allocated buffer that receives ``f(a)``; it must be assignable
        from the result of ``f(a)`` (same shape).

    Returns
    -------
    out : array
        The same buffer that was passed in, now holding ``f(a)``.
    """
    # Slice-assign so the caller's buffer is actually filled; a plain
    # ``out = f(a)`` would only rebind the local name and ignore ``out``.
    out[:] = f(a)
    return out

arr = np.arange(10)
# Allocate the output with the input's dtype so the stored values keep their
# integer type.
out = np.zeros_like(arr)
print(passfunction(arr, fdouble, out))

[ 0  1  4  9 16 25 36 49 64 81]


In [22]:
from numba import vectorize, guvectorize, float64, int64, float32, int32

def _hyperbolic(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
           nopython=True, target='cpu')
def _hyperbolic_numba(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
           nopython=True, target='parallel')
def _hyperbolic_numba1(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

x=np.arange(1000)
t0=10
px=0.1

%timeit -n 10 _hyperbolic(x, t0, px)
%timeit -n 10 _hyperbolic_numba(x, t0, px)
%timeit -n 10 _hyperbolic_numba1(x, t0, px)

19.8 µs ± 1.81 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 6.68 times longer than the fastest. This could mean that an intermediate result is being cached.
23.9 µs ± 26.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 43.26 times longer than the fastest. This could mean that an intermediate result is being cached.
122 µs ± 256 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
# guvectorize and float64 come from the numba import in the previous cell.
@guvectorize([(float64[:], float64, float64[:])], '(n),()->(n)')
def g(x, y, res):
    """Add the scalar ``y`` to every element of ``x``, writing into ``res``."""
    n_samples = x.shape[0]
    for idx in range(n_samples):
        res[idx] = x[idx] + y


a = np.ones(5, dtype=np.float64)
g(a, 2.2)

array([3.2, 3.2, 3.2, 3.2, 3.2])

In [24]:
from numba import stencil

def kernel(a):
    """Centred difference along the last axis using numpy slicing.

    The two slices drop the first/last two columns, so the result has two
    fewer columns than ``a``.
    """
    return 0.5 * (a[:, 2:] - a[:, 0:-2])

@stencil
def kernel1(a):
    """Same centred difference written as a numba stencil.

    Indices are offsets relative to the current element.
    NOTE(review): the stencil output presumably keeps the input shape (with
    border elements filled by numba), so it is not shape-identical to
    ``kernel`` -- confirm before comparing results.
    """
    return 0.5 * (a[0, 1] - a[0, -1])

input_arr = np.arange(100).reshape(10, 10)
# NOTE(review): kernel1 is called directly from Python here, not from inside a
# jitted function; its recorded time is orders of magnitude larger than the
# numpy version, which may reflect compilation/dispatch overhead rather than
# the stencil itself -- confirm.
%timeit -n 2 kernel(input_arr)
%timeit -n 2 kernel1(input_arr)

10.3 µs ± 4.39 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
172 ms ± 6.79 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)
