# Pylops - numba

### Author: M.Ravasi

In this notebook I will investigate the benefit of adding numba to various operators.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops

from scipy.sparse import csr_matrix, vstack, rand
from scipy.linalg import lstsq, solve, pinv
from scipy.sparse.linalg import LinearOperator as spLinearOperator
from scipy.sparse.linalg import LinearOperator, cg, lsqr

from pylops.utils                      import dottest
from pylops.utils.wavelets             import *
from pylops.utils.seismicevents        import *
from pylops.basicoperators             import *
from pylops.signalprocessing             import *
from pylops.waveeqprocessing.mdd       import *
from pylops.optimization.leastsquares  import *
from pylops.optimization.sparsity  import IRLS as IRLSpylops
from pylops.optimization.sparsity  import FISTA

from numba import jit

In [2]:
# Numba reads its NUMBA_* settings from the environment when it is imported,
# and by this point numba has already been imported by the previous cell
# (``from numba import jit``). Export the variables and then ask numba to
# re-read its configuration; without the reload the two assignments below
# would have no effect on the running kernel.
# Run this cell before any parallel numba code is executed: numba refuses to
# change the number of threads once its thread pool has been launched.
os.environ['NUMBA_NUM_THREADS'] = '4'     # number of threads for parallel targets
os.environ['NUMBA_DEVELOPER_MODE'] = '1'  # developer mode (more verbose errors)

from numba import config as numba_config
numba_config.reload_config()

## Slant stack 2d

Initialization

In [3]:
par = {'nt': 11, 'nhx': 21, 'nhy': 10, 'npx':31, 'npy':21, 'pxmax':1e-2,
       'centeredh': True, 'kind': 'linear'} # linear, centered, linear interp

dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt
h = np.arange(par['nhx']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
x = np.zeros((par['npx'], par['nt']))
x[2, par['nt']//2] = 1
 
# numpy
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numpy', dtype='float64')
# numba
Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

5.61 ms ± 298 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
642 µs ± 40.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with pre-computed table

In [4]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop.H * y.flatten()

8.59 ms ± 246 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
9.46 ms ± 970 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop.H * y.flatten()

57.8 µs ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 4.12 times longer than the fastest. This could mean that an intermediate result is being cached.
138 µs ± 90.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop.H * y.flatten()

14.6 ms ± 355 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.6 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop.H * y.flatten()

76.8 µs ± 4.16 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
131 µs ± 84.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with on-the-fly computation

In [8]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

16.4 ms ± 1.19 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
17.5 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [9]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

988 µs ± 83.5 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
1.06 ms ± 514 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [10]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)


y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

The slowest run took 5.84 times longer than the fastest. This could mean that an intermediate result is being cached.
52.6 ms ± 39.5 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
41.5 ms ± 7.04 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [11]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

The slowest run took 11.35 times longer than the fastest. This could mean that an intermediate result is being cached.
4.48 ms ± 4.08 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
The slowest run took 86.83 times longer than the fastest. This could mean that an intermediate result is being cached.
8.56 ms ± 13.9 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


## Slant stack 3d

Initialization

In [12]:
hx = np.arange(par['nhx']) * dh
hy = np.arange(par['nhy']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
py = np.linspace(0, par['pxmax'], par['npy'])
x = np.zeros((par['npy'], par['npx'], par['nt']))
x[2, 2, par['nt']//2] = 1
 
# numpy
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], dtype='float64')
# numba
Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

235 ms ± 41.8 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
490 ms ± 30.1 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with pre-computed table

In [13]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'],
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

228 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
224 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [14]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

4.86 ms ± 421 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
1.7 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [16]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'],
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

424 ms ± 20 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
433 ms ± 52.9 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [17]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

8.21 ms ± 256 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
3.54 ms ± 405 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with on-the-fly computation

In [18]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

460 ms ± 35 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
469 ms ± 44.2 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [19]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

287 ms ± 86.3 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
552 ms ± 39 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [20]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

652 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
720 ms ± 23.2 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [21]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

297 ms ± 114 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
346 ms ± 97.2 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Other stuff, work in progress

In [None]:
import numpy as np

from numba import vectorize, float64

@jit(nopython=True, nogil=True)
def fdouble(x):
    """Return ``x`` raised to the power of two.

    Note that, despite its name, this function squares its input rather
    than doubling it.
    """
    squared = x ** 2
    return squared
@jit(nopython=True, nogil=True)
def passfunction(a, f, out):
    """Apply ``f`` to ``a`` and store the result in the caller's ``out``.

    Parameters
    ----------
    a : array
        Input passed to ``f``.
    f : callable
        Function applied to ``a`` (a jitted function in this notebook).
    out : array
        Pre-allocated output buffer; must be able to hold ``f(a)``.

    Returns
    -------
    out : array
        The same buffer that was passed in, now filled with ``f(a)``.
    """
    # Write into the buffer. A plain ``out = f(a)`` would only rebind the
    # local name and leave the array supplied by the caller untouched.
    out[:] = f(a)
    return out

# Pass the jitted ``fdouble`` as an argument to the jitted ``passfunction``
# together with an input array and a pre-allocated array, and print the
# value that ``passfunction`` returns.
arr = np.arange(10)
out = np.zeros(10)
print(passfunction(arr, fdouble, out))

In [None]:
from numba import vectorize, guvectorize, float64, int64, float32, int32

def _hyperbolic(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize(
    [
        float64(float64, float64, float64),
        float64(float64, int64, float64),
        float64(float64, int32, float64),
        float32(float32, float32, float32),
        float32(float32, int32, float32),
    ],
    nopython=True,
    target='cpu',
)
def _hyperbolic_numba(x, t, px):
    """Numba ufunc (``target='cpu'``) evaluating ``sqrt(t**2 + (x/px)**2)``."""
    scaled = x / px
    return np.sqrt(t ** 2 + scaled ** 2)

@vectorize(
    [
        float64(float64, float64, float64),
        float64(float64, int64, float64),
        float64(float64, int32, float64),
        float32(float32, float32, float32),
        float32(float32, int32, float32),
    ],
    nopython=True,
    target='parallel',
)
def _hyperbolic_numba1(x, t, px):
    """Numba ufunc (``target='parallel'``) evaluating ``sqrt(t**2 + (x/px)**2)``."""
    scaled = x / px
    return np.sqrt(t ** 2 + scaled ** 2)

x=np.arange(1000)
t0=10
px=0.1

%timeit -n 10 _hyperbolic(x, t0, px)
%timeit -n 10 _hyperbolic_numba(x, t0, px)
%timeit -n 10 _hyperbolic_numba1(x, t0, px)

In [None]:
@guvectorize([(float64[:], float64, float64[:])], '(n),()->(n)')
def g(x, y, res):
    """Add the scalar ``y`` to each entry of ``x``, writing the sums into ``res``."""
    n = x.shape[0]
    for k in range(n):
        res[k] = x[k] + y
        
# Call ``g`` on five ones and the scalar 2.2. Only the two inputs are passed;
# ``res`` corresponds to the ``->(n)`` output of the layout string.
a = np.ones(5, np.float64)
g(a, 2.2)

In [None]:
from numba import stencil

def kernel(a):
    """Half the difference between columns two apart of the 2D array ``a``.

    Entry ``[i, j]`` of the result is ``0.5 * (a[i, j + 2] - a[i, j])``, so
    the output has two columns fewer than ``a``.
    """
    right = a[:, 2:]
    left = a[:, :-2]
    return 0.5 * (right - left)

# numba stencil version of the difference computed by ``kernel``.
@stencil
def kernel1(a):
    # Indices inside a stencil kernel are offsets relative to the element
    # being computed: half the difference between the entries one column to
    # the right and one column to the left, on the same row.
    return 0.5 * (a[0, 1] - a[0, -1])

# Time the numpy slicing version against the numba stencil version on the
# same 10 x 10 integer array.
input_arr = np.arange(100).reshape(10, 10)
%timeit -n 2 kernel(input_arr)
%timeit -n 2 kernel1(input_arr)