# Pylops - numba

### Author: M.Ravasi

In this notebook I will investigate the benifit of adding numba to various operators

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops

from scipy.sparse import csr_matrix, vstack, rand
from scipy.linalg import lstsq, solve, pinv
from scipy.sparse.linalg import LinearOperator as spLinearOperator
from scipy.sparse.linalg import LinearOperator, cg, lsqr

from pylops.utils                      import dottest
from pylops.utils.wavelets             import *
from pylops.utils.seismicevents        import *
from pylops.basicoperators             import *
from pylops.signalprocessing             import *
from pylops.waveeqprocessing.mdd       import *
from pylops.optimization.leastsquares  import *
from pylops.optimization.sparsity  import IRLS as IRLSpylops
from pylops.optimization.sparsity  import FISTA

from numba import jit

In [2]:
os.environ['NUMBA_NUM_THREADS'] = '4'
os.environ['NUMBA_DEVELOPER_MODE'] = '0'

## Slant stack 2d

Initialization

In [3]:
par = {'nt': 11, 'nhx': 21, 'nhy': 10, 'npx':31, 'npy':21, 'pxmax':1e-2,
       'centeredh': True, 'kind': 'linear'} # linear, centered, linear interp

dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt
h = np.arange(par['nhx']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
x = np.zeros((par['npx'], par['nt']))
x[2, par['nt']//2] = 1
 
# numpy
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numpy', dtype='float64')
# numba
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

5.1 ms ± 513 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 11.56 times longer than the fastest. This could mean that an intermediate result is being cached.
78.1 ms ± 102 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with pre-computed table

In [4]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'],
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)


y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop.H * y.flatten()

14.9 ms ± 1.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
13.8 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y1 = Rop * x.flatten()
xadj1 = Rop.H * y1.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop.H * y.flatten()

95.2 µs ± 48.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
95.4 µs ± 13.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with on-the-fly computation

In [6]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)


y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

12.4 ms ± 819 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
13.2 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [7]:
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

The slowest run took 5.30 times longer than the fastest. This could mean that an intermediate result is being cached.
73.9 ms ± 65 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
79.1 ms ± 42.1 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


## Slant stack 3d

Initialization

In [8]:
hx = np.arange(par['nhx']) * dh
hy = np.arange(par['nhy']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
py = np.linspace(0, par['pxmax'], par['npy'])
x = np.zeros((par['npy'], par['npx'], par['nt']))
x[2, 2, par['nt']//2] = 1
 
# numpy
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], dtype='float64')
# numba
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

168 ms ± 10 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
823 ms ± 283 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with pre-computed table

In [9]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'],
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

180 ms ± 7.61 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
189 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [10]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

7.09 ms ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
The slowest run took 13.42 times longer than the fastest. This could mean that an intermediate result is being cached.
6.68 ms ± 6.95 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with on-the-fly computation

In [11]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

363 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
389 ms ± 43.1 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [12]:
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
xadj = Rop.H * y.flatten()

%timeit -n 3 Rop * x.flatten()
%timeit -n 3 Rop.H * y.flatten()

1.92 s ± 874 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
1.23 s ± 72.1 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Other stuff, work in progress

In [13]:
import numpy as np

from numba import vectorize, float64

@jit(nopython=True, nogil=True)
def fdouble(x):
    return x**2
@jit(nopython=True, nogil=True)
def passfunction(a, f, out):
    out = f(a)
    return out

arr = np.arange(10)
out = np.zeros(10)
print(passfunction(arr, fdouble, out))

[ 0  1  4  9 16 25 36 49 64 81]


In [14]:
from numba import vectorize, float64, int64, float32, int32

def _hyperbolic(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
           nopython=True, target='cpu')
def _hyperbolic_numba(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
           nopython=True, target='parallel')
def _hyperbolic_numba1(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

x=np.arange(1000)
t0=10
px=0.1

%timeit -n 10 _hyperbolic(x, t0, px)
%timeit -n 10 _hyperbolic_numba(x, t0, px)
%timeit -n 10 _hyperbolic_numba1(x, t0, px)

17 µs ± 1.86 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 5.82 times longer than the fastest. This could mean that an intermediate result is being cached.
25.4 µs ± 24.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.9 µs ± 19.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
@guvectorize([(float64[:], float64, float64[:])], '(n),()->(n)')
def g(x, y, res):
    for i in range(x.shape[0]):
        res[i] = x[i] + y
        
a = np.ones(5, np.float64)
g(a, 2.2)

NameError: name 'guvectorize' is not defined

In [None]:
from numba import stencil

def kernel(a):
    return 0.5 * (a[:, 2:] - a[:, 0:-2])

@stencil
def kernel1(a):
    return 0.5 * (a[0, 1] - a[0, -1])

input_arr = np.arange(100).reshape(10, 10)
%timeit -n 2 kernel(input_arr)
%timeit -n 2 kernel1(input_arr)