# Pylops - numba

### Author: M.Ravasi

In this notebook I will investigate the benifit of adding numba to various operators

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops

from scipy.sparse import csr_matrix, vstack, rand
from scipy.linalg import lstsq, solve, pinv
from scipy.sparse.linalg import LinearOperator as spLinearOperator
from scipy.sparse.linalg import LinearOperator, cg, lsqr

from pylops.utils                      import dottest
from pylops.utils.wavelets             import *
from pylops.utils.seismicevents        import *
from pylops.basicoperators             import *
from pylops.signalprocessing             import *
from pylops.waveeqprocessing.mdd       import *
from pylops.optimization.leastsquares  import *
from pylops.optimization.sparsity  import IRLS as IRLSpylops
from pylops.optimization.sparsity  import FISTA

import numba
from numba import stencil
from numba import jit

In [2]:
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['NUMBA_NUM_THREADS'] = '4'
os.environ['NUMBA_DEVELOPER_MODE'] = '0'

## Slant stack 2d

Initialization

In [3]:
par = {'nt': 11, 'nhx': 21, 'nhy': 10, 'npx':31, 'npy':21, 'pxmax':1e-2,
       'centeredh': True, 'kind': 'linear'} # linear, centered, linear interp

dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt
h = np.arange(par['nhx']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
x = np.zeros((par['npx'], par['nt']))
x[2, par['nt']//2] = 1
 
# numpy
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numpy', dtype='float64')
# numba
Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 10  Radon2D(t, h, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

2.72 ms ± 82.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
261 µs ± 7.14 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Numpy vs numba with pre-computed table

In [4]:
# Numpy
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

3.87 ms ± 67.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
3.87 ms ± 77.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Nunmba
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

29.7 µs ± 7.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
58 µs ± 21.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [7]:
# Numpy
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

7.05 ms ± 192 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
7.19 ms ± 98.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Numba
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba',
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

31.5 µs ± 1.03 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
76.1 µs ± 21.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.821487820240936e-15


Numpy vs numba with on-the-fly computation

In [10]:
# Numpy
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

7.59 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
7.64 ms ± 65 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Numba
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

281 µs ± 1.43 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
309 µs ± 90.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [13]:
# Numpy
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numpy', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

11.2 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
11.7 ms ± 200 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Numba
Rop = Radon2D(t, h, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', onthefly=True,
              dtype='float64')
dottest(Rop, par['nhx']*par['nt'], par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

325 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
345 µs ± 116 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.821487820240936e-15


## Slant stack 3d

Initialization

In [16]:
dt, dh = 0.005, 1
t = np.arange(par['nt']) * dt 
hx = np.arange(par['nhx']) * dh
hy = np.arange(par['nhy']) * dh
px = np.linspace(0, par['pxmax'], par['npx'])
py = np.linspace(0, par['pxmax'], par['npy'])
x = np.zeros((par['npy'], par['npx'], par['nt']))
x[2, 2, par['nt']//2] = 1
 
# numpy
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], dtype='float64')
# numba
Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, 
        kind=par['kind'], engine='numba', dtype='float64')
%timeit -n 3  Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'], interp=False, kind=par['kind'], engine='numba', dtype='float64')

84.2 ms ± 892 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
334 ms ± 21.8 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


Numpy vs numba with pre-computed table

In [17]:
# Numpy
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

88.7 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
88.2 ms ± 508 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Numba
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

2.11 ms ± 2.05 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
873 µs ± 35 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [20]:
# Numpy
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

159 ms ± 2.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
164 ms ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
# Numba
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

2.29 ms ± 19.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
866 µs ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.146737221823692e-13


Numpy vs numba with on-the-fly computation

In [23]:
# Numpy
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], onthefly=True, dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

193 ms ± 376 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
195 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# Numba
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=False, kind=par['kind'], engine='numba', 
              onthefly=True, dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

390 ms ± 72.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.12 s ± 203 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
0.0


In [26]:
# Numpy
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], 
              onthefly=True, dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

y = Rop * x.flatten()
Rop1 = Rop.H
xadj = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

293 ms ± 4.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
297 ms ± 5.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
# Numba
Rop = Radon3D(t, hy, hx, py, px, centeredh=par['centeredh'],
              interp=True, kind=par['kind'], engine='numba', 
              onthefly=True, dtype='float64')
dottest(Rop, par['nhy']*par['nhx']*par['nt'], par['npy']*par['npx']*par['nt'],
        complexflag=0)

ynb = Rop * x.flatten()
Rop1 = Rop.H
xadjnb = Rop1 * y.flatten()

%timeit -n 10 Rop * x.flatten()
%timeit -n 10 Rop1 * y.flatten()

306 ms ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
894 ms ± 128 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
print(np.linalg.norm(y-ynb))
print(np.linalg.norm(xadj-xadjnb))

0.0
4.146737221823692e-13


## 2. Vectorize ufunc 

Let's investigate if vectorizing ufunc like *_hyperbolic* bring any improvement

In [29]:
import numpy as np

from numba import vectorize, float64

@jit(nopython=True, nogil=True)
def fsquare(x):
    return x**2
@jit(nopython=True, nogil=True)
def passfunction(a, f, out):
    out = f(a)
    return out

arr = np.arange(10)
out = np.zeros(10)
print(fsquare(arr))
print(passfunction(arr, fsquare, out))

[ 0  1  4  9 16 25 36 49 64 81]
[ 0  1  4  9 16 25 36 49 64 81]


In [30]:
from numba import vectorize, guvectorize, float64, int64, float32, int32

def _hyperbolic(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
            nopython=True, target='cpu')
def _hyperbolic_numba(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

@vectorize([float64(float64, float64, float64),
            float64(float64, int64, float64),
            float64(float64, int32, float64),
            float32(float32, float32, float32),
            float32(float32, int32, float32)], 
            nopython=True, target='parallel')
def _hyperbolic_numba1(x, t, px):
    return np.sqrt(t**2 + (x/px)**2)

x=np.arange(1000)
t0=10
px=0.1

%timeit -n 10 _hyperbolic(x, t0, px)
%timeit -n 10 _hyperbolic_numba(x, t0, px)
%timeit -n 10 _hyperbolic_numba1(x, t0, px)

5.96 µs ± 1.46 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 10.53 times longer than the fastest. This could mean that an intermediate result is being cached.
7.57 µs ± 8.98 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The slowest run took 4.18 times longer than the fastest. This could mean that an intermediate result is being cached.
28.1 µs ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## 3. Guvectorize ufunc 

In [31]:
@guvectorize([(float64[:], float64, float64[:])], '(n),()->(n)')
def g(x, y, res):
    for i in range(x.shape[0]):
        res[i] = x[i] + y
        
a = np.ones(5, np.float64)
g(a, 2.2)

array([3.2, 3.2, 3.2, 3.2, 3.2])

## 4. Kernels

In [32]:
def deriv(a):
    a1 = np.zeros_like(a)
    a1[:, 1:-1] = 0.5 * (a[:, 2:] - a[:, 0:-2])
    return a1

@stencil
def _deriv1(a):
    return 0.5 * (a[0, 1] - a[0, -1])

@numba.guvectorize(
    [(numba.float64[:, :], numba.float64[:, :])],
    '(n, n) -> (n, n)', nopython=True, target='parallel')
def deriv1(x, out):
    out[:] = _deriv1(x)  

n=4
input_arr = np.arange(n*n).reshape(n, n)
output_arr = deriv1(input_arr)

print(input_arr)
print(deriv(input_arr))
print(deriv(input_arr) - output_arr)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[[0 1 1 0]
 [0 1 1 0]
 [0 1 1 0]
 [0 1 1 0]]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [33]:
n = 5000
input_arr = np.arange(n*n).reshape(n, n)

%timeit -n 2 deriv(input_arr)
%timeit -n 2 deriv1(input_arr)

215 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)
200 ms ± 991 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)


Let's make a first order derivative operator with stencils

In [34]:
n=10
D = 0.5*np.eye(n, n, 1) -0.5*np.eye(n, n, -1)
x = np.ones(n)
print(D)
print(D.T)

[[ 0.   0.5  0.   0.   0.   0.   0.   0.   0.   0. ]
 [-0.5  0.   0.5  0.   0.   0.   0.   0.   0.   0. ]
 [ 0.  -0.5  0.   0.5  0.   0.   0.   0.   0.   0. ]
 [ 0.   0.  -0.5  0.   0.5  0.   0.   0.   0.   0. ]
 [ 0.   0.   0.  -0.5  0.   0.5  0.   0.   0.   0. ]
 [ 0.   0.   0.   0.  -0.5  0.   0.5  0.   0.   0. ]
 [ 0.   0.   0.   0.   0.  -0.5  0.   0.5  0.   0. ]
 [ 0.   0.   0.   0.   0.   0.  -0.5  0.   0.5  0. ]
 [ 0.   0.   0.   0.   0.   0.   0.  -0.5  0.   0.5]
 [ 0.   0.   0.   0.   0.   0.   0.   0.  -0.5  0. ]]
[[ 0.  -0.5  0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.5  0.  -0.5  0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.5  0.  -0.5  0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.5  0.  -0.5  0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.5  0.  -0.5  0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.5  0.  -0.5  0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.5  0.  -0.5  0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.5  0.  -0.5  0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.5  0.

We start with pure numpy but here instead of doing like in real implementation we always try to do things that would fit stencil... it actually turns out that this implementation works as well (although stencils are not an option, they need to be implemented)

In [35]:
class FirstDerivative1(LinearOperator):
    def __init__(self, N, sampling=1., dims=None, dir=0, dtype='float64'):
        self.N = N
        self.sampling = sampling
        self.shape = (self.N, self.N)
        self.dims = dims if dims is not None else (self.N, )
        self.dir = dir
        self.dtype = np.dtype(dtype)
        self.explicit = False

    def _matvec(self, x):
        x = x.reshape(self.dims)
        y = np.zeros(self.dims, self.dtype)
        y[1:-1] = (0.5*x[2:]-0.5*x[0:-2])/self.sampling
        y[0] = 0.5*x[1] / self.sampling
        y[-1] = -0.5*x[-2] / self.sampling
        return y.ravel()

    def _rmatvec(self, x):
        x = x.reshape(self.dims)
        y = np.zeros(self.dims, self.dtype)
        y[1:-1] = (-0.5*x[2:]+0.5*x[0:-2])/self.sampling
        y[0] = -0.5*x[1] / self.sampling
        y[-1] = 0.5*x[-2] / self.sampling
        return y.ravel()

In [36]:
@stencil
def _deriv1(a):
    return 0.5*a[1] - 0.5*a[-1]

@numba.guvectorize(
    [(numba.float64[:], numba.float64[:])],
    '(n) -> (n)', nopython=True, target='cpu')
def deriv1(x, out):
    out[:] = _deriv1(x)  
    out[0] = 0.5*x[1]
    out[-1] = -0.5*x[-2]
        
@stencil
def _rderiv1(a):
    return -0.5*a[1] + 0.5*a[-1]

@numba.guvectorize(
    [(numba.float64[:], numba.float64[:])],
    '(n) -> (n)', nopython=True, target='cpu')
def rderiv1(x, out):
    out[:] = _rderiv1(x) 
    out[0] = -0.5*x[1]
    out[-1] = 0.5*x[-2]

class FirstDerivative_numba(LinearOperator):
    def __init__(self, N, sampling=1., dims=None, dir=0, dtype='float64'):
        self.N = N
        self.sampling = sampling
        self.shape = (self.N, self.N)
        self.dims = dims if dims is not None else (self.N, )
        self.dir = dir
        self.dtype = np.dtype(dtype)
        self.explicit = False

    def _matvec(self, x):
        x = x.reshape(self.dims)
        y = deriv1(x)
        return y.ravel()

    def _rmatvec(self, x):
        x = x.reshape(self.dims)
        y = rderiv1(x)
        return y.ravel()

In [37]:
from scipy.sparse import csc_matrix

n = 3000
D = 0.5*np.eye(n, n, 1) -0.5*np.eye(n, n, -1)
D = csc_matrix(D)

Dop = FirstDerivative(n, edge=True)
Dop1 = FirstDerivative1(n)
Dop1_numba = FirstDerivative_numba(n)
Dop2 = MatrixMult(D)

x = np.arange(n)
dottest(Dop, n, n)
dottest(Dop1, n, n)
dottest(Dop1_numba, n, n)
dottest(Dop2, n, n)

True

In [38]:
%timeit -n 10 -r 4 dottest(Dop, n, n)
%timeit -n 10 -r 4 dottest(Dop1, n, n)
%timeit -n 10 -r 4 dottest(Dop1_numba, n, n)
%timeit -n 10 -r 4 dottest(Dop2, n, n)

244 µs ± 13.6 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)
232 µs ± 8.89 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)
216 µs ± 3.62 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)
371 µs ± 6.53 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)


In [39]:
from scipy.sparse import csc_matrix

n, m, p = 1000, 100, 10
D = 0.5*np.eye(n, n, 1) -0.5*np.eye(n, n, -1)
D = csc_matrix(D)

Dop = FirstDerivative(dims=(n, m, p), axis=0, edge=True)
Dop1 = FirstDerivative1(n*m*p, dims=(n, m, p), dir=0)
Dop1_numba = FirstDerivative_numba(n*m*p, dims=(n, m, p), dir=0)
Dop2 = MatrixMult(D, otherdims=(m, p))

x = np.ones((n, m, p))
dottest(Dop, n*m*p, n*m*p)
dottest(Dop1, n*m*p, n*m*p)
dottest(Dop1_numba, n*m*p, n*m*p)
dottest(Dop2, n*m*p, n*m*p)

True

In [40]:
%timeit -n 10 -r 4 dottest(Dop, n*m*p, n*m*p)
%timeit -n 10 -r 4 dottest(Dop1, n*m*p, n*m*p)
%timeit -n 10 -r 4 dottest(Dop1_numba, n*m*p, n*m*p)
%timeit -n 10 -r 4 dottest(Dop2, n*m*p, n*m*p)

70 ms ± 810 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)
62.6 ms ± 1.61 ms per loop (mean ± std. dev. of 4 runs, 10 loops each)
85 ms ± 41.2 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)
61.8 ms ± 115 µs per loop (mean ± std. dev. of 4 runs, 10 loops each)
