## Patching 3D

In [28]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import pylops

from pylops.signalprocessing.patch3d import patch3d_design
from pylops.utils.describe import describe

from pylops.signalprocessing import Patch3D
from patch3dol import Patch3D as Patch3DOLD

# Backend switch for the whole notebook: when True, the next cell rebinds `np`
# to CuPy and the timing cells use `cupyx.profiler.benchmark`; when False,
# NumPy stays bound to `np` and the timing cells use `%timeit`.
USE_CUPY = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Backend-dependent aliases used by the rest of the notebook. Each branch binds
# the same set of names (array conversion helpers, real/complex dtypes, FFT
# engine and its keyword arguments) so later cells stay backend-agnostic.
if not USE_CUPY:
    # CPU path: `np` is still NumPy here.
    np_asarray, np_asnumpy = np.asarray, np.asarray
    np_float, np_floatc = np.float64, np.complex128
    fftengine, fftkwargs = 'scipy', {'workers': 16}
else:
    # GPU path: from here on `np` refers to CuPy, shadowing the NumPy import.
    import cupy as np
    from cupyx.profiler import benchmark
    np_asarray, np_asnumpy = np.asarray, np.asnumpy
    np_float, np_floatc = np.float32, np.complex64
    mempool = np.get_default_memory_pool()
    fftengine, fftkwargs = 'numpy', {}


In [30]:
def bench_Op(Op, x):
    """Apply ``Op`` to ``x`` with the ``@`` operator and return the result.

    Kept as a plain function so it can be handed to a benchmarking helper
    together with its ``(Op, x)`` arguments.
    """
    forward = Op @ x
    return forward

def bench_OpH(Op, x):
    """Apply ``Op.H`` to ``x`` with the ``@`` operator and return the result.

    Kept as a plain function so it can be handed to a benchmarking helper
    together with its ``(Op, x)`` arguments.
    """
    adjoint = Op.H
    return adjoint @ x

In [31]:
# Patching parameters passed to patch3d_design / Patch3D below
# (presumably window size, overlap and transformed-domain size per axis —
# confirm against the PyLops Patch3D documentation).
nwin = (42, 42, 34)
nover = (10, 10, 4)
nop = (64, 64, 64)
dimsd = (500, 100, 200) # small
#dimsd = (1000, 500, 500) # large
# Taper type forwarded to every patching operator built in this cell, so
# changing it here affects the old and new implementations alike.
tapertype = 'cosine'

# Deterministic test data of shape `dimsd` in the backend's real dtype.
y = np.arange(dimsd[0]*dimsd[1]*dimsd[2]).reshape(dimsd).astype(np_float)

nwins, dims, mwin_inends, dwin_inends = patch3d_design(dimsd, nwin, nover, nop)

# no operator broadcast: the FFT is defined on a single window of shape `nwin`
Op = pylops.signalprocessing.FFTND(nwin, nffts=nop,
                                   engine=fftengine,
                                   dtype=np_floatc,
                                   **fftkwargs)
Slid = Patch3DOLD(Op.H, dims, dimsd, nwin, nover, nop, tapertype=tapertype)
Slid1a = Patch3D(Op.H, dims, dimsd, nwin, nover, nop, tapertype=tapertype)

# with operator broadcast: the FFT is defined on shape (*nwins, *nwin)
Op = pylops.signalprocessing.FFTND((*nwins, *nwin), nffts=nop,
                                   engine=fftengine,
                                   dtype=np_floatc,
                                   **fftkwargs)
Slid1b = Patch3D(Op.H, dims, dimsd, nwin, nover, nop, tapertype=tapertype)

# Model vector obtained from the old operator's adjoint; reused as input for
# the forward comparisons and benchmarks.
x = Slid.H * y.ravel()



In [32]:
# Compare each new operator against the old implementation: one printed line
# per operator, showing forward agreement followed by adjoint agreement.
for candidate in (Slid1a, Slid1b):
    fwd_ok = np.allclose(Slid @ x, candidate @ x)
    adj_ok = np.allclose(Slid.H @ y, candidate.H @ y)
    print(fwd_ok, adj_ok)

True True
True True


In [33]:
if not USE_CUPY:
    %timeit -n 5 -r 10 Slid * x # OLD
    %timeit -n 5 -r 10 Slid1a * x # NEW
    %timeit -n 5 -r 10 Slid1b * x # NEW with Op broadcasted
else:
    print(benchmark(bench_Op, (Slid, x,), n_repeat=20))
    print(benchmark(bench_Op, (Slid1a, x,), n_repeat=20))
    print(benchmark(bench_Op, (Slid1b, x,), n_repeat=20))

bench_Op            :    CPU: 98667.811 us   +/- 846.243 (min: 97299.004 / max: 100450.822) us     GPU-0: 104002.600 us   +/- 846.976 (min: 102630.402 / max: 105793.533) us
bench_Op            :    CPU: 80825.131 us   +/- 886.752 (min: 78533.284 / max: 81987.781) us     GPU-0: 80831.898 us   +/- 886.924 (min: 78539.619 / max: 81996.033) us
bench_Op            :    CPU:  7215.120 us   +/- 342.974 (min:  6863.846 / max:  8490.250) us     GPU-0:  7222.347 us   +/- 343.207 (min:  6870.848 / max:  8498.112) us


In [34]:
if not USE_CUPY:
    %timeit -n 5 -r 10 Slid.H * y # OLD
    %timeit -n 5 -r 10 Slid1a.H * y # NEW
    %timeit -n 5 -r 10 Slid1b.H * y # NEW with Op broadcasted
else:
    print(benchmark(bench_OpH, (Slid, y,), n_repeat=20))
    print(benchmark(bench_OpH, (Slid1a, y,), n_repeat=20))
    print(benchmark(bench_OpH, (Slid1b, y,), n_repeat=20))

bench_OpH           :    CPU: 67042.146 us   +/- 848.052 (min: 65822.656 / max: 69077.217) us     GPU-0: 67048.936 us   +/- 848.322 (min: 65829.086 / max: 69084.160) us
bench_OpH           :    CPU: 41348.440 us   +/- 565.801 (min: 40602.660 / max: 42461.556) us     GPU-0: 41355.630 us   +/- 565.841 (min: 40609.791 / max: 42468.513) us
bench_OpH           :    CPU:   643.734 us   +/- 26.571 (min:   622.674 / max:   716.201) us     GPU-0:  6887.696 us   +/- 17.406 (min:  6873.088 / max:  6938.624) us
