# PyLops - CUDA basic linear operators

### Author: M.Ravasi

In this notebook we experiment with PyTorch to assess its usability as a backend for CUDA-enabled operators.

In [1]:
!pip install pylops
!pip install git+https://git@github.com/equinor/pylops-gpu.git@master

Collecting pylops
[?25l  Downloading https://files.pythonhosted.org/packages/ba/5a/dc9d93cd0f9ba3ea9a77c30c92865f07523ebf2fc391dff19aeca2f2b848/pylops-1.4.0-py3-none-any.whl (141kB)
[K     |████████████████████████████████| 143kB 3.4MB/s 
Installing collected packages: pylops
Successfully installed pylops-1.4.0
Collecting git+https://git@github.com/equinor/pylops-gpu.git@master
  Cloning https://git@github.com/equinor/pylops-gpu.git (to revision master) to /tmp/pip-req-build-tzuejl6s
  Running command git clone -q https://git@github.com/equinor/pylops-gpu.git /tmp/pip-req-build-tzuejl6s
Collecting pytorch_complex_tensor (from pylops-gpu==0.0.0)
  Downloading https://files.pythonhosted.org/packages/0e/98/6e5718dd0c7d7b648c560624aea9a81b98b0442e03a7ff4c81430b0c8082/pytorch-complex-tensor-0.0.134.tar.gz
Building wheels for collected packages: pylops-gpu, pytorch-complex-tensor
  Building wheel for pylops-gpu (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cac

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%pylab inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import torch
import scipy as sp
import matplotlib.pyplot as plt

import pylops
from pylops import Diagonal
from pylops.utils import dottest
from pylops_gpu.utils.backend import device
from pylops_gpu.utils import dottest as gdottest
from pylops_gpu import Diagonal as gDiagonal

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Ask the pylops-gpu backend helper which device to run on; the result is
# reused as `dev` by the cells below.
dev = device()
status = 'PyLops-gpu working on %s...' % dev
print(status)

PyLops-gpu working on cuda...


## Diagonal

Example with model and data already on GPU

In [12]:
# Size of the model vector (and of the diagonal)
n = 100000

# Diagonal entries 1, 2, ..., n and an all-ones model, both moved to `dev`
dg = torch.arange(1, n + 1, dtype=torch.float32).to(dev)
xg = torch.ones(n, dtype=torch.float32).to(dev)

# Host-side NumPy copies used by the reference PyLops operator
d = dg.cpu().numpy()
x = xg.cpu().numpy()

# The same diagonal operator, NumPy-backed (Dop) and torch-backed (Dop_gpu)
Dop = Diagonal(d)
Dop_gpu = gDiagonal(dg, device=dev)

# Dot test of each implementation (verbose, so the result is printed)
dottest(Dop, n, n, verb=True)
gdottest(Dop_gpu, n, n, device=dev, verb=True)

# y = Dx, with x already living on `dev`
yg = Dop_gpu * xg
print(yg)

Dot test passed, v^T(Opu)=-7843362.240638 - u^T(Op^Tv)=-7843362.240638
Dot test passed, v^T(Opu)=1246191872.000000 - u^T(Op^Tv)=1246191872.000000
tensor([1.0000e+00, 2.0000e+00, 3.0000e+00,  ..., 9.9998e+04, 9.9999e+04,
        1.0000e+05], device='cuda:0')


In [10]:
%timeit -n 10 Dop * x
%timeit -n 10 Dop_gpu * xg

10 loops, best of 3: 101 µs per loop
10 loops, best of 3: 12.7 µs per loop


Example with model and data transferred to and from the GPU in forward and adjoint operations

In [6]:
# Same setup as the previous example: diagonal 1, 2, ..., n and an all-ones
# model, both created on `dev`
n = 100000
dg = torch.arange(1, n + 1, dtype=torch.float32).to(dev)
xg = torch.ones(n, dtype=torch.float32).to(dev)

# Host-side copies: a torch CPU tensor (xc) and NumPy arrays (x, d)
xc = xg.cpu()
x = xg.cpu().numpy()
d = dg.cpu().numpy()

Dop = Diagonal(d)
# togpu / tocpu are enabled for both entries of each pair so that the operator
# accepts and returns host-side data
# (presumably the pair is (forward, adjoint) -- confirm against pylops-gpu docs)
Dop_gpu = gDiagonal(
    dg,
    device=dev,
    togpu=(True, True),
    tocpu=(True, True),
)
gdottest(Dop_gpu, n, n, verb=True)

# y = Dx, applied directly to the NumPy model
y = Dop_gpu * x
# xinv = D^-1 y
# NOTE(review): the recorded output of xinv is far from the all-ones model x;
# presumably the solver behind `/` had not converged -- confirm.
xinv = Dop_gpu / y

print(y)
print(xinv)

Dot test passed, v^T(Opu)=1245179904.000000 - u^T(Op^Tv)=1245179904.000000
[1.0000e+00 2.0000e+00 3.0000e+00 ... 9.9998e+04 9.9999e+04 1.0000e+05]
[4.09996035e-07 1.63998405e-06 3.68996179e-06 ... 9.99280264e-01
 9.99063708e-01 9.98822355e-01]


In [8]:
Dop_gpu = gDiagonal(dg, device=dev, togpu=(True, True), tocpu=(True, True))

%timeit -n 10 Dop * x
%timeit -n 10 Dop_gpu * xc

10 loops, best of 3: 116 µs per loop
10 loops, best of 3: 363 µs per loop


Note here how we get beaten by the cost of moving x and y back and forth between CPU and GPU.

## 1D Convolution

In [0]:
# Lengths of the signal and of the filter
N, Nh = 11, 3

# Unit spike in the middle of an otherwise zero signal
x = np.zeros(N)
x[N // 2] = 1

# Filter with entries 1, 2, ..., Nh
h = np.arange(1, Nh + 1)

# Reference convolution, cropped to the length of the input signal
y = np.convolve(x, h, mode='same')
print(y)
print(y.shape)

[0. 0. 0. 0. 1. 2. 3. 0. 0. 0. 0.]
(11,)


In [0]:
# Same unit spike as in the NumPy example, as a torch tensor
xt = torch.zeros(N)
xt[N//2] = 1

# Filter with entries 1, 2, ..., Nh
ht = torch.arange(0, Nh, dtype=torch.float) + 1.

# The torch convolution routines expect 3D inputs, hence the reshapes.
# The filter length is taken from Nh (instead of a hard-coded 3) so the cell
# keeps working if Nh is changed.
yt = torch.conv_transpose1d(xt.reshape(1, 1, N), ht.reshape(1, 1, Nh), padding=Nh//2)
print(yt)
print(yt.shape)

tensor([[[0., 0., 0., 0., 1., 2., 3., 0., 0., 0., 0.]]])
torch.Size([1, 1, 11])


In [0]:
# NumPy reference: correlation of the spike with the filter
y = np.correlate(x, h, mode='same')
print(y)

# torch counterpart: conv1d reproduces the np.correlate result printed above.
# The filter length is taken from Nh (instead of a hard-coded 3).
yt = torch.conv1d(xt.reshape(1, 1, N), ht.reshape(1, 1, Nh), padding=Nh//2)
print(yt)
print(yt.shape)

[0. 0. 0. 0. 3. 2. 1. 0. 0. 0. 0.]
tensor([[[0., 0., 0., 0., 3., 2., 1., 0., 0., 0., 0.]]])
torch.Size([1, 1, 11])


In [0]:
xt = torch.zeros((1000, N))
xt[:, N//2] = 1
ht = torch.torch.arange(0, Nh, dtype=torch.float) + 1.

xc = xt.to(device)
hc = ht.to(device)

yt = torch.torch.conv1d(xt.reshape(1000, 1, N), ht.reshape(1, 1, 3), padding=Nh//2)
print(yt.shape)
yc = torch.torch.conv1d(xc.reshape(1000, 1, N), hc.reshape(1, 1, 3), padding=Nh//2)
print(yc.shape)

% timeit torch.torch.conv1d(xt.reshape(1000, 1, N), ht.reshape(1, 1, 3), padding=Nh//2)
% timeit torch.torch.conv1d(xc.reshape(1000, 1, N), hc.reshape(1, 1, 3), padding=Nh//2)

torch.Size([1000, 1, 11])
torch.Size([1000, 1, 11])
1000 loops, best of 3: 434 µs per loop
The slowest run took 15.01 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 40.6 µs per loop
