# Pylops - torch operator

### Author: M.Ravasi

In this notebook I will show how to use the `TorchOperator` to mix and match pylops and pytorch operators into an AD-friendly chain of operations.

In [66]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

#import warnings
#warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import torch
import torch.nn as nn

from torch.autograd import gradcheck
from pylops.torchoperator import TorchOperator
from pylops.basicoperators import *
from pylops.signalprocessing import Convolve2D

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## AD

Single batch

In [67]:
# Single-sample check: gradient of v^T A sin(x) obtained through TorchOperator
# versus the closed-form expression and torch's numerical gradcheck
nx, ny = 10, 6
x0 = torch.arange(nx, dtype=torch.double, requires_grad=True)

# Forward
A = np.random.normal(0., 1., (ny, nx))
Aop = TorchOperator(MatrixMult(A))
y = Aop.apply(torch.sin(x0))

# AD
v = torch.ones(ny, dtype=torch.double)
y.backward(v, retain_graph=True)
adgrad = x0.grad

# Analytical
# The Jacobian of A sin(x) is A * cos(x) (cos(x) broadcast along the columns of A).
# It is evaluated without autograd tracking so that the reference gradient is a
# plain tensor and not part of a second graph hanging off x0.
At = torch.from_numpy(A)
with torch.no_grad():
    J = At * torch.cos(x0)
    anagrad = torch.matmul(J.T, v)
print(J.shape)

print('Input: ', x0)
print('AD gradient: ', adgrad)
print('Analytical gradient: ', anagrad)

# Grad check
# Arguments are passed positionally to Aop.Top: the tensor to differentiate
# followed by the forward/adjoint callables and the two device settings.
# (named gradcheck_inputs to avoid shadowing the builtin `input`)
gradcheck_inputs = (torch.arange(nx, dtype=torch.double, requires_grad=True),
                    Aop.matvec, Aop.rmatvec, Aop.device, 'cpu')
test = gradcheck(Aop.Top, gradcheck_inputs, eps=1e-6, atol=1e-4)
print(test)

torch.Size([6, 10])
Input:  tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=torch.float64,
       requires_grad=True)
AD gradient:  tensor([-3.5634,  1.5422, -1.1205,  3.1014, -0.7585, -0.0989,  2.0274,  1.0850,
         0.2566, -1.8949], dtype=torch.float64)
Analytical gradient:  tensor([-3.5634,  1.5422, -1.1205,  3.1014, -0.7585, -0.0989,  2.0274,  1.0850,
         0.2566, -1.8949], dtype=torch.float64, grad_fn=<MvBackward0>)
True


Multi batch: here we should get the sum of gradients

In [68]:
# Batched check: gradient through TorchOperator(batch=True) versus the gradient
# through an equivalent bias-free nn.Linear layer holding the same matrix
nbatch, nx, ny = 5, 3, 6
x0 = torch.arange(nbatch * nx, dtype=torch.float).reshape(nbatch, nx)
x0.requires_grad=True

# Forward
A = np.random.normal(0., 1., (ny, nx)).astype(np.float32)
Aop = TorchOperator(MatrixMult(A), batch=True)
y = Aop.apply(torch.sin(x0))

# AD
v = torch.ones((nbatch, ny), dtype=torch.float32)
y.backward(v, retain_graph=True)
# Take a copy: x0.grad is reused by the second backward pass below, so keeping
# a bare reference would leave adgrad pointing at the analytical result.
adgrad = x0.grad.clone()
print('AD gradient: ', adgrad)

# Analytical
# Drop the accumulated gradient so the next backward starts from scratch
x0.grad = None
At = torch.from_numpy(A)
Lin = nn.Linear(nx, ny, bias=False)
# Load A into the layer weights without recording the copy in autograd
with torch.no_grad():
    Lin.weight.copy_(At)
y1 = Lin(torch.sin(x0))
y1.backward(v, retain_graph=True)
anagrad = x0.grad

print('Analytical gradient: ', anagrad)

AD gradient:  tensor([[-2.1960, -0.3267, -1.5877],
        [ 2.1741,  0.3953,  1.0822],
        [-2.1086, -0.4559, -0.5551],
        [ 2.0009,  0.5074,  0.0169],
        [-1.8531, -0.5487,  0.5217]])
Analytical gradient:  tensor([[-2.1960, -0.3267, -1.5877],
        [ 2.1741,  0.3953,  1.0822],
        [-2.1086, -0.4559, -0.5551],
        [ 2.0009,  0.5074,  0.0169],
        [-1.8531, -0.5487,  0.5217]])


In [69]:
# Batched check with a scalar loss: mean(y**2) is back-propagated through
# TorchOperator and, independently, through an nn.Linear holding the same matrix
nbatch, nx, ny = 5, 3, 6
x0 = torch.arange(nbatch * nx, dtype=torch.float).reshape(nbatch, nx)
x0.requires_grad_()

# Forward
A = np.random.normal(0., 1., (ny, nx)).astype(np.float32)
Aop = TorchOperator(MatrixMult(A), batch=True)
y = Aop.apply(torch.sin(x0))
l = (y ** 2).mean()
l.backward()
adgrad = x0.grad
print('AD gradient: ', adgrad)

# Analytical
# A separate leaf tensor is used so that its gradient is independent of x0.grad
x1 = torch.arange(nbatch * nx, dtype=torch.float).reshape(nbatch, nx)
x1.requires_grad_()
At = torch.from_numpy(A)
Lin = nn.Linear(nx, ny, bias=False)
Lin.weight.data.copy_(At.float())
y1 = Lin(torch.sin(x1))
l1 = (y1 ** 2).mean()
l1.backward()
anagrad = x1.grad

print('Analytical gradient: ', anagrad)

AD gradient:  tensor([[ 0.2593,  0.4270, -0.0529],
        [ 0.1449,  0.4336, -0.0390],
        [ 0.0292,  0.3944, -0.0212],
        [-0.0784,  0.3124, -0.0007],
        [-0.1695,  0.1943,  0.0208]])
Analytical gradient:  tensor([[ 0.2593,  0.4270, -0.0529],
        [ 0.1449,  0.4336, -0.0390],
        [ 0.0292,  0.3944, -0.0212],
        [-0.0784,  0.3124, -0.0007],
        [-0.1695,  0.1943,  0.0208]])


## Mixing NN and Physics

In [70]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [71]:
# Device kind as a plain string; shown as the cell output
device.type

'cuda'

In [72]:
class Network(nn.Module):
    """Stack of four 3x3 convolutions that progressively reduces the channel count.

    The channels go from ``input_channels`` to ``input_channels // 2``,
    ``// 4``, ``// 8`` and finally ``// 32``. Every convolution uses
    ``padding=1`` and is followed by a LeakyReLU with negative slope 0.2.
    A 2x2 max-pooling layer is registered as ``maxpool`` but is not called
    in ``forward``.

    Parameters
    ----------
    input_channels : int
        Number of channels of the input tensor.
    """

    def __init__(self, input_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1)
        half = input_channels // 2
        quarter = input_channels // 4
        eighth = input_channels // 8
        self.conv1 = nn.Conv2d(input_channels, half, **conv_kwargs)
        self.conv2 = nn.Conv2d(half, quarter, **conv_kwargs)
        self.conv3 = nn.Conv2d(quarter, eighth, **conv_kwargs)
        self.conv4 = nn.Conv2d(eighth, input_channels // 32, **conv_kwargs)
        self.activation = nn.LeakyReLU(0.2)
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Apply the four convolutions in order, each followed by the activation."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.activation(conv(x))
        return x

In [73]:
# Two separately initialised 32-channel networks: one is left where it was
# created, the other is moved to `device`; the last line displays its layout
net_cpu = Network(32)
net_gpu = Network(32).to(device)
net_gpu

Network(
  (conv1): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(8, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(4, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (activation): LeakyReLU(negative_slope=0.2)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

In [74]:
# CPU
# Benchmark: network followed by a pylops Transpose wrapped in TorchOperator,
# with the operator applied to a flattened (1D) tensor
n = 512
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_cpu = TorchOperator(Pop, device='cpu')

# forward
# the network output is flattened with view(-1) before entering the operator
y = Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).view(-1)) # dry run
print(y.device, y.shape)
# timed statement includes input creation, network and operator (2 loops, 2 repeats)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).view(-1))

# backward
# a fresh graph is built first; backward is timed exactly once (-n1 -r1)
# because backward() is called without retain_graph
y = Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).view(-1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cpu torch.Size([262144])
17.6 ms ± 1.61 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
33.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [75]:
# CPU with NDarray
# Same benchmark, but the operator is defined on (1, n, n) arrays and receives
# the network output without flattening (only the leading axis is squeezed)
n = 512
Pop = Transpose(dims=(1, n, n), axes=(0,2,1))
Pop_torch_cpu = TorchOperator(Pop, device='cpu')

# forward
y = Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).squeeze(0))
print(y.device, y.shape)
# timed statement includes input creation, network and operator (2 loops, 2 repeats)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).squeeze(0))

# backward
# a fresh graph is built first; backward is timed exactly once (-n1 -r1)
# because backward() is called without retain_graph
y = Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).squeeze(0))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cpu torch.Size([1, 512, 512])
19.6 ms ± 995 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
30 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [76]:
# CPU with batch 
# Same benchmark with batch=True: a batch of 4 inputs goes through the network
# and each sample is flattened (view(4, -1)) before entering the operator
n = 512
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_cpu = TorchOperator(Pop, device='cpu', batch=True)

# forward
y = Pop_torch_cpu.apply(net_cpu(torch.ones((4, 32, n, n))).view(4, -1))
print(y.device, y.shape)
# timed statement includes input creation, network and operator (2 loops, 2 repeats)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_cpu(torch.ones((4, 32, n, n))).view(4, -1))

# backward
# a fresh graph is built first; backward is timed exactly once (-n1 -r1)
# because backward() is called without retain_graph
y = Pop_torch_cpu.apply(net_cpu(torch.ones((4, 32, n, n))).view(4, -1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cpu torch.Size([4, 262144])
113 ms ± 139 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
148 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [77]:
# CPU with batch NDarray
# Same benchmark with batch=True and flatten=False: the (4, 1, n, n) network
# output is passed to the operator as is, without reshaping
n = 512
Pop = Transpose(dims=(1, n, n), axes=(0,2,1))
Pop_torch_cpu = TorchOperator(Pop, device='cpu', batch=True, flatten=False)

# forward
y = Pop_torch_cpu.apply(net_cpu(torch.ones((4, 32, n, n))))
print(y.device, y.shape)
# timed statement includes input creation, network and operator (2 loops, 2 repeats)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_cpu(torch.ones((4, 32, n, n))))

# backward
# a fresh graph is built first; backward is timed exactly once (-n1 -r1)
# because backward() is called without retain_graph
y = Pop_torch_cpu.apply(net_cpu(torch.ones((4, 32, n, n))))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cpu torch.Size([4, 1, 512, 512])
114 ms ± 9.89 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
170 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [78]:
# GPU
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_gpu = TorchOperator(Pop, device=device)

# forward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1)) # dry run
print(y.device)
%timeit -n2 -r2 Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))

# backward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cuda:0
11.4 ms ± 208 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
2.54 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [79]:
# GPU with NDarray
n = 512
Pop = Transpose(dims=(1, n, n), axes=(0,2,1))
Pop_torch_gpu = TorchOperator(Pop, device=device)

# forward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).squeeze(0))
print(y.device, y.shape)
%timeit -n2 -r2 Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).squeeze(0))

# backward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).squeeze(0))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cuda:0 torch.Size([1, 512, 512])
11.3 ms ± 78 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
2.64 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [80]:
# GPU with batch 
n = 512
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_gpu = TorchOperator(Pop, device=device, batch=True)

# forward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((4, 32, n, n)).to(device)).view(4, -1))
print(y.device, y.shape)
%timeit -n2 -r2 Pop_torch_gpu.apply(net_gpu(torch.ones((4, 32, n, n)).to(device)).view(4, -1))

# backward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((4, 32, n, n)).to(device)).view(4, -1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cuda:0 torch.Size([4, 262144])
44.8 ms ± 126 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
3.68 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [81]:
# GPU with batch NDarray
n = 512
Pop = Transpose(dims=(1, n, n), axes=(0,2,1))
Pop_torch_gpu = TorchOperator(Pop, device=device, batch=True, flatten=False)

# forward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((4, 32, n, n)).to(device)))
print(y.device, y.shape)
%timeit -n2 -r2 Pop_torch_gpu.apply(net_gpu(torch.ones((4, 32, n, n)).to(device)))

# backward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((4, 32, n, n)).to(device)))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cuda:0 torch.Size([4, 1, 512, 512])
44.9 ms ± 132 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
4.48 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [82]:
# Mixed
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_cpu = TorchOperator(Pop, device='cpu', devicetorch=device)

# forward
y = Pop_torch_cpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1)) # dry run
print(y.device)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))

# backward
y = Pop_torch_cpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()



cuda:0




13.8 ms ± 149 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
6.17 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
