# Pylops - torch operator

### Author: M.Ravasi

In this notebook I will show how to use the `TorchOperator` to mix and match PyLops and PyTorch operators into an AD-friendly chain of operations.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

#import warnings
#warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import torch
import torch.nn as nn

from torch.autograd import gradcheck
from pylops.torchoperator import TorchOperator
from pylops.basicoperators import *
from pylops.signalprocessing import Convolve2D

## AD

Single batch

In [2]:
nx, ny = 10, 6
x0 = torch.arange(nx, dtype=torch.double, requires_grad=True)

# Forward: y = A sin(x0), with the dense matrix A applied through pylops
A = np.random.normal(0., 1., (ny, nx))
Aop = TorchOperator(MatrixMult(A))
y = Aop.apply(torch.sin(x0))

# AD: back-propagate v through the chain; the result lands in x0.grad
v = torch.ones(ny, dtype=torch.double)
y.backward(v, retain_graph=True)
adgrad = x0.grad

# Analytical: J = A diag(cos(x0)), i.e. column j of A scaled by cos(x0[j]),
# so the gradient is J^T v
At = torch.from_numpy(A)
J = (At * torch.cos(x0))
print(J.shape)
anagrad = torch.matmul(J.T, v)

print('Input: ', x0)
print('AD gradient: ', adgrad)
print('Analytical gradient: ', anagrad)

# Grad check: numerical vs analytical gradients of the wrapped operator.
# The tuple is deliberately not called `input`, which would shadow the
# Python builtin for every later cell of the session.
gradcheck_inputs = (torch.arange(nx, dtype=torch.double, requires_grad=True),
                    Aop.matvec, Aop.rmatvec, Aop.device, 'cpu')
test = gradcheck(Aop.Top, gradcheck_inputs, eps=1e-6, atol=1e-4)
print(test)

torch.Size([6, 10])
Input:  tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=torch.float64,
       requires_grad=True)
AD gradient:  tensor([ 1.0280, -2.7161,  1.3890, -3.7811, -1.3855, -0.7098, -1.7599, -1.7247,
        -0.4403,  1.7002], dtype=torch.float64)
Analytical gradient:  tensor([ 1.0280, -2.7161,  1.3890, -3.7811, -1.3855, -0.7098, -1.7599, -1.7247,
        -0.4403,  1.7002], dtype=torch.float64, grad_fn=<MvBackward0>)
True


Multi batch: here we should get the sum of gradients

In [3]:
nbatch, nx, ny = 5, 3, 6
x0 = torch.arange(nbatch * nx, dtype=torch.float).reshape(nbatch, nx)
x0.requires_grad=True

# Forward: the pylops operator is applied independently to each row (batch=True)
A = np.random.normal(0., 1., (ny, nx)).astype(np.float32)
Aop = TorchOperator(MatrixMult(A), batch=True)
y = Aop.apply(torch.sin(x0))

# AD
v = torch.ones((nbatch, ny), dtype=torch.float32)
y.backward(v, retain_graph=True)
# Take a copy: x0.grad is zeroed in place and refilled by the second backward
# pass below, so a plain reference would end up holding the analytical
# gradient and any comparison of the two would be trivially true
adgrad = x0.grad.clone()
print('AD gradient: ', adgrad)

# Analytical: same chain with the matrix applied by a bias-free torch Linear layer
x0.grad.zero_()
At = torch.from_numpy(A)
Lin = nn.Linear(nx, ny, bias=False)
Lin.weight.data[:] = At.float()
y1 = Lin(torch.sin(x0))
y1.backward(v, retain_graph=True)
anagrad = x0.grad

print('Analytical gradient: ', anagrad)

AD gradient:  tensor([[-0.8326, -0.6849,  0.7844],
        [ 0.8242,  0.8285, -0.5347],
        [-0.7994, -0.9556,  0.2743],
        [ 0.7586,  1.0636, -0.0083],
        [-0.7026, -1.1503, -0.2577]])
Analytical gradient:  tensor([[-0.8326, -0.6849,  0.7844],
        [ 0.8242,  0.8285, -0.5347],
        [-0.7994, -0.9556,  0.2743],
        [ 0.7586,  1.0636, -0.0083],
        [-0.7026, -1.1503, -0.2577]])


In [4]:
nbatch, nx, ny = 5, 3, 6

# Leaf tensor feeding the pylops-based chain
x0 = torch.arange(nbatch * nx, dtype=torch.float).reshape(nbatch, nx)
x0.requires_grad_()

# Forward and backward through the pylops operator with a mean-squared loss
A = np.random.normal(0., 1., (ny, nx)).astype(np.float32)
Aop = TorchOperator(MatrixMult(A), batch=True)
y = Aop.apply(torch.sin(x0))
l = y.pow(2).mean()
l.backward()
adgrad = x0.grad
print('AD gradient: ', adgrad)

# Reference: identical chain on an independent leaf tensor, with the matrix
# applied by a bias-free torch Linear layer
x1 = torch.arange(nbatch * nx, dtype=torch.float).reshape(nbatch, nx)
x1.requires_grad_()
At = torch.from_numpy(A)
Lin = nn.Linear(nx, ny, bias=False)
Lin.weight.data.copy_(At.float())
y1 = Lin(torch.sin(x1))
l1 = y1.pow(2).mean()
l1.backward()
anagrad = x1.grad

print('Analytical gradient: ', anagrad)

AD gradient:  tensor([[-0.1963,  0.0531, -0.0243],
        [-0.2584,  0.0653, -0.0231],
        [-0.3078,  0.0750, -0.0150],
        [-0.3404,  0.0815, -0.0005],
        [-0.3537,  0.0842,  0.0190]])
Analytical gradient:  tensor([[-0.1963,  0.0531, -0.0243],
        [-0.2584,  0.0653, -0.0231],
        [-0.3078,  0.0750, -0.0150],
        [-0.3404,  0.0815, -0.0005],
        [-0.3537,  0.0842,  0.0190]])


## Mixing NN and Physics

In [5]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [6]:
# Device kind as a plain string, without the device index
device.type

'cuda'

In [7]:
class Network(nn.Module):
    """Four stacked 3x3 convolutions, each followed by a LeakyReLU.

    The channel count shrinks as ``input_channels`` -> ``// 2`` -> ``// 4``
    -> ``// 8`` -> ``// 32``. Every convolution uses ``padding=1`` with the
    default stride, so the spatial size of the input is preserved.

    Parameters
    ----------
    input_channels : int
        Number of channels of the input tensor.
    """

    def __init__(self, input_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1)
        half = input_channels // 2
        quarter = input_channels // 4
        eighth = input_channels // 8
        self.conv1 = nn.Conv2d(input_channels, half, **conv_kwargs)
        self.conv2 = nn.Conv2d(half, quarter, **conv_kwargs)
        self.conv3 = nn.Conv2d(quarter, eighth, **conv_kwargs)
        self.conv4 = nn.Conv2d(eighth, input_channels // 32, **conv_kwargs)
        self.activation = nn.LeakyReLU(0.2)
        # Registered as a sub-module but not applied in forward()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Apply conv1..conv4 in order, each followed by the shared activation."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.activation(conv(x))
        return x

In [8]:
# Two independently initialised copies of the same architecture: one is left
# where it was created, the other is moved to `device`
net_cpu, net_gpu = Network(input_channels=32), Network(input_channels=32)
net_gpu.to(device)

Network(
  (conv1): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(8, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(4, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (activation): LeakyReLU(negative_slope=0.2)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

In [9]:
# CPU: the pylops operator is created with device='cpu' and fed by net_cpu
n = 512
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_cpu = TorchOperator(Pop, device='cpu')

# forward: the network output for a (1, 32, n, n) input of ones is flattened
# with view(-1) before entering the operator
y = Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).view(-1)) # dry run (untimed)
print(y.device)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).view(-1))

# backward: a fresh forward pass builds the graph, then a single backward call
# is timed (-n1 -r1); presumably a single run because the graph is not retained
y = Pop_torch_cpu.apply(net_cpu(torch.ones((1, 32, n, n))).view(-1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cpu
17 ms ± 1.09 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
37.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [10]:
# GPU
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_gpu = TorchOperator(Pop, device=device)

# forward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1)) # dry run
print(y.device)
%timeit -n2 -r2 Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))

# backward
y = Pop_torch_gpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()

cuda:0
12.2 ms ± 1.59 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
23.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [11]:
# Mixed (currently not allowed!)
Pop = Transpose(dims=(n, n), axes=(1,0))
Pop_torch_cpu = TorchOperator(Pop, device='cpu', devicetorch=device)

# forward
y = Pop_torch_cpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1)) # dry run
print(y.device)
%timeit -n2 -r2 Pop_torch_cpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))

# backward
y = Pop_torch_cpu.apply(net_gpu(torch.ones((1, 32, n, n)).to(device)).view(-1))
loss = y.sum()
%timeit -n1 -r1 loss.backward()



cuda:0




14.4 ms ± 547 µs per loop (mean ± std. dev. of 2 runs, 2 loops each)
7.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
