# Pylops-GPU - extending pytorch with Lops

### Author: M.Ravasi

In this notebook we experiment with extending PyTorch's automatic differentiation (autograd) with custom functions that wrap linear operators.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%pylab inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import torch
import scipy as sp
import matplotlib.pyplot as plt
import pylops

from torch.autograd import gradcheck
from scipy.signal import triang
from pylops import Diagonal, MatrixMult, FirstDerivative
from pylops.utils import dottest
from pylops.signalprocessing import Convolve1D

from scipy.sparse.linalg import cg
from pylops_gpu.utils.backend import device
from pylops_gpu.utils import dottest as gdottest
from pylops_gpu import Restriction as gRestriction

Populating the interactive namespace from numpy and matplotlib


Let's consider the following **scalar** function:

$$f(x) = (3*x)^2$$

that is expressed as:

$$y = 3*x, \quad z = y^2$$

We can thus compute the following derivatives

$$df/dx = 18 * x, \quad dy/dx = 3$$

In [2]:
# Scalar chain x -> y = 3x -> z = y^2, with x a leaf tensor tracked by autograd
x = torch.ones(1).requires_grad_()
y = x * 3
z = y.pow(2)

In [3]:
# Back-propagate from z to fill x.grad with dz/dx; retain_graph=True keeps the
# graph alive because the following cell calls backward on it again
z.backward(retain_graph=True)
print(x.grad)

tensor([18.])


In [4]:
# backward() accumulates into x.grad, so it must be reset before every new
# pass; the gradient tensor is zeroed in place directly rather than through
# the legacy `.data` alias
x.grad.zero_()
y.backward(retain_graph=True)  # dy/dx = 3
print(x.grad)

x.grad.zero_()
z.backward(retain_graph=True)  # dz/dx = 18 * x
print(x.grad)

tensor([3.])
tensor([18.])


Let's do the same with a **scalar** function of a **vector** input

$$f(\textbf{x}) = \sum (3*\textbf{x})^2$$

We can thus compute the following derivatives

$$df/dx_i = 18 * x_i$$

In [5]:
# Scalar function of a vector input: z = sum((3 * x)^2)
x = torch.arange(5, dtype=torch.float32).requires_grad_()
y = x * 3
z = y.pow(2).sum()

In [6]:
# z is a scalar (a sum over all entries), so the seed gradient handed to
# backward is the scalar 1; the result in x.grad is dz/dx_i for every entry
z.backward(torch.tensor(1.), retain_graph=True)
print(x.grad)

tensor([ 0., 18., 36., 54., 72.])


Next we consider a **vector-valued** function of a **vector** input

$$\textbf{y} = 3*\textbf{x}^2$$

In this case `backward` does not form the full Jacobian $\textbf{J}$ explicitly; instead it computes the product of the transposed Jacobian with a vector $\textbf{v}$ that we provide, $\textbf{J}^T \textbf{v}$.

In our case:

$$\textbf{J} = \begin{bmatrix}
dy_1/dx_1&...&dy_1/dx_M \\
...&...&...\\
dy_N/dx_1&...&dy_N/dx_M
\end{bmatrix} =
\begin{bmatrix}
6*x_1&...&0 \\
...&...&...\\
0&...&6*x_M
\end{bmatrix}
$$

If we choose $\textbf{v}$ to be a vector of ones:

$$
\textbf{g} = \textbf{J}^T * \textbf{v} = \begin{bmatrix} 6*x_1 \\ ...\\ 6*x_M \end{bmatrix}
$$

In [7]:
# Vector-valued function of a vector input: y_i = 3 * x_i^2
x = torch.arange(5, dtype=torch.float32).requires_grad_()
y = x.pow(2) * 3

In [8]:
# y is a vector, so backward needs a vector v of the same size; with v made
# of ones, x.grad ends up holding J^T v
v = torch.ones(5)
y.backward(v, retain_graph=True)
print(x)
print(x.grad)

tensor([0., 1., 2., 3., 4.], requires_grad=True)
tensor([ 0.,  6., 12., 18., 24.])


Let's consider now a **matrix-vector multiplication**

$$\textbf{y} = \textbf{A}\textbf{x}$$

For any matrix, the Jacobian of the product is the matrix itself ($\textbf{J} = \textbf{A}$), and the gradient is equal to:

$$\textbf{g} =\textbf{A}^T\textbf{v}$$

In [9]:
# A is an n x m matrix filled row by row with 0 .. n*m-1
n = 10
m = 5
A = torch.from_numpy(np.arange(n * m, dtype=np.float32).reshape((n, m)))

# Matrix-vector product tracked by autograd
x = torch.arange(m, dtype=torch.float32).requires_grad_()
y = A @ x

In [10]:
# Gradient obtained from autograd by back-propagating a vector of ones ...
v = torch.ones(n)
y.backward(v, retain_graph=True)
print(x.grad)
# ... compared with the explicit product A^T v
print(torch.matmul(A.T, v))

tensor([225., 235., 245., 255., 265.])
tensor([225., 235., 245., 255., 265.])


If we thus have the following relation:

$$\textbf{y} = \textbf{A} (3*\textbf{x}^2)$$

the gradient can be obtained with the chain rule: first apply $\textbf{A}^T$ to the vector $\textbf{v}$, then back-propagate the result through the second term, $3*\textbf{x}^2$.

In [11]:
# Same n x m matrix as before, filled row by row with 0 .. n*m-1
n = 10
m = 5
A = torch.from_numpy(np.arange(n * m, dtype=np.float32).reshape((n, m)))

# Two-stage chain: element-wise y = 3 * x^2 followed by z = A y
x = torch.ones(m, dtype=torch.float32).requires_grad_()
y = x.pow(2) * 3
z = A @ y
z

tensor([ 30., 105., 180., 255., 330., 405., 480., 555., 630., 705.],
       grad_fn=<MvBackward>)

In [13]:
# Manual chain rule: first apply A^T to a vector of ones, then back-propagate
# the result through y = 3 * x**2 only (backward is called on y, not on z)
v = torch.matmul(A.T, torch.ones(n))
y.backward(v, retain_graph=True)
print(x.grad)

tensor([1350., 1410., 1470., 1530., 1590.])


Compare with full gradient from AD

In [14]:
# backward() accumulates into x.grad, so it must be reset before the new pass;
# the gradient tensor is zeroed in place directly rather than through the
# legacy `.data` alias
x.grad.zero_()
v = torch.ones(n)
# Let autograd back-propagate through the whole chain (A and 3 * x**2)
z.backward(v, retain_graph=True)
print(x.grad)

tensor([1350., 1410., 1470., 1530., 1590.])


Next we consider a linear operator that mimics a matrix $\textbf{A}$: we define its backward operator as its adjoint and compare the results with those obtained from the equivalent dense matrix.

In [15]:
class MatMult(torch.autograd.Function):
    """Matrix-vector product ``R @ x`` with a hand-written backward pass.

    Only ``x`` receives a gradient; ``R`` is treated as a constant.
    """

    @staticmethod
    def forward(ctx, x, R):
        """Return ``R @ x`` and keep ``R`` for the backward pass."""
        ctx.save_for_backward(R)
        return torch.matmul(R, x)

    @staticmethod
    def backward(ctx, y):
        """Map the incoming gradient ``y`` to ``(R^T @ y, None)``."""
        (matrix,) = ctx.saved_tensors
        grad_x = torch.matmul(matrix.T, y)
        # second entry: no gradient is propagated to R
        return grad_x, None

In [16]:
# Element-wise y = 3 * x^2 followed by the custom MatMult function applied with A
x = torch.ones(m, dtype=torch.float32).requires_grad_()
y = x.pow(2) * 3
z = MatMult.apply(y, A)

In [17]:
# Back-propagate a vector of ones through z, which was produced by
# MatMult.apply, and print the gradient that reaches x
v = torch.ones(n)
z.backward(v, retain_graph=True)
print(x.grad)

tensor([1350., 1410., 1470., 1530., 1590.])


And with a more complicated operator, the **Restriction** operator

In [18]:
class Lop(torch.autograd.Function):
    """Restriction (sub-sampling) operator whose backward pass is its adjoint.

    The forward pass extracts the entries of ``x`` at the positions ``iava``;
    the backward pass scatters the incoming gradient back to those positions
    of a zero tensor shaped like ``x``.
    """

    @staticmethod
    def forward(ctx, x, iava):
        """Return the entries of ``x`` selected by ``iava``.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor; ``torch.take`` indexes it as if it were flattened.
        iava : torch.Tensor
            Integer index tensor of the entries to extract.

        Returns
        -------
        torch.Tensor
            Selected entries, with the same shape as ``iava``.
        """
        ctx.save_for_backward(iava)
        # remember the input shape so that the gradient can be given that shape
        ctx.shape = x.shape
        return torch.take(x, iava)

    @staticmethod
    def backward(ctx, y):
        """Scatter the gradient ``y`` back onto the positions ``iava``.

        Returns one entry per forward input: the gradient with respect to
        ``x`` and ``None`` for the index tensor.
        """
        (iava,) = ctx.saved_tensors
        # allocate with the dtype and device of the incoming gradient instead
        # of a hard-coded CPU float64 buffer
        x = y.new_zeros(ctx.shape.numel())
        # accumulate=True sums contributions of repeated indices, which is
        # what the adjoint of the extraction requires
        x.index_put_((iava,), y, accumulate=True)
        return x.reshape(ctx.shape), None

In [19]:
# subsampling: keep a random, sorted subset holding this fraction of the n samples
perc_subsampling = 0.4

nsub = int(np.round(perc_subsampling * n))
iava = np.random.permutation(np.arange(n))[:nsub]
iava = np.sort(iava)

# dense restriction matrix: row k has a single 1 in column iava[k]
R = np.zeros((nsub, n))
R[np.arange(nsub), iava] = 1
R = torch.from_numpy(R)

In [20]:
# Forward pass through the dense restriction matrix R using the custom MatMult function
x = torch.arange(n, dtype=torch.float64, requires_grad=True)
y = MatMult.apply(x, R)
print(y)

# gradient: back-propagate a random vector v; the same v is reused by the
# following cells so that their gradients can be compared with this one
v = torch.randn(nsub, dtype=torch.float64)
y.backward(v, retain_graph=True)
print(x.grad)

tensor([1., 2., 3., 6.], dtype=torch.float64, grad_fn=<MatMultBackward>)
tensor([ 0.0000, -0.2306, -1.6422, -1.0424,  0.0000,  0.0000,  0.1976,  0.0000,
         0.0000,  0.0000], dtype=torch.float64)


In [21]:
# NOTE(review): Rop is created here but not used anywhere in this cell
Rop = gRestriction(n, iava, dtype=torch.float64)
# Forward pass through the index-based Lop function instead of the dense matrix R
x = torch.arange(n, dtype=torch.float64, requires_grad=True)
y = Lop.apply(x, torch.from_numpy(iava))
print(y)

# gradient: uses the v drawn in the dense-matrix cell, so the printed values
# can be compared directly with that cell's output
y.backward(v, retain_graph=True)
print(x.grad)

tensor([1., 2., 3., 6.], dtype=torch.float64, grad_fn=<LopBackward>)
tensor([ 0.0000, -0.2306, -1.6422, -1.0424,  0.0000,  0.0000,  0.1976,  0.0000,
         0.0000,  0.0000], dtype=torch.float64)


In [22]:
# Run torch.autograd.gradcheck on Lop.apply with a random double-precision
# input and the index tensor, then print the boolean it returns
inputs = (torch.randn(n,dtype=torch.double,requires_grad=True), torch.from_numpy(iava))
test = gradcheck(Lop.apply, inputs, eps=1e-6, atol=1e-4)
print(test)

True


Finally, we wrap the forward and adjoint methods of a pylops operator. This could become a very general way to include all linear operators of pylops in PyTorch and to create combinations of NN layers and physical operators.

In [23]:
class Lop(torch.autograd.Function):
    """Autograd wrapper around a linear operator given as a forward/adjoint pair.

    ``forw`` is applied in the forward pass and ``adj`` in the backward pass.
    """

    @staticmethod
    def forward(ctx, x, forw, adj):
        """Apply ``forw`` to ``x`` after storing both callables on ``ctx``."""
        ctx.forw, ctx.adj = forw, adj
        return forw(x)

    @staticmethod
    def backward(ctx, y):
        """Push the incoming gradient ``y`` through ``adj``."""
        grad_x = ctx.adj(y)
        # the two callables passed to forward receive no gradient
        return grad_x, None, None

In [24]:
# Build the pylops-gpu Restriction operator and pass its matvec / rmatvec
# methods to Lop as the forward and adjoint callables
Rop = gRestriction(n, iava, dtype=torch.float64)
x = torch.arange(n, dtype=torch.float64, requires_grad=True)
y = Lop.apply(x, Rop.matvec, Rop.rmatvec)
print(y)

# gradient: uses the v drawn in the dense-matrix cell, so the printed values
# can be compared directly with that cell's output
y.backward(v, retain_graph=True)
print(x.grad)

# Run torch.autograd.gradcheck on the wrapped operator and print the boolean it returns
inputs = (torch.randn(n,dtype=torch.double,requires_grad=True), Rop.matvec, Rop.rmatvec)
test = gradcheck(Lop.apply, inputs, eps=1e-6, atol=1e-4)
print(test)

tensor([1., 2., 3., 6.], dtype=torch.float64, grad_fn=<LopBackward>)
tensor([ 0.0000, -0.2306, -1.6422, -1.0424,  0.0000,  0.0000,  0.1976,  0.0000,
         0.0000,  0.0000], dtype=torch.float64)
True
