# Pylops - CUDA basic linear operators

### Author: M.Ravasi

In this notebook we will experiment with PyTorch to assess its usability as a backend for CUDA-enabled operators.

In [1]:
!pip install pylops

Collecting pylops
[?25l  Downloading https://files.pythonhosted.org/packages/ba/5a/dc9d93cd0f9ba3ea9a77c30c92865f07523ebf2fc391dff19aeca2f2b848/pylops-1.4.0-py3-none-any.whl (141kB)
[K     |████████████████████████████████| 143kB 4.7MB/s 
Installing collected packages: pylops
Successfully installed pylops-1.4.0


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%pylab inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import pylops
from pylops.utils import dottest

import torch
import torch.nn as nn

Populating the interactive namespace from numpy and matplotlib


In [3]:
import torch

# Pick the compute backend: use the GPU only when CUDA is usable and at
# least one device is visible, otherwise stay on the CPU.
if torch.cuda.is_available() and torch.cuda.device_count() > 0:
    device = 'cuda'
    print("Cuda installed! Running on GPU!")
else:
    device = 'cpu'
    print("No GPU available!")

Cuda installed! Running on GPU!


# Diagonal

In [4]:
# NumPy reference: diagonal operator whose entries are 0, 1, ..., nx-1,
# applied to a vector of ones.
nx = 100000
x = np.ones(nx)
Dop = pylops.Diagonal(np.arange(nx))

# Check that forward and adjoint are consistent with each other
dottest(Dop, nx, nx, verb=True)

# Forward and adjoint applied to the same input
y = Dop.matvec(x)
y1 = Dop.rmatvec(x)

Dot test passed, v^T(Opu)=-18204277.587282 - u^T(Op^Tv)=-18204277.587282


In [0]:
from pylops import LinearOperator

class Diagonal_cuda(LinearOperator):
    """Diagonal operator for inputs that are not NumPy arrays (e.g. tensors).

    Multiplies the input element-wise by ``diag`` in both forward and
    adjoint mode. The multiplication is delegated to the ``*`` operator of
    ``diag``, so it runs wherever ``diag`` and the input live.

    Parameters
    ----------
    diag : array-like
        Elements of the diagonal. Must support ``len`` and element-wise
        ``*`` with the input vector.
    dtype : str, optional
        Type stored on the operator. It is not inferred from ``diag`` and
        ``diag`` is not cast to it.

    Attributes
    ----------
    shape : tuple
        Operator shape, ``(len(diag), len(diag))``.
    explicit : bool
        Always ``False``: the operator is applied, never stored as a matrix.

    """
    def __init__(self, diag, dtype='float64'):
        self.diag = diag
        # A linear operator is expected to expose its shape; the diagonal
        # operator is square with one row/column per diagonal element.
        ndiag = len(diag)
        self.shape = (ndiag, ndiag)
        self.dtype = np.dtype(dtype)
        self.explicit = False

    def _matvec(self, x):
        """Forward mode: return ``diag * x``."""
        y = self.diag*x
        return y

    def _rmatvec(self, x):
        """Adjoint mode: return ``diag * x``.

        No conjugation is applied, so this is the adjoint of the forward
        operator only when ``diag`` is real-valued.
        """
        y = self.diag*x
        return y

In [10]:
# Same input and diagonal as the NumPy case, created directly as float32
# tensors on the selected device.
x_cuda = torch.ones(nx, dtype=torch.float32, device=device)
diag_cuda = torch.arange(nx, dtype=torch.float32, device=device)

# The private _matvec is called directly so that the tensor is handed to
# the operator as-is.
Dop_cuda = Diagonal_cuda(diag_cuda)
y_cuda = Dop_cuda._matvec(x_cuda)

print('x  = ',x_cuda)
print('D*x  = ',y_cuda)

x  =  tensor([1., 1., 1.,  ..., 1., 1., 1.], device='cuda:0')
D*x  =  tensor([0.0000e+00, 1.0000e+00, 2.0000e+00,  ..., 9.9997e+04, 9.9998e+04,
        9.9999e+04], device='cuda:0')


In [11]:
% timeit -n 10 Dop._matvec(x)
% timeit -n 10 Dop_cuda._matvec(x_cuda)

10 loops, best of 3: 304 µs per loop
The slowest run took 4.43 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 3: 9.96 µs per loop


# 1D Convolution

In [114]:
# Signal length and filter length
N, Nh = 11, 3

# Unit spike in the middle of the signal
x = np.zeros(N)
x[N // 2] = 1

# Filter with samples 1, 2, ..., Nh
h = np.arange(1, Nh + 1)

# 'same' keeps the output as long as the input signal
y = np.convolve(x, h, mode='same')
print(y)
print(y.shape)

[0. 0. 0. 0. 1. 2. 3. 0. 0. 0. 0.]
(11,)


In [115]:
# Same unit spike and filter as in the NumPy cell, now as torch tensors
xt = torch.zeros(N)
xt[N//2] = 1
ht = torch.arange(1, Nh + 1, dtype=torch.float)

# The transposed convolution reproduces np.convolve(..., mode='same') here
# (compare the two printed results). Input is reshaped to
# (batch, channels, length) and the filter to (1, 1, Nh); the filter length
# is taken from Nh so that reshape and padding stay consistent.
yt = torch.nn.functional.conv_transpose1d(xt.reshape(1, 1, N), ht.reshape(1, 1, Nh), padding=Nh//2)
print(yt)
print(yt.shape)

tensor([[[0., 0., 0., 0., 1., 2., 3., 0., 0., 0., 0.]]])
torch.Size([1, 1, 11])


In [116]:
# NumPy reference: cross-correlation of the spike with the filter
y = np.correlate(x, h, mode='same')
print(y)

# conv1d gives the same result as np.correlate here (the filter is not
# flipped). The filter length is taken from Nh so that reshape and padding
# stay consistent.
yt = torch.nn.functional.conv1d(xt.reshape(1, 1, N), ht.reshape(1, 1, Nh), padding=Nh//2)
print(yt)
print(yt.shape)

[0. 0. 0. 0. 3. 2. 1. 0. 0. 0. 0.]
tensor([[[0., 0., 0., 0., 3., 2., 1., 0., 0., 0., 0.]]])
torch.Size([1, 1, 11])


In [130]:
xt = torch.zeros((1000, N))
xt[:, N//2] = 1
ht = torch.torch.arange(0, Nh, dtype=torch.float) + 1.

xc = xt.to(device)
hc = ht.to(device)

yt = torch.torch.conv1d(xt.reshape(1000, 1, N), ht.reshape(1, 1, 3), padding=Nh//2)
print(yt.shape)
yc = torch.torch.conv1d(xc.reshape(1000, 1, N), hc.reshape(1, 1, 3), padding=Nh//2)
print(yc.shape)

% timeit torch.torch.conv1d(xt.reshape(1000, 1, N), ht.reshape(1, 1, 3), padding=Nh//2)
% timeit torch.torch.conv1d(xc.reshape(1000, 1, N), hc.reshape(1, 1, 3), padding=Nh//2)

torch.Size([1000, 1, 11])
torch.Size([1000, 1, 11])
1000 loops, best of 3: 434 µs per loop
The slowest run took 15.01 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 40.6 µs per loop
