In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from pathlib2 import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor

In [3]:
# Make GPU 0 the current CUDA device; later cells move tensors with .to('cuda').
torch.cuda.set_device(0)

### Matrix multiplication

#### with elementwise operations

In [4]:
a = tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).to('cuda')

In [5]:
print(a.device, a.type())

cuda:0 torch.cuda.LongTensor


In [6]:
b = torch.randint(high=5, size=(3,3)).to('cuda')

In [7]:
def matmul(a,b):
    """Multiply two 2-D tensors with an explicit loop over every output element.

    Each entry ``c[i,j]`` is the dot product of row ``i`` of ``a`` and column
    ``j`` of ``b``, computed as an elementwise product followed by a sum.

    Args:
        a: 2-D tensor of shape ``(ar, ac)``.
        b: 2-D tensor of shape ``(br, bc)``; ``br`` must equal ``ac``.

    Returns:
        Tensor of shape ``(ar, bc)`` created by ``torch.zeros`` (default dtype)
        on the same device as ``a``.

    Raises:
        AssertionError: if the inner dimensions ``ac`` and ``br`` differ.
    """
    ar,ac = a.shape
    br,bc = b.shape
    # The inner dimensions must agree. Comparing ar with bc would only happen
    # to hold for square inputs and would reject valid non-square products.
    assert ac==br
    # Allocate directly on the inputs' device instead of CPU-then-copy.
    c = torch.zeros(ar, bc, device=a.device)
    for i in range(ar):
        for j in range(bc):
            # Row i of a times column j of b, reduced to a scalar.
            c[i,j] = (a[i,:]*b[:,j]).sum(dim=0)
    return c

In [8]:
%timeit -n 10 _=matmul(a,b)

400 µs ± 20.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### with broadcasting

Broadcasting only makes sense if one of the inputs needs it, so we will add another case 'c'. We will still test the performance on the square matrices.

In [9]:
c = tensor([1, 2, 3]).to('cuda')

In [10]:
def matmul_br(a,b):
    """Multiply two 2-D tensors one output row at a time using broadcasting.

    For each row ``i`` of ``a``, the row is reshaped to a column of shape
    ``(ac, 1)`` and broadcast against ``b`` of shape ``(br, bc)``; summing the
    product over dim 0 yields row ``i`` of the result.

    Args:
        a: 2-D tensor of shape ``(ar, ac)``.
        b: 2-D tensor of shape ``(br, bc)``; ``br`` must equal ``ac``.

    Returns:
        Tensor of shape ``(ar, bc)`` created by ``torch.zeros`` (default dtype)
        on the same device as ``a``.

    Raises:
        AssertionError: if the inner dimensions ``ac`` and ``br`` differ.
    """
    ar,ac = a.shape
    br,bc = b.shape
    assert ac==br
    # Allocate directly on the inputs' device instead of CPU-then-copy.
    c = torch.zeros(ar, bc, device=a.device)
    for i in range(ar):
        # (ac, 1) * (br, bc) broadcasts to (ac, bc); reduce over the shared dim.
        c[i] = (a[i,:].unsqueeze(-1) * b).sum(dim=0)
    return c

In [11]:
%timeit -n 10 _=matmul_br(a,c[:,None])

168 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
%timeit -n 10 _=matmul_br(a,b)

157 µs ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


After re-running this notebook several times, I found that the square matrices are always faster and have a lower standard deviation.

In [13]:
assert (matmul(a,b) == matmul_br(a,b)).all()

We will always check the above condition for all variants of matmul_*, with matmul being the base case.

#### with einsum

The CUDA implementation only supports floating-point tensors here, so we convert the inputs to float and run einsum on the GPU.

In [14]:
def matmul_es(a,b):
    """Matrix-multiply two 2-D tensors with ``torch.einsum``.

    The subscripts ``'ik,kj->ij'`` multiply ``a[i,k]`` by ``b[k,j]`` and sum
    over the shared index ``k``.

    Args:
        a: 2-D tensor indexed as ``(i, k)``.
        b: 2-D tensor indexed as ``(k, j)``.

    Returns:
        The ``(i, j)`` result exactly as ``torch.einsum`` produces it; no
        device transfer is applied afterwards, since moving the finished
        result does not change where the computation itself ran.
    """
    return torch.einsum('ik,kj->ij', a, b)

In [15]:
a = a.float(); b = b.float(); c = c.float()

In [27]:
# NOTE(review): CUDA kernels are launched asynchronously, so without a
# torch.cuda.synchronize() this may time the kernel launch rather than the
# full computation — confirm before comparing against the other variants.
%timeit -n 10 _=matmul_es(a,b)

The slowest run took 19.12 times longer than the fastest. This could mean that an intermediate result is being cached.
124 µs ± 216 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


__We did push the einsum to gpu but it still does the compute on CPU, WHY?__

In [28]:
%timeit -n 10 _=matmul_es(a,c[:,None])

40.6 µs ± 4.67 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


__Why is this on GPU? The CPU cache issue is with a and b only!__

In [19]:
assert (matmul(a,b) == matmul_es(a,b)).all()

#### with pytorch

In [20]:
%timeit -n 10 _=a.matmul(b)

11.1 µs ± 4.14 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
%timeit -n 10 _=a.matmul(c)

11 µs ± 3.56 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
assert (a.matmul(b) == matmul(a,b)).all()

In [23]:
assert (a.matmul(b) == matmul_es(a,b)).all()

In [24]:
%timeit -n 10 _=a@b

12.1 µs ± 3.72 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
%timeit -n 10 _=a@c

10.7 µs ± 2.89 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
assert (a@b == matmul_es(a,b)).all()

It is interesting to observe that matmul_br still shows the trend of square matrices multiplying faster; we cannot make the same comparison for matmul_es because of inconsistent runs between GPU and CPU.

But PyTorch's implementation is faster for non-square matrices. Is it because it is greedy with its memory footprint and access for calculations?

__TODO: Need to understand this in detail__