# Multidimensional arrays

### Check installation

In [None]:
!lscpu

In [None]:
!nvidia-smi

*Latency numbers every programmer should know* (Jeff Dean):

**L1 cache reference 0.5 ns**

**L2 cache reference 7 ns**

**Main memory reference 100 ns**

![CPUCUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/gpu-devotes-more-transistors-to-data-processing.png)

In [None]:
import numpy as np
import numba as nb
# np.show_config() writes the build configuration to stdout itself and
# returns None, so wrapping it in print() would append a stray "None".
np.show_config()

In [None]:
import torch
print(torch.__config__.show())
print(torch.__config__.parallel_info())

In [None]:
N = 500
a = np.random.normal(size=(N,N))
b = np.random.normal(size=(N,N))
a_ten = torch.tensor(a)
b_ten = torch.tensor(b)
a_tenc = a_ten.cuda()
b_tenc = b_ten.cuda()

In [None]:
%timeit np.dot(a,b)

In [None]:
%timeit torch.matmul(a_ten, b_ten)

In [None]:
%timeit torch.matmul(a_tenc, b_tenc)

In [None]:
a32 = np.random.normal(size=(N,N)).astype(np.float32)
b32 = np.random.normal(size=(N,N)).astype(np.float32)
a_ten32 = torch.tensor(a32)
b_ten32 = torch.tensor(b32)
a_tenc32 = a_ten32.cuda()
b_tenc32 = b_ten32.cuda()

In [None]:
%timeit np.dot(a32,b32)

In [None]:
%timeit torch.matmul(a_ten32, b_ten32)

In [None]:
%timeit torch.matmul(a_tenc32, b_tenc32)

In [None]:
def matmul(a, b):
    """Multiply two 2-D arrays with an explicit triple loop in pure Python.

    This is the deliberately naive baseline: every multiply-add goes through
    the interpreter, which is what makes it slow compared to ``np.dot``.

    Parameters
    ----------
    a : numpy.ndarray
        Left operand, indexed as ``a[i, s]`` (shape ``(n, k)``).
    b : numpy.ndarray
        Right operand, indexed as ``b[s, j]`` (shape ``(k, m)``).

    Returns
    -------
    numpy.ndarray
        The ``(n, m)`` product, allocated with ``np.zeros`` (float64).

    Raises
    ------
    ValueError
        If ``a.shape[1] != b.shape[0]``.
    """
    n = a.shape[0]
    k = a.shape[1]
    # Without this check a `b` with more than k rows would be silently
    # truncated and a wrong product returned.
    if b.shape[0] != k:
        raise ValueError(
            "shapes %s and %s not aligned: a.shape[1] must equal b.shape[0]"
            % (a.shape, b.shape)
        )
    m = b.shape[1]
    c = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            for s in range(k):
                c[i, j] += a[i, s] * b[s, j]

    return c

N = 100
a = np.random.normal(size=(N,N))
b = np.random.normal(size=(N,N))

In [None]:
%timeit np.dot(a,b)

In [None]:
%timeit matmul(a,b)

In [None]:
%prun matmul(a,b)

In [None]:
@nb.njit
def numba_matmul(a, b):
    """Triple-loop matrix product compiled to native code by Numba.

    Same algorithm as the pure-Python ``matmul``; only the ``@nb.njit``
    decorator differs, so the timing gap shows the cost of the interpreter.

    Parameters
    ----------
    a : numpy.ndarray
        Left operand, indexed as ``a[i, s]`` (shape ``(n, k)``).
    b : numpy.ndarray
        Right operand, indexed as ``b[s, j]`` (shape ``(k, m)``).

    Returns
    -------
    numpy.ndarray
        The ``(n, m)`` product, allocated with ``np.zeros`` (float64).

    Raises
    ------
    ValueError
        If ``a.shape[1] != b.shape[0]``.
    """
    n = a.shape[0]
    k = a.shape[1]
    # Numba does not bounds-check array indexing by default, so a shape
    # mismatch would read out of bounds (or silently drop rows of b)
    # instead of failing. Validate explicitly before the loops.
    if b.shape[0] != k:
        raise ValueError("shapes not aligned: a.shape[1] must equal b.shape[0]")
    m = b.shape[1]
    c = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            for s in range(k):
                c[i, j] += a[i, s] * b[s, j]
    return c

In [None]:
%timeit numba_matmul(a,b)

### Buffer Protocol

```cpp
struct buffer_info {
    void *ptr; /* Pointer to buffer */
    size_t itemsize; /* Size of one scalar */
    size_t ndim; /* Number of dimensions */
    size_t *shape; /* Buffer dimensions */
    size_t *strides; /* Strides (in bytes) for each index */
};
```

In [None]:
a = np.random.normal(size=(4,5)).astype(np.float32)
print(a.itemsize)
print(a.ndim)
print(a.shape)
print(a.strides)
a

In [None]:
a_tor = torch.from_numpy(a)
print(a_tor.dtype)
print(a_tor.dim())
print(a_tor.size())
print(a_tor.stride())
a_tor

In [None]:
a_num = a_tor.numpy()
a_num

In [None]:
a_tor[0,0] *= 10
print(a_tor[0,0])

In [None]:
assert a[0,0] == a_tor[0,0].item()
assert a[0,0] == a_num[0,0]

For more info have a look at: 
* PyTorch [docs](https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html) 
* Numba [docs](https://numba.pydata.org/numba-doc/latest/index.html)

### Writing native extensions 

Tutorials worth working through include: 
* [numba & CUDA](http://numba.pydata.org/numba-doc/0.16.0/CUDAJit.html)
* [CUDA made easy](https://developer.nvidia.com/blog/even-easier-introduction-cuda)
* [CUDA guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)

In [None]:
N = 1 << 20
x = torch.rand(N)
y = torch.rand(N)
x_cuda = x.cuda()
y_cuda = y.cuda()

In [None]:
print(torch.cuda.device_count())
print(torch.cuda.get_device_properties(0))

In [None]:
%timeit x*y

In [None]:
%timeit x_cuda*y_cuda

In [None]:
@nb.njit
def mult_numba(x,y):
    """Element-wise product of two equally long arrays, compiled by Numba.

    Parameters
    ----------
    x, y : numpy.ndarray
        Operands, indexed along their first axis.

    Returns
    -------
    numpy.ndarray
        New array shaped and typed like ``x`` with ``r[i] = x[i] * y[i]``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length along the first axis.
    """
    n = x.shape[0]
    # Numba does not bounds-check by default: a shorter `y` would be read
    # out of bounds rather than raising, so check the lengths up front.
    if y.shape[0] != n:
        raise ValueError("x and y must have the same length")
    # Every element is overwritten below, so zero-initialisation is unnecessary.
    r = np.empty_like(x)
    for i in range(n):
        r[i] = x[i]*y[i]
    return r

In [None]:
%timeit mult_numba(x.numpy(), y.numpy())

In [None]:
# parallel=True asks Numba to distribute the prange loop below over threads.
# NOTE(review): Numba documents releasing the GIL as the separate
# nogil=True option - confirm before relying on the GIL being released here.
@nb.njit(parallel=True)
def pmult_numba(x,y):
    """Element-wise product of ``x`` and ``y`` using a Numba ``prange`` loop.

    Returns a new array shaped and typed like ``x`` whose i-th entry is
    ``x[i] * y[i]``. Iterations are independent of each other, which is
    what makes the loop eligible for ``prange``.
    """
    out = np.zeros_like(x)
    for idx in nb.prange(out.shape[0]):
        out[idx] = x[idx]*y[idx]
    return out

In [None]:
(torch.from_numpy(pmult_numba(x.numpy(), y.numpy())) - x * y).abs().sum()

In [None]:
%timeit pmult_numba(x.numpy(), y.numpy())

![multithreading](https://randu.org/tutorials/threads/images/process.png)

In [None]:
import ctypes
from numba import cuda
def float_devicendarray(tensor):
    """Expose a CUDA float32 torch tensor as a Numba ``DeviceNDArray``.

    The returned array is built around ``tensor.data_ptr()``, i.e. it refers
    to the tensor's own device memory rather than a copy, so a kernel writing
    through it modifies the tensor.

    Parameters
    ----------
    tensor : torch.Tensor
        Must be a ``torch.cuda.FloatTensor`` (asserted below).

    Returns
    -------
    numba.cuda.cudadrv.devicearray.DeviceNDArray
        float32 view with the tensor's shape and byte strides, bound to
        torch's current CUDA stream.

    Notes
    -----
    NOTE(review): no reference to ``tensor`` is stored in the result, so the
    caller presumably has to keep the tensor alive while the view is in use.
    """
    assert tensor.type() == 'torch.cuda.FloatTensor'
    assert tensor.is_cuda
    # Size in bytes of one float32 element (4); torch strides are counted in
    # elements while Numba expects strides in bytes.
    itemsize = np.dtype(np.float32).itemsize
    ctx = cuda.cudadrv.devices.get_context(tensor.device.index)
    # c_size_t is pointer-sized on every platform; c_ulong is only 32 bits
    # wide on Windows and would truncate a 64-bit device address.
    device_ptr = ctypes.c_size_t(tensor.data_ptr())
    mp = cuda.cudadrv.driver.MemoryPointer(ctx, device_ptr, tensor.numel() * itemsize)
    return cuda.cudadrv.devicearray.DeviceNDArray(
        tensor.size(),
        [step * itemsize for step in tensor.stride()],
        np.float32,
        gpu_data=mp,
        stream=torch.cuda.current_stream().cuda_stream,
    )

In [None]:
torch.cuda.get_device_properties(0)

![sm](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/automatic-scalability.png)

![blocks](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/grid-of-thread-blocks.png)

In [None]:
block_size = 256  # threads per block; a multiple of 32
num_blocks = (N + block_size - 1) // block_size  # ceil(N / block_size): enough blocks to cover N elements
num_blocks

![CUDA](https://developer-blogs.nvidia.com/wp-content/uploads/2017/01/cuda_indexing.png)

In [None]:
@cuda.jit
def mult_cuda_kernel(x,y,r,n):
    """CUDA kernel writing the element-wise product ``x[i] * y[i]`` into ``r``.

    Each thread starts at its own global index and advances by the total
    number of launched threads (a grid-stride loop), so the first ``n``
    elements are covered for any launch configuration.
    """
    # Global position of this thread within the whole grid.
    first = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    # Total number of threads in the launch.
    step = cuda.gridDim.x * cuda.blockDim.x
    for i in range(first, n, step):
        r[i] = x[i] * y[i]

In [None]:
def mult_numba_cuda(x,y):
    """Multiply two CUDA float tensors element-wise with the Numba kernel.

    Allocates a result tensor shaped like ``x``, wraps ``x``, ``y`` and the
    result with ``float_devicendarray`` and launches ``mult_cuda_kernel``
    using the notebook-level ``num_blocks`` / ``block_size`` configuration.
    Returns the result tensor.
    """
    r = torch.zeros_like(x)
    n = r.numel()

    # Numba views over the tensors' device memory, in kernel-argument order.
    device_views = [float_devicendarray(t) for t in (x, y, r)]

    mult_cuda_kernel[num_blocks, block_size](*device_views, n)

    return r

In [None]:
(mult_numba_cuda(x_cuda,y_cuda) - x_cuda*y_cuda).abs().sum()

In [None]:
%timeit mult_numba_cuda(x_cuda,y_cuda)

Further performance gains can be obtained by writing C++ extensions directly using LibTorch:
* [PyTorch C++ extensions](https://pytorch.org/tutorials/advanced/cpp_extension.html)

### Exercises

**(1)** Think about how to speed up `mult_numba_cuda` 

**(2)** Provide a CUDA implementation of `numba_matmul`

**(3)** Generate `N` random 2D rotations stacked up as a 3D tensor. Write a test for your implementation.

In [None]:
# Hints:
N = 3
PI = 2. * torch.acos(torch.tensor(0.))
thetas = 0.05 * PI * (torch.rand(N) - 0.5) # example of angles in radians

# return a 3D tensor of rotations given thetas 
# def get_rotations(thetas):
#   ...
    