In [1]:
import time
import torch
from torch.autograd import Function, gradcheck
from torch.utils.cpp_extension import load

# Python

Extending Pytorch: https://pytorch.org/docs/master/notes/extending.html

In [2]:
#myActFunc(x) = 0.5 x ( tanh(x) + 1 )
class CustomAct(Function):
    @staticmethod
    def forward(ctx, inp):
        
        ctx.save_for_backward(inp)
        out = 0.5 * inp * ( torch.tanh(inp) + 1)
        return out
    
    @staticmethod
    def backward(ctx, grad_out):
        inp = ctx.saved_tensors[0]
        #0.5 tanh(x) + 0.5 x sech^2(x) + 0.5
        grad_inp = 0.5 * torch.tanh(inp) + 0.5 * inp * (1/torch.cosh(inp))**2 + 0.5
        return grad_inp * grad_out

In [3]:
act = CustomAct.apply

# gradcheck takes a tuple of tensors as input, check if your gradient
# evaluated with these tensors are close enough to numerical
# approximations and returns True if they all verify this condition.
inps = torch.randn(2,4,dtype=torch.double,requires_grad=True)
test = gradcheck(act, inps, eps=1e-6, atol=1e-4)
print(test)

True


# C++

Custom C++ and CUDA extensions: https://pytorch.org/tutorials/advanced/cpp_extension.html
Tensor basics(C++): https://pytorch.org/cppdocs/notes/tensor_basics.html

In [4]:
# import cust_act_cpp
cust_act_cpp = load(name='cust_act_cpp', sources=['cust_act.cpp'])

In [5]:
class CustomAct_c(Function):
    @staticmethod
    def forward(ctx, inp):
        
        ctx.save_for_backward(inp)
        out = cust_act_cpp.forward(inp)
        return out
    
    @staticmethod
    def backward(ctx, grad_out):
        inp = ctx.saved_tensors[0]
        #0.5 tanh(x) + 0.5 x sech^2(x) + 0.5
        grad_inp = cust_act_cpp.backward(inp,grad_out)
        return grad_inp

In [6]:
act_c = CustomAct_c.apply
inps = torch.randn(2,4,dtype=torch.double,requires_grad=True)
test = gradcheck(act_c, inps, eps=1e-6, atol=1e-4)
print(test)

True


# CUDA

Custom C++ and CUDA extensions: https://pytorch.org/tutorials/advanced/cpp_extension.html
An even easier introduction to CUDA: https://devblogs.nvidia.com/even-easier-introduction-cuda/

In [7]:
cust_act_cuda = load(name='cust_act', sources=['cust_act_cuda.cpp', 'cust_act_cuda_kernel.cu'])

In [8]:
class CustomAct_cuda(Function):
    @staticmethod
    def forward(ctx, inp):
        
        ctx.save_for_backward(inp)
        out = cust_act_cuda.forward(inp)
        return out
    
    @staticmethod
    def backward(ctx, d_out):
        inp = ctx.saved_tensors[0]
        #0.5 tanh(x) + 0.5 x sech^2(x) + 0.5
        grad_inp = cust_act_cuda.backward(inp,d_out.contiguous())
        return grad_inp

In [9]:
act_cuda = CustomAct_cuda.apply
inps = torch.randn(2,4,dtype=torch.double,requires_grad=True,device='cuda')
test = gradcheck(act_cuda, inps, eps=1e-6, atol=1e-4)
print(test)

True


# Comparison

In [10]:
def profile(func,iters=10_000, device='cpu'):
    act = func.apply
    batch_size = 64
    input_features = 1_000
    X = torch.randn(batch_size, input_features,requires_grad=True,device=device)

    forward = 0
    backward = 0
    for _ in range(iters):
        start = time.time()
        out = act(X)
        forward += time.time() - start

        start = time.time()
        out.sum().backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/iters, backward * 1e6/iters))

In [11]:
#python
profile(CustomAct)
profile(CustomAct,device='cuda')

Forward: 276.037 us | Backward 711.382 us
Forward: 129.084 us | Backward 367.477 us


In [12]:
#c++
profile(CustomAct_c)
profile(CustomAct_c,device='cuda')

Forward: 196.571 us | Backward 734.201 us
Forward: 123.962 us | Backward 343.428 us


In [13]:
#cuda
profile(CustomAct_cuda,device='cuda')

Forward: 64.148 us | Backward 200.492 us
