In [82]:
import numpy as np
from scipy.stats import norm


class Activation:
    """Common interface for activation layers.

    The forward pass (`__call__`) stores the array it receives so that the
    backward pass (`grad`) can later be evaluated at that same point.
    """

    def __init__(self):
        # No forward pass has happened yet, so there is nothing cached.
        self._input = None

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns it unchanged (identity forward pass).

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            The very same object that was passed in as `x`."""
        self._input = x
        return x

    @property
    def input(self):
        """The array passed to the most recent `__call__`, or `None` if the
        activation has not been called yet."""
        return self._input

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes the loss gradient with respect to the activation input.

        The base class has no derivative of its own; this method always raises.

        Arguments:
            gradOutput: Gradient of the loss function with respect to the
                activation output. An array of the same shape as the array
                received in the `__call__` method.

        Returns:
            An array of the same shape as `gradOutput`

        Raises:
            NotImplementedError: always, in this base implementation."""
        raise NotImplementedError()


class ReLU(Activation):
    """Rectified linear unit: keeps positive entries and zeroes the rest."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns the element-wise `max(0, x)`."""
        super().__call__(x)  # only needed for its side effect of caching x
        return np.maximum(0, x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Scales `gradOutput` by the ReLU derivative at the cached input.

        The derivative is 1 where the cached input is strictly positive and
        0 everywhere else (including at exactly zero)."""
        is_positive = self._input > 0
        local_derivative = np.where(is_positive, 1, 0)
        return gradOutput * local_derivative


class LeakyReLU(Activation):
    """Implements LeakyReLU activation layer.

    Positive inputs pass through unchanged; non-positive inputs are multiplied
    by `slope`.
    """

    def __init__(self, slope: float = 0.03):
        """Initializes LeakyReLU layer.

        Arguments:
            slope: the slope coefficient applied to non-positive inputs."""
        super().__init__()
        self.slope = slope

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns `x` where `x > 0`, `slope * x` elsewhere.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`"""
        Activation.__call__(self, x)
        # Branch on the sign of x rather than on max(x, slope * x): the two
        # agree for 0 <= slope <= 1, but the max form swaps the branches as
        # soon as slope > 1.
        return np.where(x > 0, x, x * self.slope)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        The local derivative is 1 where the cached input is strictly positive
        and `slope` elsewhere (including at exactly zero).

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation
                output; same shape as the array received in `__call__`.

        Returns:
            An array of the same shape as `gradOutput`"""
        x = self._input
        return gradOutput * np.where(x > 0, 1, self.slope)


class GeLU(Activation):
    """Gaussian error linear unit: `x * Phi(x)`, with `Phi` the standard
    normal CDF."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns `x` weighted by the standard normal CDF."""
        self._input = x
        gaussian_cdf = norm.cdf(x)
        return x * gaussian_cdf

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Scales `gradOutput` by the GeLU derivative at the cached input.

        Product rule on `x * Phi(x)` gives `Phi(x) + x * phi(x)`, where `phi`
        is the standard normal PDF."""
        x = self._input
        gaussian_cdf = norm.cdf(x)
        gaussian_pdf = norm.pdf(x)
        return gradOutput * (gaussian_cdf + x * gaussian_pdf)


class SiLU(Activation):
    """SiLU (swish) activation: the input multiplied by its own sigmoid."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns `x * sigmoid(x)`."""
        super().__call__(x)  # only needed for its side effect of caching x
        gate = 1 / (1 + np.exp(-x))
        return x * gate

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Scales `gradOutput` by the SiLU derivative at the cached input.

        With `s = sigmoid(x)` the derivative is `s + x * s * (1 - s)`."""
        x = self._input
        gate = 1 / (1 + np.exp(-x))
        swish = x * gate
        return gradOutput * (gate + swish * (1 - gate))


class Softplus(Activation):
    """Implements Softplus (SmoothReLU) activation layer: `log(1 + exp(x))`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns `log(1 + exp(x))` element-wise.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`"""
        Activation.__call__(self, x)
        # log(1 + exp(x)) == log(exp(0) + exp(x)). logaddexp evaluates this
        # without forming exp(x), so large x no longer overflows to inf.
        return np.logaddexp(0, x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        The derivative of softplus is the sigmoid of the cached input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation
                output; same shape as the array received in `__call__`.

        Returns:
            An array of the same shape as `gradOutput`"""
        x = self._input
        # sigmoid(x) = 1 / (1 + exp(-x)) = exp(-log(1 + exp(-x))); going
        # through logaddexp avoids the overflow in exp(-x) for very negative x.
        sigmoid = np.exp(-np.logaddexp(0, -x))
        return gradOutput * sigmoid


class ELU(Activation):
    """Implements ELU activation layer.

    Positive inputs pass through unchanged; non-positive inputs map to
    `alpha * (exp(x) - 1)`.
    """

    def __init__(self, alpha: float = 1):
        """Initializes ELU layer.

        Arguments:
            alpha: the alpha coefficient scaling the non-positive branch."""
        super().__init__()
        self.alpha = alpha

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns the ELU of `x` element-wise.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`"""
        Activation.__call__(self, x)
        # np.where evaluates both branches for every element, so clamp the
        # argument to <= 0: the exponential then never overflows for large
        # positive x (whose result is discarded anyway).
        # expm1 keeps precision for tiny |x|, where exp(x) - 1 rounds to 0.
        negative_branch = self.alpha * np.expm1(np.minimum(x, 0))
        return np.where(x > 0, x, negative_branch)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        The local derivative is 1 where the cached input is strictly positive
        and `alpha * exp(x)` elsewhere.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation
                output; same shape as the array received in `__call__`.

        Returns:
            An array of the same shape as `gradOutput`"""
        x = self._input
        # Same clamping as in the forward pass to keep the unused branch finite.
        negative_branch = self.alpha * np.exp(np.minimum(x, 0))
        return gradOutput * np.where(x > 0, 1, negative_branch)


class Sigmoid(Activation):
    """Logistic sigmoid activation: `1 / (1 + exp(-x))`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns its element-wise logistic sigmoid."""
        super().__call__(x)  # only needed for its side effect of caching x
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Scales `gradOutput` by the sigmoid derivative at the cached input.

        With `s = sigmoid(x)` the derivative is `s * (1 - s)`. The forward
        pass is re-run on the cached input to obtain `s`."""
        activated = self(self._input)
        return gradOutput * activated * (1 - activated)


class Tanh(Activation):
    """Hyperbolic tangent activation."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Caches `x` and returns `tanh(x)` element-wise."""
        super().__call__(x)  # only needed for its side effect of caching x
        return np.tanh(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Scales `gradOutput` by the tanh derivative at the cached input.

        The derivative is `1 - tanh(x) ** 2`. The forward pass is re-run on
        the cached input to obtain `tanh(x)`."""
        squashed = self(self._input)
        local_derivative = 1 - squashed ** 2
        return gradOutput * local_derivative


class Softmax(Activation):
    """Implements Softmax activation layer (normalised over axis 1)."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Softmax activation output

        Arguments:
            x: Input array of shape (`batch_size`, `n_features`)

        Returns:
            An array of the same shape as `x`; each row sums to 1."""
        Activation.__call__(self, x)
        # Softmax is unchanged by adding a constant to a row. Subtracting the
        # row maximum makes every exponent <= 0, so exp() cannot overflow and
        # produce inf / inf = NaN for large logits.
        shifted = x - np.max(x, axis=1, keepdims=True)
        e = np.exp(shifted)
        return e / e.sum(axis=1, keepdims=True)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the softmax
                output, of shape (`batch_size`, `n_features`).

        Returns:
            An array of the same shape as `gradOutput`"""
        # Re-run the forward pass on the cached input to get the probabilities.
        s = self.__call__(self._input)
        # The Jacobian is J[i, j] = s_i * (delta_ij - s_j), hence
        #   sum_i g_i * J[i, j] = s_j * (g_j - sum_i g_i * s_i).
        # Using the closed form avoids building a (batch, n, n) tensor.
        weighted_sum = (gradOutput * s).sum(axis=1, keepdims=True)
        return s * (gradOutput - weighted_sum)
    

In [83]:
def check_function(function, shape):
    """Smoke-tests an activation class on uniformly random data.

    Instantiates `function` with no arguments, runs a forward pass on a
    random array, then a backward pass on a second random array, printing
    the class name, the forward output, the cached input and the gradient.

    Arguments:
        function: A class (or other callable with a `__name__`) that builds
            an object supporting `obj(x)`, `obj.input` and `obj.grad(g)`.
        shape: Passed as `size` to `np.random.uniform` for both arrays.
    """
    layer = function()
    print(function.__name__)

    forward_output = layer(np.random.uniform(size=shape))
    print("Function", forward_output)
    print("X", layer.input)

    # The upstream gradient is drawn only after the forward pass, so the
    # global RNG is consumed in the order: input first, gradient second.
    upstream_grad = np.random.uniform(size=shape)
    print("Grad", layer.grad(upstream_grad))
    print()

In [84]:
# Element-wise activations are exercised on a flat vector of 10 samples.
for activation_cls in (ReLU, LeakyReLU, GeLU, SiLU, Softplus, ELU, Sigmoid, Tanh):
    check_function(activation_cls, 10)

# Softmax normalises along axis 1, so it needs a (batch_size, n_features) input.
check_function(Softmax, (10, 5))

ReLU
Function [0.55184521 0.12710795 0.60209008 0.68153449 0.24096814 0.43219638
 0.54662913 0.07445127 0.89695869 0.73381592]
X [0.55184521 0.12710795 0.60209008 0.68153449 0.24096814 0.43219638
 0.54662913 0.07445127 0.89695869 0.73381592]
Grad [0.93388775 0.21841784 0.35668377 0.78763933 0.98600588 0.1297962
 0.99164659 0.68629503 0.77298396 0.93357807]

LeakyReLU
Function [0.57558574 0.61860531 0.94181882 0.36701776 0.01626319 0.34122589
 0.20685179 0.04144744 0.39378443 0.78579776]
X [0.57558574 0.61860531 0.94181882 0.36701776 0.01626319 0.34122589
 0.20685179 0.04144744 0.39378443 0.78579776]
Grad [0.45986167 0.59500219 0.65174303 0.92246165 0.29287107 0.61945581
 0.14220388 0.06823636 0.3387666  0.70116917]

GeLU
Function [0.11724375 0.01722571 0.62924423 0.51910951 0.29189549 0.59821699
 0.5036627  0.80445175 0.32928862 0.34991082]
X [0.20211432 0.03355331 0.79875305 0.68817433 0.43647635 0.76813116
 0.67220227 0.96580932 0.48091566 0.5048098 ]
Grad [0.63141841 0.3341253  0.70

$$
\frac{\partial \text{GELU}}{\partial x_i} =
    \frac{1}{2} + \frac{1}{2}\left(\text{erf}\left(\frac{x_i}{\sqrt{2}}\right) +
        \frac{x_i \cdot \text{erf}'\left(\frac{x_i}{\sqrt{2}}\right)}{\sqrt{2}}\right),
\quad \text{where } \text{erf}'(x) = \frac{2}{\sqrt{\pi}} \cdot \exp\{-x^2\}.
$$

Function [1.25611526e-02 1.17747648e-02 6.25798377e-05 1.43409824e-02
 9.38333792e-03 1.79696478e-03 1.88820898e-02 2.21133190e-02
 7.16284269e-03 9.75394268e-03]
X [0.41870509 0.39249216 0.00208599 0.47803275 0.31277793 0.05989883
 0.62940299 0.73711063 0.23876142 0.32513142]
Grad [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
