In [19]:
import numpy as np
from math import erf


class Activation:
    """Base activation class"""

    def __init__(self):
        self._input = None

    @property
    def input(self):
        """Returns the last input received by the activation"""
        return self._input

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes activation output

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`"""

        self._input = x
        return x

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Arguments:
            gradOutput: Gradient of loss function with recpect to the activation output.
                An array of the same shape as the array received in `__call__` method.

        Returns:
            An array of the same shape as `gradOutput`"""
        raise NotImplementedError()


class ReLU(Activation):
    """Implements ReLU activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return np.maximum(0, x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input
        return gradOutput * np.where(x > 0, 1, 0)


class LeakyReLU(Activation):
    """Implements LeakyReLU activation layer"""

    def __init__(self, slope: float = 0.03):
        """Initializes LeakyReLU layer.

        Arguments:
            slope: the slope coeffitient of the activation."""
        super().__init__()
        self.slope = slope

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return np.maximum(x * self.slope, x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input
        return gradOutput * np.where(x > x * self.slope, 1, self.slope)


class GeLU(Activation):
    """Implements GeLU activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)

        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input
        derf = 2 / np.sqrt(np.pi) * np.exp(-np.power(x / np.sqrt(2), 2))
        dx = 0.5 * (1 + (erf(x / np.sqrt(2)) + (x + derf) / np.sqrt(2)))
        return gradOutput * dx


class SiLU(Activation):
    """Implements SiLU (swish) activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return x * (1 / (1 + np.exp(-x)))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input

        sigm = 1 / (1 + np.exp(-x))
        return gradOutput * (sigm + x * sigm * (1 - sigm))


class Softplus(Activation):
    """Implements Softplus (SmoothReLU) activation layer"""

    def __init__(self, beta=1, threshold=20):
        super().__init__()
        self.beta = beta
        self.threshold = threshold

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return np.where(
            x * self.beta > self.threshold,
            x * self.beta,
            1 / self.beta * np.log(1 + np.exp(self.beta * x)),
        )

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input

        return gradOutput * np.where(
            x * self.beta > self.threshold,
            self.beta,
            x * np.exp(self.beta * x) / (1 + np.exp(self.beta * x)),
        )


class ELU(Activation):
    """Implements ELU activation layer"""

    def __init__(self, alpha: float = 1):
        """Initializes ELU layer.

        Arguments:
            alpha: the alpha coeffitient of the activation."""
        super().__init__()
        self.alpha = alpha

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return np.where(x > 0, x, self.alpha * (np.exp(x) - 1))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input
        return gradOutput * np.where(x > 0, 1, self.alpha * x * np.exp(x))


class Sigmoid(Activation):
    """Implements Sigmoid activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return 1 / (1 + np.exp(-x))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input

        sigm = self.__call__(x)
        return gradOutput * sigm * (1 - sigm)


class Tanh(Activation):
    """Implements Tanh activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        Activation.__call__(self, x)
        return np.tanh(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input
        return gradOutput * (1 - self.__call__(x) ** 2)


class Softmax(Activation):
    """Implements Softmax activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Softmax activation output

        Arguments:
            x: Input array of shape (`batch_size`, `n_features`)

        Returns:
            An array of the same shape as `x`"""
        Activation.__call__(self, x)
        return x / x.sum(axis=-1)[..., None]

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        x = self._input
        return gradOutput * (x.sum() - x) / (np.power(x.sum(), 2))

In [20]:
def check_function(function, shape):
    r = function()
    print(function.__name__)
    print("Function", r(np.random.uniform(size=shape)))
    print("X", r.input)
    print("Grad", r.grad(np.random.uniform(size=shape)))
    print()

In [21]:
check_function(ReLU, (10))
check_function(LeakyReLU, (10))
check_function(GeLU, (10))
check_function(SiLU, (10))
check_function(Softplus, (10))
check_function(ELU, (10))

check_function(Sigmoid, (10))
check_function(Tanh, (10))
check_function(Softmax, (10))

ReLU
Function [0.57367105 0.20495396 0.45844234 0.16983032 0.11385525 0.4874076
 0.28419979 0.84406364 0.99939451 0.94121038]
X [0.57367105 0.20495396 0.45844234 0.16983032 0.11385525 0.4874076
 0.28419979 0.84406364 0.99939451 0.94121038]
Grad [0.20966514 0.74256248 0.24493337 0.14985843 0.15503913 0.96747571
 0.54672906 0.47384094 0.89505301 0.45983365]

LeakyReLU
Function [0.13937445 0.68792673 0.83564843 0.10755601 0.94100694 0.68959878
 0.74835983 0.38009471 0.64697814 0.54173316]
X [0.13937445 0.68792673 0.83564843 0.10755601 0.94100694 0.68959878
 0.74835983 0.38009471 0.64697814 0.54173316]
Grad [0.23684858 0.36350642 0.78746715 0.67532518 0.58847623 0.21699365
 0.14368608 0.61609624 0.03163702 0.67522493]

GeLU
Function [0.60023574 0.22512139 0.20358088 0.01597122 0.70495202 0.24345675
 0.26157972 0.71928578 0.75943592 0.5943225 ]
X [0.77020943 0.3529061  0.3245684  0.03116748 0.87213249 0.3764617
 0.39927538 0.88580664 0.92380839 0.76433314]


TypeError: only length-1 arrays can be converted to Python scalars

$$
\frac{\partial \text{GELU}}{\partial x_i}  =
                \frac{1}{2} + \frac{1}{2}\left(\text{erf}(\frac{x}{\sqrt{2}}) +
                    \frac{x + \text{erf}'(\frac{x}{\sqrt{2}})}{\sqrt{2}}\right)

        where :math:`\text{erf}'(x) = \frac{2}{\sqrt{\pi}} \cdot \exp\{-x^2\}`.
$$

Function [1.25611526e-02 1.17747648e-02 6.25798377e-05 1.43409824e-02
 9.38333792e-03 1.79696478e-03 1.88820898e-02 2.21133190e-02
 7.16284269e-03 9.75394268e-03]
X [0.41870509 0.39249216 0.00208599 0.47803275 0.31277793 0.05989883
 0.62940299 0.73711063 0.23876142 0.32513142]
Grad [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
