To Do:
- Implement LeakyRelu activation function --> Done
    - Accidentally added it to Conv layer; we need it on the BatchNorm layer for our model
- Implement forward pass & backward pass for every layer of our CNN in pure NumPy: --> Done
    - conv2d --> Done
    - batchnorm2d  --> Done
    - avgpool2d  --> Done
    - maxpool2d  --> Done
    - linear  --> Done
- Implement AdaMax optimizer in pure NumPy --> Done
- Implement Cross-Entropy Loss in pure NumPy --> Done
- Implement Model class in pure NumPy --> Done
- Verify pure NumPy model training works --> Done
- Modify to work with DP Tensors instead of numpy arrays

In [49]:
import numpy as np

In [50]:
class leaky_ReLU():

    def __init__(self, slope=0.01):
        super(leaky_ReLU, self).__init__()
        self.slope = slope

    def forward(self, input_array):
        self.last_forward = input_array  # Last image that has been forward passed through this activation function
        return ((input_array > 0) * input_array) + ((input_array <= 0) * input_array * self.slope)

    def derivative(self, input_array=None):
        last_forward = input_array if input_array else self.last_forward
        res = np.ones(last_forward.shape)
        res[last_forward <= 0] = self.slope
        return res
    
    def __str__(self):
        return self.__class__.__name__


In [51]:
def decompose_size(size):
    if len(size) == 2:
        fan_in = size[0]
        fan_out = size[1]

    elif len(size) == 4 or len(size) == 5:
        respective_field_size = np.prod(size[2:])
        fan_in = size[1] * respective_field_size
        fan_out = size[0] * respective_field_size

    else:
        fan_in = fan_out = int(np.sqrt(np.prod(size)))

    return fan_in, fan_out

In [70]:
class Uniform():
    def __init__(self, scale=0.05):
        self.scale = scale
        
    def __call__(self, size):
        return self.call(size)

    def call(self, size):
        return np.array(np.random.uniform(-self.scale, self.scale, size=size))
    
    def __str__(self):
        return self.__class__.__name__

In [71]:
class XavierInitialization():
    def __call__(self, size):
        return self.call(size)
    
    def call(self, size):
        fan_in, fan_out = decompose_size(size)
        return Uniform(np.sqrt(6 / (fan_in + fan_out)))(size)

    def __str__(self):
        return self.__class__.__name__

In [72]:
class Layer():
    """
    Subclassed when implementing new types of layers.
    
    Each layer can keep track of the layer(s) feeding into it, a
    network's output :class:`Layer` instance can double as a handle to the full
    network.
    """

    first_layer = False

    def forward(self, input, *args, **kwargs):
        raise NotImplementedError

    def backward(self, pre_grad, *args, **kwargs):
        raise NotImplementedError

    def connect_to(self, prev_layer):
        raise NotImplementedError

    @property
    def params(self):
        """ Layer parameters. 
        
        Returns a list of numpy.array variables or expressions that
        parameterize the layer.
        Returns
        -------
        list of numpy.array variables or expressions
            A list of variables that parameterize the layer
        Notes
        -----
        For layers without any parameters, this will return an empty list.
        """
        return []

    @property
    def grads(self):
        """ Get layer parameter gradients as calculated from backward(). """
        return []

    @property
    def param_grads(self):
        """ Layer parameters and corresponding gradients. """
        return list(zip(self.params, self.grads))

    def __str__(self):
        return self.__class__.__name__

In [205]:
class Convolution(Layer):
    """
    If this is the first layer in a model, provide the keyword argument `input_shape`
    (tuple of integers, does NOT include the sample axis, N.),
    e.g. `input_shape=(3, 128, 128)` for 128x128 RGB pictures.
    """

    def __init__(self, nb_filter, filter_size, input_shape=None, stride=1):
        self.nb_filter = nb_filter
        self.filter_size = filter_size
        self.input_shape = input_shape
        self.stride = stride

        self.W, self.dW = None, None
        self.b, self.db = None, None
        self.out_shape = None
        self.last_output = None
        self.last_input = None

        self.init = XavierInitialization()
        self.activation = leaky_ReLU()

    def connect_to(self, prev_layer=None):
        if prev_layer is None:
            assert self.input_shape is not None
            input_shape = self.input_shape
        else:
            input_shape = prev_layer.out_shape

        # input_shape: (batch size, num input feature maps, image height, image width)
        assert len(input_shape) == 4

        nb_batch, pre_nb_filter, pre_height, pre_width = input_shape
        if isinstance(self.filter_size, tuple):
            filter_height, filter_width = self.filter_size
        elif isinstance(self.filter_size, int):
            filter_height = filter_width = self.filter_size
        else:
            raise NotImplementedError

        height = (pre_height - filter_height) // self.stride + 1
        width = (pre_width - filter_width) // self.stride + 1

        # output shape
        self.out_shape = (nb_batch, self.nb_filter, height, width)

        # filters
        self.W = self.init((self.nb_filter, pre_nb_filter, filter_height, filter_width))
        self.b = np.zeros((self.nb_filter,))

    def forward(self, input, *args, **kwargs):

        self.last_input = input

        # shape
        nb_batch, input_depth, old_img_h, old_img_w = input.shape
        if isinstance(self.filter_size, tuple):
            filter_height, filter_width = self.filter_size
        elif isinstance(self.filter_size, int):
            filter_height = filter_width = self.filter_size
        else:
            raise NotImplementedError
#         filter_h, filter_w = self.filter_size
        new_img_h, new_img_w = self.out_shape[2:]

        # init
        outputs = np.zeros((nb_batch, self.nb_filter, new_img_h, new_img_w))

        # convolution operation
        for x in np.arange(nb_batch):
            for y in np.arange(self.nb_filter):
                for h in np.arange(new_img_h):
                    for w in np.arange(new_img_w):
                        h_shift, w_shift = h * self.stride, w * self.stride
                        # patch: (input_depth, filter_h, filter_w)
                        patch = input[x, :, h_shift: h_shift + filter_height, w_shift: w_shift + filter_width]
                        outputs[x, y, h, w] = np.sum(patch * self.W[y]) + self.b[y]

        # nonlinear activation
        # self.last_output: (nb_batch, output_depth, image height, image width)
        self.last_output = self.activation.forward(outputs)

        return self.last_output

    def backward(self, pre_grad, *args, **kwargs):

        # shape
        assert pre_grad.shape == self.last_output.shape
        nb_batch, input_depth, old_img_h, old_img_w = self.last_input.shape
        new_img_h, new_img_w = self.out_shape[2:]
        
        if isinstance(self.filter_size, tuple):
            filter_height, filter_width = self.filter_size
        elif isinstance(self.filter_size, int):
            filter_height = filter_width = self.filter_size
        else:
            raise NotImplementedError
        
#         filter_h, filter_w = self.filter_size
        old_img_h, old_img_w = self.last_input.shape[-2:]

        # gradients
        self.dW = np.zeros((self.W.shape))
        self.db = np.zeros((self.b.shape))
        delta = pre_grad * self.activation.derivative()

        # dW
        for r in np.arange(self.nb_filter):
            for t in np.arange(input_depth):
                for h in np.arange(filter_height):
                    for w in np.arange(filter_width):
                        input_window = self.last_input[:, t,
                                       h:old_img_h - filter_height + h + 1:self.stride,
                                       w:old_img_w - filter_width + w + 1:self.stride]
                        delta_window = delta[:, r]
                        self.dW[r, t, h, w] = np.sum(input_window * delta_window) / nb_batch

        # db
        for r in np.arange(self.nb_filter):
            self.db[r] = np.sum(delta[:, r]) / nb_batch

        # dX
        if not self.first_layer:
            layer_grads = np.zeros(self.last_input.shape)
            for b in np.arange(nb_batch):
                for r in np.arange(self.nb_filter):
                    for t in np.arange(input_depth):
                        for h in np.arange(new_img_h):
                            for w in np.arange(new_img_w):
                                h_shift, w_shift = h * self.stride, w * self.stride
                                layer_grads[b, t, h_shift:h_shift + filter_height, w_shift:w_shift + filter_width] += \
                                    self.W[r, t] * delta[b, r, h, w]
            return layer_grads

    @property
    def params(self):
        return self.W, self.b

    @property
    def grads(self):
        return self.dW, self.db

In [98]:
c = Convolution(3, 3, input_shape=(1, 3, 50, 50))

In [99]:
c.connect_to()

In [101]:
data = np.ones((1, 3, 50, 50))

c.forward(input=data)

array([[[[-0.01015495, -0.01015495, -0.01015495, ..., -0.01015495,
          -0.01015495, -0.01015495],
         [-0.01015495, -0.01015495, -0.01015495, ..., -0.01015495,
          -0.01015495, -0.01015495],
         [-0.01015495, -0.01015495, -0.01015495, ..., -0.01015495,
          -0.01015495, -0.01015495],
         ...,
         [-0.01015495, -0.01015495, -0.01015495, ..., -0.01015495,
          -0.01015495, -0.01015495],
         [-0.01015495, -0.01015495, -0.01015495, ..., -0.01015495,
          -0.01015495, -0.01015495],
         [-0.01015495, -0.01015495, -0.01015495, ..., -0.01015495,
          -0.01015495, -0.01015495]],

        [[-0.00241782, -0.00241782, -0.00241782, ..., -0.00241782,
          -0.00241782, -0.00241782],
         [-0.00241782, -0.00241782, -0.00241782, ..., -0.00241782,
          -0.00241782, -0.00241782],
         [-0.00241782, -0.00241782, -0.00241782, ..., -0.00241782,
          -0.00241782, -0.00241782],
         ...,
         [-0.00241782, -0.00241782

In [113]:
class BatchNorm(Layer):
    def __init__(self, epsilon=1e-6, momentum=0.9, axis=0,
                 beta_init='zero', gamma_init='one'):
        self.epsilon = epsilon
        self.momentum = momentum
        self.axis = axis

        self.beta, self.dbeta = None, None
        self.gamma, self.dgamma = None, None
        self.cache = None

    def connect_to(self, prev_layer):
        n_in = prev_layer.out_shape[-1]

        self.beta = np.zeros((n_in,))
        self.gamma = np.ones((n_in,))

    def forward(self, input, *args, **kwargs):
        # N, D = x.shape
        self.out_shape = input.shape

        # step1: calculate the mean
        # mu = 1. / N * np.sum(x, axis=0)
        mean = np.mean(input, axis=0)

        xmu = input - mean

        # step3:
        # sq = xmu ** 2
        # var = 1. / N * np.sum(sq, axis=0)
        var = np.std(xmu, axis=0)

        sqrtvar = np.sqrt(var + self.epsilon)
        ivar = 1. / sqrtvar

        # step5: normalization->x^
        xhat = xmu * ivar

        # step6: scale and shift
        gammax = self.gamma * xhat
        out = gammax + self.beta

        self.cache = (xhat, xmu, ivar, sqrtvar, var)

        return out

    def backward(self, pre_grad, *args, **kwargs):
        xhat, xmu, ivar, sqrtvar, var = self.cache

        N, D = pre_grad.shape

        # step6
        self.dbeta = np.sum(pre_grad, axis=0)
        dgammax = pre_grad
        self.dgamma = np.sum(dgammax * xhat, axis=0)
        dxhat = dgammax * self.gamma

        # step5
        divar = np.sum(dxhat * xmu, axis=0)
        dxmu1 = dxhat * ivar 

        # step4
        dsqrtvar = -1. / (sqrtvar ** 2) * divar
        dvar = 0.5 * 1. / np.sqrt(var + self.epsilon) * dsqrtvar

        # step3
        dsq = 1. / N * np.ones((N, D)) * dvar
        dxmu2 = 2 * xmu * dsq  

        # step2, 
        dx1 = (dxmu1 + dxmu2)

        # step1, 
        dmu = -1 * np.sum(dxmu1 + dxmu2, axis=0)
        dx2 = 1. / N * np.ones((N, D)) * dmu

        # step0 done!
        dx = dx1 + dx2

        return dx

    @property
    def params(self):
        return self.beta, self.gamma

    @property
    def grades(self):
        return self.dbeta, self.dgamma

In [114]:
bn = BatchNorm()

In [115]:
bn.connect_to(c)

In [116]:
bn.forward(c.forward(data))

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]]])

In [124]:
class AvgPool(Layer):
    """Average pooling operation for spatial data.
    Parameters
    ----------
    pool_size : tuple of 2 integers,
        factors by which to downscale (vertical, horizontal).
        (2, 2) will halve the image in each dimension.
    Returns
    -------
    4D numpy.array 
        with shape `(nb_samples, channels, pooled_rows, pooled_cols)` if dim_ordering='th'
        or 4D tensor with shape:
        `(samples, pooled_rows, pooled_cols, channels)` if dim_ordering='tf'.
    """

    def __init__(self, pool_size):
        self.pool_size = pool_size

        self.out_shape = 0
        self.out_shape = None
        self.input_shape = None

    def connect_to(self, prev_layer):
        assert 5 > len(prev_layer.out_shape) >= 3

        old_h, old_w = prev_layer.out_shape[-2:]
        pool_h, pool_w = self.pool_size
        new_h, new_w = old_h // pool_h, old_w // pool_w

        assert old_h % pool_h == old_w % pool_w == 0

        self.out_shape = prev_layer.out_shape[:-2] + (new_h, new_w)

    def forward(self, input, *args, **kwargs):

        # shape
        self.input_shape = input.shape
        pool_h, pool_w = self.pool_size
        new_h, new_w = self.out_shape[-2:]

        # forward
        outputs = np.zeros(self.input_shape[:-2] + self.out_shape[-2:])

        if np.ndim(input) == 4:
            nb_batch, nb_axis, _, _ = input.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            outputs[a, b, h, w] = np.mean(input[a, b, h:h + pool_h, w:w + pool_w])

        elif np.ndim(input) == 3:
            nb_batch, _, _ = input.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        outputs[a, h, w] = np.mean(input[a, h:h + pool_h, w:w + pool_w])

        else:
            raise ValueError()

        return outputs

    def backward(self, pre_grad, *args, **kwargs):
        new_h, new_w = self.out_shape[-2:]
        pool_h, pool_w = self.pool_size
        length = np.prod(self.pool_size)

        layer_grads = np.zeros(self.input_shape)

        if np.ndim(pre_grad) == 4:
            nb_batch, nb_axis, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            h_shift, w_shift = h * pool_h, w * pool_w
                            layer_grads[a, b, h_shift: h_shift + pool_h, w_shift: w_shift + pool_w] = \
                                pre_grad[a, b, h, w] / length

        elif np.ndim(pre_grad) == 3:
            nb_batch, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        h_shift, w_shift = h * pool_h, w * pool_w
                        layer_grads[a, h_shift: h_shift + pool_h, w_shift: w_shift + pool_w] = \
                            pre_grad[a, h, w] / length

        else:
            raise ValueError()

        return layer_grads


class MaxPool(Layer):
    """Max pooling operation for spatial data.
    Parameters
    ----------
    pool_size : tuple of 2 integers,
        factors by which to downscale (vertical, horizontal).
        (2, 2) will halve the image in each dimension.
    Returns
    -------
    4D numpy.array 
        with shape `(nb_samples, channels, pooled_rows, pooled_cols)` if dim_ordering='th'
        or 4D tensor with shape:
        `(samples, pooled_rows, pooled_cols, channels)` if dim_ordering='tf'.
    """
    def __init__(self, pool_size):
        self.pool_size = pool_size

        self.input_shape = None
        self.out_shape = None
        self.last_input = None

    def connect_to(self, prev_layer):
        # prev_layer.out_shape: (nb_batch, ..., height, width)
        assert len(prev_layer.out_shape) >= 3

        old_h, old_w = prev_layer.out_shape[-2:]
        pool_h, pool_w = self.pool_size
        new_h, new_w = old_h // pool_h, old_w // pool_w

        assert old_h % pool_h == old_w % pool_w == 0

        self.out_shape = prev_layer.out_shape[:-2] + (new_h, new_w)

    def forward(self, input, *args, **kwargs):
        # shape
        self.input_shape = input.shape
        pool_h, pool_w = self.pool_size
        new_h, new_w = self.out_shape[-2:]

        # forward
        self.last_input = input
        outputs = np.zeros(self.input_shape[:-2] + self.out_shape[-2:])

        if np.ndim(input) == 4:
            nb_batch, nb_axis, _, _ = input.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            outputs[a, b, h, w] = np.max(input[a, b, h:h + pool_h, w:w + pool_w])

        elif np.ndim(input) == 3:
            nb_batch, _, _ = input.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        outputs[a, h, w] = np.max(input[a, h:h + pool_h, w:w + pool_w])

        else:
            raise ValueError()

        return outputs

    def backward(self, pre_grad, *args, **kwargs):
        new_h, new_w = self.out_shape[-2:]
        pool_h, pool_w = self.pool_size

        layer_grads = np.zeros(self.input_shape)

        if np.ndim(pre_grad) == 4:
            nb_batch, nb_axis, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            patch = self.last_input[a, b, h:h + pool_h, w:w + pool_w]
                            max_idx = np.unravel_index(patch.argmax(), patch.shape)
                            h_shift, w_shift = h * pool_h + max_idx[0], w * pool_w + max_idx[1]
                            layer_grads[a, b, h_shift, w_shift] = pre_grad[a, b, a, w]

        elif np.ndim(pre_grad) == 3:
            nb_batch, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        patch = self.last_input[a, h:h + pool_h, w:w + pool_w]
                        max_idx = np.unravel_index(patch.argmax(), patch.shape)
                        h_shift, w_shift = h * pool_h + max_idx[0], w * pool_w + max_idx[1]
                        layer_grads[a, h_shift, w_shift] = pre_grad[a, a, w]

        else:
            raise ValueError()

        return layer_grads

In [120]:
bn.out_shape

(1, 3, 48, 48)

In [121]:
avg = AvgPool((2,2))

In [122]:
avg.connect_to(bn)

In [123]:
avg.forward(bn.forward(c.forward(data)))

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]]])

In [125]:
maxp = MaxPool((2,2))

In [126]:
maxp.connect_to(avg)

In [127]:
maxp.forward(avg.forward(bn.forward(c.forward(data))))

array([[[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [132]:
class Linear(Layer):
    def __init__(self, n_out, n_in=None):
        self.n_out = n_out
        self.n_in = n_in
        self.out_shape = (None, n_out)

        self.W = None
        self.b = None
        self.dW = None
        self.db = None
        self.last_input = None

    def connect_to(self, prev_layer=None):
        if prev_layer is None:
            assert self.n_in is not None
            n_in = self.n_in
        else:
            assert len(prev_layer.out_shape) == 2
            n_in = prev_layer.out_shape[-1]

        self.W = XavierInitialization((n_in, self.n_out))
        self.b = np.zeros((self.n_out,))

    def forward(self, input, *args, **kwargs):
        self.last_input = input
        return np.dot(input, self.W) + self.b

    def backward(self, pre_grad, *args, **kwargs):
        self.dW = np.dot(self.last_input.T, pre_grad)
        self.db = np.mean(pre_grad, axis=0)
        if not self.first_layer:
            return np.dot(pre_grad, self.W.T)

    @property
    def params(self):
        return self.W, self.b

    @property
    def grads(self):
        return self.dW, self.db

In [133]:
lin = Linear(n_out=2)

In [136]:
class Optimizer():
    """Abstract optimizer base class.
    
    Parameters
    ----------
    clip : float
        If smaller than 0, do not apply parameter clip.
    lr : float
        The learning rate controlling the size of update steps
    decay : float
        Decay parameter for the moving average. Must lie in [0, 1) where
        lower numbers means a shorter “memory”.
    lr_min : float
        When adapting step rates, do not move below this value. Default is 0.
    lr_max : float
        When adapting step rates, do not move above this value. Default is inf.
    """

    def __init__(self, lr=0.001, clip=-1, decay=0., lr_min=0., lr_max=np.inf):
        self.lr = lr
        self.clip = clip
        self.decay = decay
        self.lr_min = lr_min
        self.lr_max = lr_max

        self.iterations = 0

    def update(self, params, grads):
        self.iterations += 1

        self.lr *= (1. / 1 + self.decay * self.iterations)
        self.lr = np.clip(self.lr, self.lr_min, self.lr_max)

    def __str__(self):
        return self.__class__.__name__

In [137]:
class Adamax(Optimizer):
    """
    Parameters
    ----------
    beta1 : float
        Exponential decay rate for the first moment estimates.
    beta2 : float
        Exponential decay rate for the second moment estimates.
    epsilon : float
        Constant for numerical stability.
    References
    ----------
    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
           Adam: A Method for Stochastic Optimization.
           arXiv preprint arXiv:1412.6980.
    """

    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, *args, **kwargs):
        super(Adamax, self).__init__(*args, **kwargs)

        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        self.ms = None
        self.vs = None

    def update(self, params, grads):
        # init
        self.iterations += 1
        a_t = self.lr / (1 - np.power(self.beta1, self.iterations))
        if self.ms is None:
            self.ms = [np.zeros(p.shape) for p in params]
        if self.vs is None:
            self.vs = [np.zeros(p.shape) for p in params]

        # update parameters
        for i, (m, v, p, g) in enumerate(zip(self.ms, self.vs, params, grads)):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = np.maximum(self.beta2 * v, np.abs(g))
            p -= a_t * m / (v + self.epsilon)

            self.ms[i] = m
            self.vs[i] = v

In [140]:
class LossFunction():
    def forward(self, outputs, targets):
        raise NotImplementedError()

    def backward(self, outputs, targets):
        raise NotImplementedError()

    def __str__(self):
        return self.__class__.__name__

In [141]:
class BinaryCrossEntropy():
    def __init__(self, epsilon=1e-11):
        self.epsilon = epsilon

    def forward(self, outputs, targets):
        """Forward pass.
        
        .. math:: L = -t \\log(p) - (1 - t) \\log(1 - p)
        
        Parameters
        ----------
        outputs : numpy.array
            Predictions in (0, 1), such as sigmoidal output of a neural network.
        targets : numpy.array
            Targets in [0, 1], such as ground truth labels.
        """
        outputs = np.clip(outputs, self.epsilon, 1 - self.epsilon)
        return np.mean(-np.sum(targets * np.log(outputs) + (1 - targets) * np.log(1 - outputs), axis=1))

    def backward(self, outputs, targets):
        """Backward pass.
        Parameters
        ----------
        outputs : numpy.array
            Predictions in (0, 1), such as sigmoidal output of a neural network.
        targets : numpy.array
            Targets in [0, 1], such as ground truth labels.
        """
        outputs = np.clip(outputs, self.epsilon, 1 - self.epsilon)
        divisor = np.maximum(outputs * (1 - outputs), self.epsilon)
        return (outputs - targets) / divisor
    
    def __str__(self):
        return self.__class__.__name__

In [197]:
class Model():
    def __init__(self, layers=None):
        self.layers = [] if layers is None else layers

        self.loss = None
        self.optimizer = Adamax

    def add(self, layer):
        assert isinstance(layer, Layer), "PySyft doesn't recognize this kind of layer."
        self.layers.append(layer)

    def compile(self, loss=BinaryCrossEntropy(), optimizer=Adamax()):
        # check
        # assert isinstance(self.layers[0], InputLayer)
        self.layers[0].first_layer = True

        # connect to
        next_layer = None
        for layer in self.layers:
            layer.connect_to(next_layer)
            next_layer = layer
        # for pre_layer, layer in zip(self.layers[:-1], self.layers[1:]):
        #     layer.connect_to(pre_layer)

        # get loss class
        self.loss = BinaryCrossEntropy()

        # get optimizer class
        self.optimizer = Adamax()

    def fit(self, X, Y, max_iter=100, batch_size=64, shuffle=True,
            validation_split=0., validation_data=None):

        # prepare data
        train_X = X #.astype(get_dtype()) if np.issubdtype(np.float64, X.dtype) else X
        train_Y = Y #.astype(get_dtype()) if np.issubdtype(np.float64, Y.dtype) else Y

        if 1. > validation_split > 0.:
            split = int(train_Y.shape[0] * validation_split)
            valid_X, valid_Y = train_X[-split:], train_Y[-split:]
            train_X, train_Y = train_X[:-split], train_Y[:-split]
        elif validation_data is not None:
            valid_X, valid_Y = validation_data
        else:
            valid_X, valid_Y = None, None

        iter_idx = 0
        while iter_idx < max_iter:
            iter_idx += 1

            # shuffle
            if shuffle:
                seed = np.random.randint(111, 1111111)
                np.random.seed(seed)
                np.random.shuffle(train_X)
                np.random.seed(seed)
                np.random.shuffle(train_Y)

            # train
            train_losses, train_predicts, train_targets = [], [], []
            for b in range(train_Y.shape[0] // batch_size):
                batch_begin = b * batch_size
                batch_end = batch_begin + batch_size
                x_batch = train_X[batch_begin:batch_end]
                y_batch = train_Y[batch_begin:batch_end]

                # forward propagation
                y_pred = self.predict(x_batch)

                # backward propagation
                next_grad = self.loss.backward(y_pred, y_batch)
                for layer in self.layers[::-1]:
                    next_grad = layer.backward(next_grad)

                # get parameter and gradients
                params = []
                grads = []
                for layer in self.layers:
                    params += layer.params
                    grads += layer.grads

                # update parameters
                self.optimizer.update(params, grads)

                # got loss and predict
                train_losses.append(self.loss.forward(y_pred, y_batch))
                train_predicts.extend(y_pred)
                train_targets.extend(y_batch)

            # output train status
            runout = "iter %d, train-[loss %.4f, acc %.4f]; " % (
                iter_idx, float(np.mean(train_losses)), float(self.accuracy(train_predicts, train_targets)))

            # runout = "iter %d, train-[loss %.4f, ]; " % (
            #     iter_idx, float(np.mean(train_losses)))

            if valid_X is not None and valid_Y is not None:
                # valid
                valid_losses, valid_predicts, valid_targets = [], [], []
                for b in range(valid_X.shape[0] // batch_size):
                    batch_begin = b * batch_size
                    batch_end = batch_begin + batch_size
                    x_batch = valid_X[batch_begin:batch_end]
                    y_batch = valid_Y[batch_begin:batch_end]

                    # forward propagation
                    y_pred = self.predict(x_batch)

                    # got loss and predict
                    valid_losses.append(self.loss.forward(y_pred, y_batch))
                    valid_predicts.extend(y_pred)
                    valid_targets.extend(y_batch)

                # output valid status
                runout += "valid-[loss %.4f, acc %.4f]; " % (
                    float(np.mean(valid_losses)), float(self.accuracy(valid_predicts, valid_targets)))

    def predict(self, X):
        """ Calculate an output Y for the given input X. """
        x_next = X
        for layer in self.layers[:]:
            x_next = layer.forward(x_next)
        y_pred = x_next
        return y_pred

    def accuracy(self, outputs, targets):
        y_predicts = np.argmax(outputs, axis=1)
        y_targets = np.argmax(targets, axis=1)
        acc = y_predicts == y_targets
        return np.mean(acc)

        # acc = 0
        # for i in range(y_targets.shape[0]):
        #     if y_targets[i] == y_predicts[i]:
        #         acc += 1
        # return acc / y_targets.shape[0]

    def evaluate(self, X, Y):
        raise NotImplementedError()

In [187]:
Model()

<__main__.Model at 0x7f87141d4130>

In [147]:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.1.1-cp38-cp38-macosx_10_13_x86_64.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 2.8 MB/s eta 0:00:01
[?25hCollecting joblib>=1.0.0
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 15.4 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=5b4472ffdaa6d47091d86780f6d85da67d1e93b5214e6d0462e4f573cddbfc12
  Stored in directory: /Users/ishanmishra/Library/Caches/pip/wheels/22/0b/40/fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: threadpoolctl, joblib, scikit-learn, sklearn
Successfully installed j

In [149]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784")

In [151]:
type(mnist)

sklearn.utils._bunch.Bunch

In [157]:
mnist.categories

In [158]:
mnist.data

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
type(mnist.data)

pandas.core.frame.DataFrame

In [160]:
images = mnist.data.to_numpy()

In [164]:
targets = mnist.target.to_numpy()

In [161]:
seed = 100
nb_data = 1000

In [162]:
X_train = images.reshape((-1, 1, 28, 28)) / 255.0
np.random.seed(seed)

In [165]:
X_train = np.random.permutation(X_train)[:nb_data]
y_train = targets

In [166]:
np.random.seed(seed)
y_train = np.random.permutation(y_train)[:nb_data]
n_classes = np.unique(y_train).size

In [167]:
def one_hot(labels, nb_classes=None):
    classes = np.unique(labels)
    if nb_classes is None:
        nb_classes = classes.size
    one_hot_labels = np.zeros((labels.shape[0], nb_classes))
    for i, c in enumerate(classes):
        one_hot_labels[labels == c, i] = 1
    return one_hot_labels

In [171]:
class Flatten(Layer):
    def __init__(self, outdim=2):
        self.outdim = outdim
        if outdim < 1:
            raise ValueError('Dim must be >0, was %i', outdim)

        self.last_input_shape = None
        self.out_shape = None

    def connect_to(self, prev_layer):
        assert len(prev_layer.out_shape) > 2

        to_flatten = np.prod(prev_layer.out_shape[self.outdim - 1:])
        flattened_shape = prev_layer.out_shape[:self.outdim - 1] + (to_flatten,)

        self.out_shape = flattened_shape

    def forward(self, input, *args, **kwargs):
        self.last_input_shape = input.shape

        # to_flatten = np.prod(self.last_input_shape[self.outdim-1:])
        # flattened_shape = input.shape[:self.outdim-1] + (to_flatten, )
        flattened_shape = input.shape[:self.outdim - 1] + (-1,)
        return np.reshape(input, flattened_shape)

    def backward(self, pre_grad, *args, **kwargs):
        return np.reshape(pre_grad, self.last_input_shape)

In [184]:
class Dense(Layer):
    """A fully connected layer implemented as the dot product of inputs and
    weights. Generally used to implemenent nonlinearities for layer post activations.
    Parameters
    ----------
    n_out : int
        Desired size or shape of layer output
    n_in : int, or None
        The layer input size feeding into this layer
    activation : str, or npdl.activatns.Activation
        Defaults to ``Tanh``
    init : str, or npdl.initializations.Initializer
        Initializer object to use for initializing layer weights
    """

    def __init__(self, n_out, n_in=None):
        self.n_out = n_out
        self.n_in = n_in
        self.out_shape = (None, n_out)
        self.init = XavierInitialization()
        self.act_layer = Softmax_func()

        self.W, self.dW = None, None
        self.b, self.db = None, None
        self.last_input = None

    def connect_to(self, prev_layer=None):
        if prev_layer is None:
            assert self.n_in is not None
            n_in = self.n_in
        else:
            assert len(prev_layer.out_shape) == 2
            n_in = prev_layer.out_shape[-1]

        self.W = self.init((n_in, self.n_out))
        self.b = np.zeros((self.n_out,))

    def forward(self, input, *args, **kwargs):
        """ Apply the forward pass transformation to the input data.
        Parameters
        ----------
        input : numpy.array
            input data
        Returns
        -------
        numpy.array
            output data
        """
        self.last_input = input
        linear_out = np.dot(input, self.W) + self.b
        act_out = self.act_layer.forward(linear_out)
        return act_out

    def backward(self, pre_grad, *args, **kwargs):
        """Apply the backward pass transformation to the input data.
        Parameters
        ----------
        pre_grad : numpy.array
            deltas back propagated from the adjacent higher layer
        Returns
        -------
        numpy.array
            deltas to propagate to the adjacent lower layer
        """
        act_grad = pre_grad * self.act_layer.derivative()
        self.dW = np.dot(self.last_input.T, act_grad)
        self.db = np.mean(act_grad, axis=0)
        if not self.first_layer:
            return np.dot(act_grad, self.W.T)

    @property
    def params(self):
        return self.W, self.b

    @property
    def grads(self):
        return self.dW, self.db

class Softmax_func():
    """Softmax activation function.
    """

    def __init__(self):
        self.last_forward = None

    def forward(self, input):
        """:math:`\\varphi(\\mathbf{x})_j =
        \\frac{e^{\mathbf{x}_j}}{\sum_{k=1}^K e^{\mathbf{x}_k}}`
        where :math:`K` is the total number of neurons in the layer. This
        activation function gets applied row-wise.
        
        Parameters
        ----------
        x : float32
            The activation (the summed, weighted input of a neuron).
    
        Returns
        -------
        float32 where the sum of the row is 1 and each single value is in [0, 1]
            The output of the softmax function applied to the activation.
        """
        assert np.ndim(input) == 2
        self.last_forward = input
        x = input - np.max(input, axis=1, keepdims=True)
        exp_x = np.exp(x)
        s = exp_x / np.sum(exp_x, axis=1, keepdims=True)
        return s

    def derivative(self, input=None):
        """Backward propagation.
        Returns
        -------
        float32 
            The derivative of Softmax function.
        """
        last_forward = input if input else self.last_forward
        return np.ones(last_forward.shape)    


class Softmax(Dense):
    def __init__(self, n_out, n_in=None):
        super(Softmax, self).__init__(n_out, n_in)

In [207]:
cnn = Model()
cnn.add(Convolution(1, (3, 3), input_shape=(None, 1, 28, 28)))
cnn.add(AvgPool((2, 2)))
cnn.add(Convolution(2, (4, 4)))
cnn.add(AvgPool((2, 2)))
cnn.add(Flatten())
cnn.add(Softmax(n_out=n_classes))
cnn.compile()

In [208]:
max_iter = 10
from time import time

In [209]:
t0 = time()
cnn.fit(X_train, one_hot(y_train), max_iter=max_iter, validation_split=0.1, batch_size=100)
tf = time()

print(tf - t0)

116.99449396133423
