### Note: this code was moved to the PySyft `torch.nn` folder, so this copy may be out of date.



To Do:
- Implement LeakyRelu activation function --> Done
    - Accidentally added it to Conv layer; we need it on the BatchNorm layer for our model
- Implement forward pass & backward pass for every layer of our CNN in pure NumPy: --> Done
    - conv2d --> Done
    - batchnorm2d  --> Done
    - avgpool2d  --> Done
    - maxpool2d  --> Done
    - linear  --> Done
- Implement AdaMax optimizer in pure NumPy --> Done
- Implement Cross-Entropy Loss in pure NumPy --> Done
- Implement Model class in pure NumPy --> Done
- Verify pure NumPy model training works --> Done
- Modify to work with DP Tensors instead of numpy arrays --> DONE!!!!

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
from syft import PhiTensor
from syft.core.adp.data_subject_list import DataSubjectList
from syft.core.tensor.lazy_repeat_array import lazyrepeatarray as lra
from typing import Optional

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def dp_leakyrelu(dp_tensor: PhiTensor, slope: float=0.01) -> PhiTensor:
    """Apply the leaky ReLU activation element-wise to a PhiTensor.

    Positive entries are kept as they are; all other entries are multiplied
    by ``slope``.

    Parameters
    ----------
    dp_tensor : PhiTensor
        Tensor to activate. Its ``data_subjects`` are passed through unchanged.
    slope : float
        Multiplier applied to non-positive entries. The bounds computed below
        are only valid for ``slope >= 0`` (the activation is then monotonic).

    Returns
    -------
    PhiTensor
        Activated tensor whose ``min_vals`` / ``max_vals`` are the activation
        of the input bounds.
    """
    # TODO: Should we have an index in DSLs that corresponds to no data?

    def _activate_bound(bound):
        # A bound must go through the same function as the data: scaling a
        # positive bound by `slope` would place it below the actual values.
        return np.where(bound > 0, bound, bound * slope)

    gt = (dp_tensor.child > 0)
    return PhiTensor(
        child=gt * dp_tensor.child + (1 - gt) * dp_tensor.child * slope,
        data_subjects=dp_tensor.data_subjects,
        min_vals=lra(data=_activate_bound(dp_tensor.min_vals.data), shape=dp_tensor.min_vals.shape),
        max_vals=lra(data=_activate_bound(dp_tensor.max_vals.data), shape=dp_tensor.max_vals.shape),
    )


class leaky_ReLU():
    """Leaky ReLU activation that remembers its last input for the backward pass."""

    def __init__(self, slope=0.01):
        super(leaky_ReLU, self).__init__()
        # Multiplier applied to non-positive inputs.
        self.slope = slope

    def forward(self, input_array: PhiTensor):
        """Activate ``input_array`` and cache it for a later ``derivative()`` call."""
        # Last image that has been forward passed through this activation function
        self.last_forward = input_array
        return dp_leakyrelu(dp_tensor=input_array, slope=self.slope)

    def derivative(self, input_array: Optional[PhiTensor]=None):
        """Return the element-wise derivative of the activation.

        Parameters
        ----------
        input_array : PhiTensor, optional
            Point at which to evaluate the derivative. When omitted, the input
            cached by the most recent ``forward()`` call is used.

        Returns
        -------
        PhiTensor
            ``1`` where the input is positive and ``slope`` elsewhere.
        """
        # Compare against None explicitly: relying on the truthiness of a
        # tensor is ambiguous for multi-element data.
        last_forward = self.last_forward if input_array is None else input_array
        res = np.ones(last_forward.shape)
        idx = last_forward <= 0
        res[idx.child] = self.slope

        # NOTE(review): the bounds below are derived from the input bounds
        # rather than from the {slope, 1} value set of the derivative - confirm.
        return PhiTensor(child=res,
                         data_subjects=last_forward.data_subjects,
                         min_vals=last_forward.min_vals*0,
                         max_vals = last_forward.max_vals*1)

    def __str__(self):
        return self.__class__.__name__


In [3]:
class Uniform():
    """Initializer that samples uniformly from ``[-scale, scale)``."""

    def __init__(self, scale=0.05):
        # Half-width of the symmetric sampling interval.
        self.scale = scale

    def call(self, size):
        """Return a NumPy array of shape ``size`` filled with uniform samples."""
        bound = self.scale
        samples = np.random.uniform(low=-bound, high=bound, size=size)
        return np.array(samples)

    def __call__(self, size):
        # Calling the instance is a shorthand for ``call``.
        return self.call(size)

    def __str__(self):
        return type(self).__name__

In [4]:
def decompose_size(size):
    """Derive ``(fan_in, fan_out)`` from the shape of a weight tensor.

    * 2 dimensions: the two entries are returned as they are.
    * 4 or 5 dimensions: entries from index 2 onwards are multiplied into a
      field size, which scales ``size[1]`` (fan-in) and ``size[0]`` (fan-out).
    * any other rank: both values are the truncated square root of the total
      number of elements.
    """
    rank = len(size)

    if rank == 2:
        return size[0], size[1]

    if rank in (4, 5):
        field_size = np.prod(size[2:])
        return size[1] * field_size, size[0] * field_size

    fallback = int(np.sqrt(np.prod(size)))
    return fallback, fallback

In [5]:
class XavierInitialization():
    """Initializer sampling uniformly within ``sqrt(6 / (fan_in + fan_out))``."""

    def call(self, size):
        """Return an array of shape ``size`` drawn from the Xavier uniform range."""
        fan_in, fan_out = decompose_size(size)
        limit = np.sqrt(6 / (fan_in + fan_out))
        sampler = Uniform(limit)
        return sampler(size)

    def __call__(self, size):
        # Calling the instance is a shorthand for ``call``.
        return self.call(size)

    def __str__(self):
        return type(self).__name__

In [6]:
class Layer():
    """
    Base class for every layer type.

    Subclasses override ``forward``, ``backward`` and ``connect_to``; the
    versions defined here only raise ``NotImplementedError``. Layers that own
    trainable parameters also override ``params`` and ``grads``.
    """

    # Class-level default; individual layers can override it per instance.
    first_layer = False

    def forward(self, input: PhiTensor, *args, **kwargs):
        """Compute the layer output for ``input`` (must be overridden)."""
        raise NotImplementedError

    def backward(self, pre_grad, *args, **kwargs):
        """Propagate the incoming gradient ``pre_grad`` (must be overridden)."""
        raise NotImplementedError

    def connect_to(self, prev_layer):
        """Wire this layer to ``prev_layer`` (must be overridden)."""
        raise NotImplementedError

    @property
    def params(self):
        """Layer parameters.

        Returns
        -------
        list
            The variables that parameterize the layer. This base
            implementation returns an empty list, which is the right answer
            for layers without parameters.
        """
        return []

    @property
    def grads(self):
        """Gradients of the layer parameters; empty in this base implementation."""
        return []

    @property
    def param_grads(self):
        """List of ``(parameter, gradient)`` pairs built from ``params`` and ``grads``."""
        return [(param, grad) for param, grad in zip(self.params, self.grads)]

    def __str__(self):
        return type(self).__name__

<hr>

**Note:**

In the Convolution layer below, `W`, `dW`, `b`, and `db` all start as NumPy arrays.
They **should** be PhiTensors: if someone wants to print them, they should have to spend privacy budget (PB).

However, when they are initialized in the Convolution layer, they have to be created as
NumPy arrays because the initial values are public information. As soon as they are exposed to a DP Tensor, they should be converted to PhiTensors.

Either way, we should still be able to pass the data between the various layers.
<hr>

In [7]:
class Convolution(Layer):
    """
    2-D convolution (no padding) followed by a leaky ReLU activation.

    If this is the first layer in a model, provide the keyword argument
    `input_shape` as a 4-tuple `(batch size, channels, height, width)`,
    e.g. `input_shape=(64, 3, 128, 128)` for batches of 64 128x128 RGB
    pictures. `connect_to` asserts that the shape has exactly 4 entries.

    Parameters
    ----------
    nb_filter : int
        Number of output feature maps.
    filter_size : int or tuple of 2 ints
        Filter height and width; a single int means a square filter.
    input_shape : tuple of 4 ints, optional
        Only needed when there is no previous layer to take the shape from.
    stride : int
        Step between consecutive filter positions.
    """

    def __init__(self, nb_filter, filter_size, input_shape=None, stride=1):
        self.nb_filter = nb_filter
        self.filter_size = filter_size
        self.input_shape = input_shape
        self.stride = stride

        # Filters / bias and their gradients; created by connect_to / backward.
        self.W, self.dW = None, None
        self.b, self.db = None, None
        self.out_shape = None
        self.last_output = None
        self.last_input = None

        self.init = XavierInitialization()
        self.activation = leaky_ReLU()

    def _filter_dims(self):
        """Return ``(filter_height, filter_width)`` parsed from ``filter_size``."""
        if isinstance(self.filter_size, tuple):
            filter_height, filter_width = self.filter_size
        elif isinstance(self.filter_size, int):
            filter_height = filter_width = self.filter_size
        else:
            raise NotImplementedError
        return filter_height, filter_width

    def connect_to(self, prev_layer=None):
        """Compute the output shape and initialise the filters and bias.

        The input shape is ``self.input_shape`` when ``prev_layer`` is None,
        otherwise ``prev_layer.out_shape``.
        """
        if prev_layer is None:
            assert self.input_shape is not None
            input_shape = self.input_shape
        else:
            input_shape = prev_layer.out_shape

        # input_shape: (batch size, num input feature maps, image height, image width)
        assert len(input_shape) == 4

        nb_batch, pre_nb_filter, pre_height, pre_width = input_shape
        filter_height, filter_width = self._filter_dims()

        height = (pre_height - filter_height) // self.stride + 1
        width = (pre_width - filter_width) // self.stride + 1

        # output shape
        self.out_shape = (nb_batch, self.nb_filter, height, width)

        # filters
        self.W = self.init((self.nb_filter, pre_nb_filter, filter_height, filter_width))
        self.b = np.zeros((self.nb_filter,))

    def forward(self, input: PhiTensor, *args, **kwargs):
        """Convolve ``input`` with the filters, add the bias and activate.

        ``connect_to`` must have been called first, because the spatial size
        of the output is read from ``self.out_shape``. The input and the
        activated output are cached for ``backward``.
        """
        self.last_input = input

        # TODO: This could fail if the DP Tensor has < 4 dimensions

        # shape (the unpacking itself requires a 4-D input)
        nb_batch, _, _, _ = input.shape
        filter_height, filter_width = self._filter_dims()
        new_img_h, new_img_w = self.out_shape[2:]

        # init
        outputs = np.zeros((nb_batch, self.nb_filter, new_img_h, new_img_w))

        # convolution operation
        for x in np.arange(nb_batch):
            for y in np.arange(self.nb_filter):
                for h in np.arange(new_img_h):
                    for w in np.arange(new_img_w):
                        h_shift, w_shift = h * self.stride, w * self.stride
                        # patch: (input_depth, filter_h, filter_w)
                        patch = input[x, :, h_shift: h_shift + filter_height, w_shift: w_shift + filter_width]
                        outputs[x, y, h, w] = np.sum(patch.child * self.W[y]) + self.b[y]

        # nonlinear activation
        # self.last_output: (nb_batch, output_depth, image height, image width)

        # TODO: Min/max vals are direct function of private data- fix this when we have time
        # NOTE(review): data_subjects is filled with zeros here instead of
        # being derived from the input's data subjects - confirm this is intended.
        outputs = PhiTensor(
            child=outputs,data_subjects=np.zeros_like(outputs),
            min_vals=outputs.min(), max_vals=outputs.max()
        )
        self.last_output = self.activation.forward(outputs)

        return self.last_output

    def backward(self, pre_grad, *args, **kwargs):
        """Compute ``dW`` and ``db`` and return the gradient w.r.t. the input.

        Parameters
        ----------
        pre_grad : PhiTensor
            Gradient w.r.t. this layer's output; must have the same shape as
            the output of the last ``forward`` call.

        Returns
        -------
        PhiTensor or None
            Gradient w.r.t. the layer input, or ``None`` when this is the
            first layer (no earlier layer needs it).
        """
        # shape
        assert pre_grad.shape == self.last_output.shape
        nb_batch, input_depth, old_img_h, old_img_w = self.last_input.shape
        new_img_h, new_img_w = self.out_shape[2:]
        filter_height, filter_width = self._filter_dims()

        # gradients
        self.dW = np.zeros((self.W.shape))
        self.db = np.zeros((self.b.shape))
        # Uses the derivative at the input cached by the activation's forward().
        delta = pre_grad * self.activation.derivative()

        # dW
        for r in np.arange(self.nb_filter):
            for t in np.arange(input_depth):
                for h in np.arange(filter_height):
                    for w in np.arange(filter_width):
                        input_window = self.last_input[:, t,
                                       h:old_img_h - filter_height + h + 1:self.stride,
                                       w:old_img_w - filter_width + w + 1:self.stride]
                        delta_window = delta[:, r]
                        self.dW[r, t, h, w] = ((input_window * delta_window).sum() * (1/nb_batch)).child
        # db
        for r in np.arange(self.nb_filter):
            self.db[r] = (delta[:, r].sum() * (1/nb_batch)).child

        # dX
        # Bound up front so the return below is valid for the first layer too;
        # without this the method raised UnboundLocalError when first_layer is True.
        layer_grads = None
        if not self.first_layer:
            layer_grads = self.last_input.zeros_like()
            for b in np.arange(nb_batch):
                for r in np.arange(self.nb_filter):
                    for t in np.arange(input_depth):
                        for h in np.arange(new_img_h):
                            for w in np.arange(new_img_w):
                                h_shift, w_shift = h * self.stride, w * self.stride
                                temp = layer_grads[b, t, h_shift:h_shift + filter_height, w_shift:w_shift + filter_width]
                                layer_grads[b, t, h_shift:h_shift + filter_height, w_shift:w_shift + filter_width] = temp+ (delta[b, r, h, w] * self.W[r, t])

        return layer_grads

    @property
    def params(self):
        """Filters and bias as the tuple ``(W, b)``."""
        return self.W, self.b

    @property
    def grads(self):
        """Gradients from the last ``backward`` call as the tuple ``(dW, db)``."""
        return self.dW, self.db

In [19]:
class BatchNorm(Layer):
    """Batch normalisation over the batch axis (axis 0) with scale and shift.

    Parameters
    ----------
    epsilon : float
        Added to the variance before the square root for numerical stability.
    momentum : float
        Stored on the instance; not used by the code in this class.
    axis : int
        Stored on the instance; the statistics are always taken over axis 0.
    """

    def __init__(self, epsilon=1e-6, momentum=0.9, axis=0):
        self.epsilon = epsilon
        self.momentum = momentum
        self.axis = axis

        self.beta, self.dbeta = None, None
        self.gamma, self.dgamma = None, None
        self.cache = None

    def connect_to(self, prev_layer):
        """Create ``beta`` (zeros) and ``gamma`` (ones) sized to the last axis
        of ``prev_layer.out_shape``."""
        n_in = prev_layer.out_shape[-1]
        self.beta = np.zeros((n_in,))
        self.gamma = np.ones((n_in,))

    def forward(self, input: PhiTensor, *args, **kwargs):
        """Normalise ``input`` over the batch axis, then scale and shift it.

        Also records ``self.out_shape`` and caches the intermediates that
        ``backward`` needs.
        """
        self.out_shape = input.shape

        # step1: subtract the batch mean
        xmu = input - input.mean(axis=0)
        # step2/3: batch variance as the mean of the squared deviations.
        # (The standard deviation must not be used here: the value goes under
        # a square root below and backward() differentiates mean(xmu ** 2).)
        var = (xmu * xmu).mean(axis=0)
        # step4: standard deviation with epsilon for stability, and its inverse
        sqrtvar = (var + self.epsilon).sqrt()
        ivar = sqrtvar.reciprocal()
        # step5: normalization->x^
        xhat = xmu * ivar

        # step6: scale and shift
        gammax = xhat * self.gamma
        out = gammax + self.beta

        self.cache = (xhat, xmu, ivar, sqrtvar, var)
        return out

    def backward(self, pre_grad, *args, **kwargs):
        """
        Back-propagate ``pre_grad`` through the normalisation.

        Stores ``dbeta`` and ``dgamma`` and returns the gradient w.r.t. the
        layer input. The steps follow the staged derivation in:
        https://kratzert.github.io/2016/02/12/understanding-the-
        gradient-flow-through-the-batch-normalization-layer.html

        Note:
            - The np.ones() broadcasts of that derivation were removed in a
              few places where they did not seem to make a difference.
            - Kernel crashes were observed on an 8GB machine when running
              this, perhaps because of the many large temporaries.
        """
        xhat, xmu, ivar, sqrtvar, var = self.cache

        # Only the batch size is needed, so inputs of any rank are accepted.
        N = pre_grad.shape[0]

        # step6
        # NOTE(review): dbeta/dgamma are reduced over axis 0 only, while
        # beta/gamma are 1-D over the last axis - confirm the intended shapes.
        self.dbeta = pre_grad.sum(axis=0)
        dgammax = pre_grad
        self.dgamma = (dgammax * xhat).sum( axis=0)
        dxhat = dgammax * self.gamma

        # step5
        divar = (dxhat * xmu).sum(axis=0)
        dxmu1 = dxhat * ivar

        # step4
        dsqrtvar = -1. / (sqrtvar * sqrtvar) * divar
        # d sqrt(var + eps) / d var = 0.5 / sqrt(var + eps) = 0.5 * ivar
        dvar = dsqrtvar * 0.5 * ivar

        # step3
        dxmu2 = xmu * dvar * (2/N)

        # step2
        dx1 = (dxmu1 + dxmu2)

        # step1
        dmu = (dxmu1 + dxmu2).sum(axis=0) * -1
        dx2 = dmu * (1/N)

        # step0 done!
        dx = dx1 + dx2

        return dx

    @property
    def params(self):
        """Shift and scale parameters as the tuple ``(beta, gamma)``."""
        return self.beta, self.gamma

    @property
    def grads(self):
        """Gradients from the last ``backward`` call as ``(dbeta, dgamma)``.

        Overrides ``Layer.grads``; under the old misspelled name the base
        class kept returning an empty list for this layer.
        """
        return self.dbeta, self.dgamma

    @property
    def grades(self):
        """Misspelled alias of ``grads``, kept for backward compatibility."""
        return self.grads

In [9]:
def test_bnc():
    """Smoke test: Convolution -> BatchNorm forward, then BatchNorm backward.

    Returns whatever ``BatchNorm.backward`` produces for a random gradient.
    """
    shape = (1, 1, 50, 50)

    smol_data = PhiTensor(
        child=np.random.rand(*shape)*255,
        data_subjects=np.zeros(shape), min_vals=0, max_vals=255)

    c = Convolution(3, 3, input_shape=shape)
    c.connect_to()
    bn = BatchNorm()
    bn.connect_to(c)
    c_out = c.forward(smol_data)
    output = bn.forward(c_out)

    # Size the gradient from the forward output so it matches the number of
    # filters; a hard-coded one-channel shape only works through broadcasting.
    grad_shape = output.shape
    grad_signal = PhiTensor(
        child=np.random.random(grad_shape)*255,
        data_subjects=np.zeros(grad_shape), min_vals=0, max_vals=255)
    return bn.backward(grad_signal)
    
# test_bnc()

In [10]:
# test_bnc()

In [11]:
# test_bnc()

In [42]:
class AvgPool(Layer):
    """Average pooling over non-overlapping windows of the last two axes.

    Parameters
    ----------
    pool_size : tuple of 2 integers,
        factors by which to downscale (vertical, horizontal).
        (2, 2) will halve the image in each dimension.

    Inputs may be 3-D ``(batch, rows, cols)`` or 4-D
    ``(batch, channels, rows, cols)``; the output keeps the leading axes and
    has the pooled rows and columns.
    """

    def __init__(self, pool_size):
        self.pool_size = pool_size

        self.out_shape = None
        self.input_shape = None

    def connect_to(self, prev_layer):
        """Derive ``out_shape`` from ``prev_layer.out_shape``.

        Asserts that the previous output is 3-D or 4-D and that its height
        and width are multiples of the pool size.
        """
        assert 5 > len(prev_layer.out_shape) >= 3

        old_h, old_w = prev_layer.out_shape[-2:]
        pool_h, pool_w = self.pool_size
        new_h, new_w = old_h // pool_h, old_w // pool_w

        assert old_h % pool_h == old_w % pool_w == 0

        self.out_shape = prev_layer.out_shape[:-2] + (new_h, new_w)

    def forward(self, input: PhiTensor, *args, **kwargs):
        """Return the mean of each pooling window of ``input``.

        Raises
        ------
        ValueError
            If ``input`` is neither 3-D nor 4-D.
        """
        # shape
        self.input_shape = input.shape
        pool_h, pool_w = self.pool_size
        new_h, new_w = self.out_shape[-2:]

        # forward
        outputs = np.zeros(self.input_shape[:-2] + self.out_shape[-2:])
        # NOTE(review): the 0/1 bounds are placeholders for the empty buffer;
        # whether item assignment refreshes them depends on PhiTensor - confirm.
        outputs = PhiTensor(child=outputs, data_subjects=np.zeros_like(outputs), min_vals=0, max_vals=1)

        ndim = len(input.shape)
        if ndim == 4:
            nb_batch, nb_axis, _, _ = input.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            # Windows start at multiples of the pool size so
                            # they tile the input, matching backward().
                            h_shift, w_shift = h * pool_h, w * pool_w
                            outputs[a, b, h, w] = input[a, b, h_shift:h_shift + pool_h, w_shift:w_shift + pool_w].mean()

        elif ndim == 3:
            nb_batch, _, _ = input.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        h_shift, w_shift = h * pool_h, w * pool_w
                        outputs[a, h, w] = input[a, h_shift:h_shift + pool_h, w_shift:w_shift + pool_w].mean()

        else:
            raise ValueError("AvgPool expects a 3-D or 4-D input")

        return outputs

    def backward(self, pre_grad: PhiTensor, *args, **kwargs):
        """Spread each output gradient evenly over its pooling window.

        Raises
        ------
        ValueError
            If ``pre_grad`` is neither 3-D nor 4-D.
        """
        new_h, new_w = self.out_shape[-2:]
        pool_h, pool_w = self.pool_size
        # Number of input cells that contributed to each output cell.
        length = np.prod(self.pool_size)

        layer_grads = np.zeros(self.input_shape)
        layer_grads = PhiTensor(child=layer_grads, data_subjects=np.zeros_like(layer_grads), min_vals=0, max_vals=1)

        ndim = len(pre_grad.shape)

        if ndim == 4:
            nb_batch, nb_axis, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            h_shift, w_shift = h * pool_h, w * pool_w
                            layer_grads[a, b, h_shift: h_shift + pool_h, w_shift: w_shift + pool_w] = \
                                pre_grad[a, b, h, w] * (1/length)

        elif ndim == 3:
            nb_batch, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        h_shift, w_shift = h * pool_h, w * pool_w
                        layer_grads[a, h_shift: h_shift + pool_h, w_shift: w_shift + pool_w] = \
                            pre_grad[a, h, w] * (1/length)

        else:
            raise ValueError("AvgPool expects a 3-D or 4-D gradient")

        return layer_grads


class MaxPool(Layer):
    """Max pooling over non-overlapping windows of the last two axes.

    Parameters
    ----------
    pool_size : tuple of 2 integers,
        factors by which to downscale (vertical, horizontal).
        (2, 2) will halve the image in each dimension.

    Inputs may be 3-D ``(batch, rows, cols)`` or 4-D
    ``(batch, channels, rows, cols)``; the output keeps the leading axes and
    has the pooled rows and columns.
    """
    def __init__(self, pool_size):
        self.pool_size = pool_size

        self.input_shape = None
        self.out_shape = None
        self.last_input = None

    def connect_to(self, prev_layer):
        """Derive ``out_shape`` from ``prev_layer.out_shape``.

        Asserts that the previous output has at least 3 dimensions and that
        its height and width are multiples of the pool size.
        """
        # prev_layer.out_shape: (nb_batch, ..., height, width)
        assert len(prev_layer.out_shape) >= 3

        old_h, old_w = prev_layer.out_shape[-2:]
        pool_h, pool_w = self.pool_size
        new_h, new_w = old_h // pool_h, old_w // pool_w

        assert old_h % pool_h == old_w % pool_w == 0

        self.out_shape = prev_layer.out_shape[:-2] + (new_h, new_w)

    def forward(self, input, *args, **kwargs):
        """Return the maximum of each pooling window of ``input``.

        The input is cached so ``backward`` can locate the maxima again.

        Raises
        ------
        ValueError
            If ``input`` is neither 3-D nor 4-D.
        """
        # shape
        self.input_shape = input.shape
        pool_h, pool_w = self.pool_size
        new_h, new_w = self.out_shape[-2:]

        # forward
        self.last_input = input
        outputs = np.zeros(self.input_shape[:-2] + self.out_shape[-2:])
        # NOTE(review): the 0/1 bounds are placeholders for the empty buffer;
        # whether item assignment refreshes them depends on PhiTensor - confirm.
        outputs = PhiTensor(child=outputs, data_subjects=np.zeros_like(outputs), min_vals=0, max_vals=1)

        ndim = len(input.shape)

        if ndim == 4:
            nb_batch, nb_axis, _, _ = input.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            # Windows start at multiples of the pool size so
                            # they tile the input without overlapping.
                            h_start, w_start = h * pool_h, w * pool_w
                            outputs[a, b, h, w] = input[a, b, h_start:h_start + pool_h, w_start:w_start + pool_w].max()

        elif ndim == 3:
            nb_batch, _, _ = input.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        h_start, w_start = h * pool_h, w * pool_w
                        outputs[a, h, w] = input[a, h_start:h_start + pool_h, w_start:w_start + pool_w].max()

        else:
            raise ValueError("MaxPool expects a 3-D or 4-D input")

        return outputs

    def backward(self, pre_grad, *args, **kwargs):
        """Route each output gradient to the position of its window's maximum.

        All other input positions receive a zero gradient.

        Raises
        ------
        ValueError
            If ``pre_grad`` is neither 3-D nor 4-D.
        """
        new_h, new_w = self.out_shape[-2:]
        pool_h, pool_w = self.pool_size

        layer_grads = np.zeros(self.input_shape)
        layer_grads = PhiTensor(child=layer_grads, data_subjects=np.zeros_like(layer_grads), min_vals=0, max_vals=1)

        ndim = len(pre_grad.shape)

        if ndim == 4:
            nb_batch, nb_axis, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for b in np.arange(nb_axis):
                    for h in np.arange(new_h):
                        for w in np.arange(new_w):
                            # The patch is the same window forward() reduced,
                            # so the argmax offset is relative to its corner.
                            h_start, w_start = h * pool_h, w * pool_w
                            patch = self.last_input[a, b, h_start:h_start + pool_h, w_start:w_start + pool_w]
                            max_idx = patch.unravel_argmax()

                            h_shift, w_shift = h_start + max_idx[0], w_start + max_idx[1]
                            layer_grads[a, b, h_shift, w_shift] = pre_grad[a, b, h, w]

        elif ndim == 3:
            nb_batch, _, _ = pre_grad.shape

            for a in np.arange(nb_batch):
                for h in np.arange(new_h):
                    for w in np.arange(new_w):
                        h_start, w_start = h * pool_h, w * pool_w
                        patch = self.last_input[a, h_start:h_start + pool_h, w_start:w_start + pool_w]
                        max_idx = patch.unravel_argmax()
                        h_shift, w_shift = h_start + max_idx[0], w_start + max_idx[1]
                        layer_grads[a, h_shift, w_shift] = pre_grad[a, h, w]

        else:
            raise ValueError("MaxPool expects a 3-D or 4-D gradient")

        return layer_grads

In [13]:
# Create a 2x2 average-pooling layer; it is not connected to a previous layer here.
avg = AvgPool((2,2))

In [44]:
def test_c_bn_avg():
    """Smoke test: Convolution -> BatchNorm -> AvgPool forward, AvgPool backward.

    The pooled output is reused as the incoming gradient for the backward pass.
    """
    input_shape = (10, 3, 6, 6)

    images = PhiTensor(
        child=np.random.rand(*input_shape)*255,
        data_subjects=np.zeros(input_shape), min_vals=0, max_vals=255)

    conv = Convolution(3, 3, input_shape=input_shape)
    conv.connect_to()
    norm = BatchNorm()
    norm.connect_to(conv)
    normed = norm.forward(conv.forward(images))

    # BatchNorm only sets out_shape in forward(), so pooling is wired afterwards.
    pool = AvgPool((2,2))
    pool.connect_to(norm)
    pooled = pool.forward(normed)
    return pool.backward(pooled)

assert isinstance(test_c_bn_avg(), PhiTensor)

In [46]:
def test_c_bn_max():
    """Smoke test: Convolution -> BatchNorm -> MaxPool forward, MaxPool backward.

    The pooled output is reused as the incoming gradient for the backward pass.
    """
    input_shape = (10, 3, 6, 6)

    images = PhiTensor(
        child=np.random.rand(*input_shape)*255,
        data_subjects=np.zeros(input_shape), min_vals=0, max_vals=255)

    conv = Convolution(3, 3, input_shape=input_shape)
    conv.connect_to()
    norm = BatchNorm()
    norm.connect_to(conv)
    normed = norm.forward(conv.forward(images))

    # BatchNorm only sets out_shape in forward(), so pooling is wired afterwards.
    pool = MaxPool((2,2))
    pool.connect_to(norm)
    pooled = pool.forward(normed)
    return pool.backward(pooled)

assert isinstance(test_c_bn_max(), PhiTensor)

In [78]:
class Linear(Layer):
    """Fully connected layer computing ``input.dot(W) + b``.

    Parameters
    ----------
    n_out : int
        Number of output units.
    n_in : int, optional
        Number of input units; only needed when there is no previous layer.
    """

    def __init__(self, n_out, n_in=None):
        self.n_in = n_in
        self.n_out = n_out
        # The batch dimension is not known up front, hence None.
        self.out_shape = (None, n_out)

        self.init = XavierInitialization()
        self.last_input = None
        # Weights, bias and their gradients are filled in later.
        self.W = self.b = None
        self.dW = self.db = None

    def connect_to(self, prev_layer=None):
        """Initialise ``W`` and ``b`` from the input width.

        The width is ``self.n_in`` when ``prev_layer`` is None, otherwise the
        last entry of ``prev_layer.out_shape`` (which must be 2-D).
        """
        if prev_layer is not None:
            assert len(prev_layer.out_shape) == 2
            fan_in = prev_layer.out_shape[-1]
        else:
            assert self.n_in is not None
            fan_in = self.n_in

        self.W = self.init((fan_in, self.n_out))
        self.b = np.zeros((self.n_out,))

    def forward(self, input: PhiTensor, *args, **kwargs):
        """Cache ``input`` and return its affine transform."""
        self.last_input = input
        projected = input.dot(self.W)
        return projected + self.b

    def backward(self, pre_grad: PhiTensor, *args, **kwargs):
        """Store ``dW`` / ``db`` and return the gradient w.r.t. the input.

        Returns ``None`` when this is the first layer.
        """
        # Open question from the author: should this transpose only the last two axes?
        transposed_input = self.last_input.T
        self.dW = transposed_input.dot(pre_grad)
        self.db = pre_grad.mean(axis=0)
        if self.first_layer:
            return None
        return pre_grad.dot(self.W.T)

    @property
    def params(self):
        """Weights and bias as the tuple ``(W, b)``."""
        return self.W, self.b

    @property
    def grads(self):
        """Gradients from the last ``backward`` call as ``(dW, db)``."""
        return self.dW, self.db

In [80]:
def test_lin():
    """Smoke test: one forward and one backward pass through a Linear layer.

    Prints the shape of the transposed cached input and returns the result of
    ``Linear.backward``.
    """
    # Works for 2D symmetrical shapes like (10, 10)
    data_shape = (10, 10)
    grad_shape = (10, 10)

    inputs = PhiTensor(
        child=np.random.rand(*data_shape)*255,
        data_subjects=np.zeros(data_shape), min_vals=0, max_vals=255)

    layer = Linear(n_out=10)
    layer.n_in = 10
    layer.connect_to()
    # The forward output is not needed; the call caches the input for backward().
    layer.forward(inputs)

    incoming_grad = PhiTensor(
        child=np.random.rand(*grad_shape)*255,
        data_subjects=np.zeros(grad_shape), min_vals=0, max_vals=255)
    print(layer.last_input.T.shape)

    return layer.backward(incoming_grad)

assert isinstance(test_lin(), PhiTensor)


(10, 10)


In [None]:
class Optimizer():
    """Abstract optimizer base class.

    Parameters
    ----------
    lr : float
        The learning rate controlling the size of update steps
    clip : float
        If smaller than 0, do not apply parameter clip. Stored on the
        instance; this base class does not use it.
    decay : float
        Learning-rate decay factor. On every ``update`` the learning rate is
        multiplied by ``1 / (1 + decay * iterations)``; 0 disables the decay.
    lr_min : float
        When adapting step rates, do not move below this value. Default is 0.
    lr_max : float
        When adapting step rates, do not move above this value. Default is inf.
    """

    def __init__(self, lr=0.001, clip=-1, decay=0., lr_min=0., lr_max=np.inf):
        self.lr = lr
        self.clip = clip
        self.decay = decay
        self.lr_min = lr_min
        self.lr_max = lr_max

        # Number of update() calls made so far.
        self.iterations = 0

    def update(self, params, grads):
        """Advance the iteration counter and decay/clip the learning rate.

        ``params`` and ``grads`` are accepted for interface compatibility
        with subclasses; this base implementation does not read them.
        """
        self.iterations += 1

        # The divisor must be parenthesised: without it the expression is
        # (1 / 1) + decay * iterations, which grows the rate instead.
        self.lr *= 1. / (1 + self.decay * self.iterations)
        self.lr = np.clip(self.lr, self.lr_min, self.lr_max)

    def __str__(self):
        return self.__class__.__name__

In [None]:
def dp_maximum(x, y):
    """Element-wise maximum of a PhiTensor ``x`` and ``y``.

    ``y`` may be another tensor exposing ``.child`` or a plain value or array.
    The result reuses ``x``'s one-hot lookup with an all-zero data-subject
    index, and takes its bounds from the computed values themselves.
    """
    # Use the wrapped data when y is a tensor, otherwise y itself.
    other = getattr(y, "child", y)
    result = np.maximum(x.child, other)

    lower = result.min()
    upper = result.max()
    subjects = DataSubjectList(
        one_hot_lookup=x.data_subjects.one_hot_lookup,
        data_subjects_indexed=np.zeros_like(result),
    )
    return PhiTensor(
        child=result,
        data_subjects=subjects,
        min_vals=lower,
        max_vals=upper,
    )

In [None]:
class Adamax(Optimizer):
    """
    Adamax optimizer (the infinity-norm variant of Adam).

    Parameters
    ----------
    beta1 : float
        Exponential decay rate for the first moment estimates.
    beta2 : float
        Exponential decay rate for the second moment estimates.
    epsilon : float
        Constant for numerical stability.
    References
    ----------
    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
           Adam: A Method for Stochastic Optimization.
           arXiv preprint arXiv:1412.6980.
    """

    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, *args, **kwargs):
        super(Adamax, self).__init__(*args, **kwargs)

        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        # Per-parameter first-moment and infinity-norm estimates, created
        # lazily on the first update().
        self.ms = None
        self.vs = None

    def update(self, params, grads):
        """Run one Adamax step.

        Parameters
        ----------
        params : sequence
            Parameters to update; each must provide ``zeros_like()`` and the
            arithmetic used below.
        grads : sequence
            Gradients matching ``params``; each must provide ``abs()``.

        Returns
        -------
        list
            The updated parameters, in the order of ``params``. The step
            builds new objects instead of modifying the inputs in place, so
            the owner of a parameter has to store the returned value. When
            ``params`` is a list its entries are also replaced.
        """
        # init
        self.iterations += 1
        a_t = self.lr / (1 - np.power(self.beta1, self.iterations))
        if self.ms is None:
            self.ms = [p.zeros_like() for p in params]
        if self.vs is None:
            self.vs = [p.zeros_like() for p in params]

        # update parameters
        updated = []
        for i, (m, v, p, g) in enumerate(zip(self.ms, self.vs, params, grads)):
            m = m * self.beta1 + g * (1 - self.beta1)
            v = dp_maximum(v * self.beta2, g.abs())
            p = p - m * (v + self.epsilon).reciprocal() * a_t

            self.ms[i] = m
            self.vs[i] = v
            # Keep the result: rebinding the loop variable alone discards it.
            updated.append(p)

        if isinstance(params, list):
            params[:len(updated)] = updated
        return updated

In [None]:
# Create an Adamax optimizer with its default hyper-parameters.
optimizer = Adamax()

In [None]:
# NOTE(review): `b_data` is not defined in the visible part of this notebook -
# presumably it comes from a cell that was removed; confirm before running.
optimizer.update(b_data, b_data)

In [None]:
# Display the first-moment estimates held by the optimizer.
optimizer.ms

In [None]:
def dp_log(input: PhiTensor):
    """Element-wise natural logarithm of a PhiTensor.

    The result reuses the input's one-hot lookup with an all-zero
    data-subject index, and takes its bounds from the computed values.
    """
    logged = np.log(input.child)

    lower = logged.min()
    upper = logged.max()
    subjects = DataSubjectList(
        one_hot_lookup=input.data_subjects.one_hot_lookup,
        data_subjects_indexed=np.zeros_like(logged),
    )
    return PhiTensor(
        child=logged,
        data_subjects=subjects,
        min_vals=lower,
        max_vals=upper,
    )

In [None]:
class BinaryCrossEntropy():
    """Binary cross-entropy loss with clipping for numerical stability."""

    def __init__(self, epsilon=1e-11):
        # Predictions are clipped to [epsilon, 1 - epsilon] before use.
        self.epsilon = epsilon

    def forward(self, outputs, targets):
        """Forward pass.

        .. math:: L = -t \\log(p) - (1 - t) \\log(1 - p)

        The per-element loss is summed over axis 1 and then averaged.

        Parameters
        ----------
        outputs :
            Predictions in (0, 1), such as sigmoidal output of a neural network.
        targets :
            Targets in [0, 1], such as ground truth labels.
        """
        clipped = outputs.clip(self.epsilon, 1 - self.epsilon)
        positive_term = targets * dp_log(clipped)
        negative_term = ((targets * -1) + 1) * dp_log((clipped * -1) + 1)
        per_sample = (positive_term + negative_term).sum(axis=1) * -1
        return per_sample.mean()

    def backward(self, outputs: PhiTensor, targets: PhiTensor):
        """Backward pass: gradient of the loss w.r.t. the predictions.

        Parameters
        ----------
        outputs :
            Predictions in (0, 1), such as sigmoidal output of a neural network.
        targets :
            Targets in [0, 1], such as ground truth labels.
        """
        clipped = outputs.clip(self.epsilon, 1 - self.epsilon)
        complement = (clipped * -1) + 1
        # p * (1 - p), floored at epsilon so the reciprocal stays finite.
        divisor = dp_maximum(clipped * complement, self.epsilon)
        return (clipped - targets) * divisor.reciprocal()

    def __str__(self):
        return type(self).__name__

In [None]:
# Create the loss object with its default clipping epsilon.
loss_fn = BinaryCrossEntropy()

In [None]:
# Sample data for trying out the loss: random 0/1 targets and random
# predictions in [0, 1), both of shape (2, 10).
input_shape = (2, 10)

target = PhiTensor(
    child=np.random.randint(low=0, high=2, size=input_shape),
    data_subjects=np.zeros(input_shape),
    min_vals=0,
    max_vals=1,
)

prediction = PhiTensor(
    child=np.random.rand(*input_shape),
    data_subjects=np.zeros(input_shape),
    min_vals=0,
    max_vals=1,
)

In [None]:
# Evaluate the loss for the sample prediction/target pair.
loss_fn.forward(prediction, target)

In [None]:
# Evaluate the loss gradient for the sample prediction/target pair.
loss_fn.backward(prediction, target)

In [None]:
class Model():
    """Sequential container that wires layers together and trains them.

    Parameters
    ----------
    layers : list, optional
        Initial layers, in forward order. A new empty list is used when omitted.
    """

    def __init__(self, layers=None):
        self.layers = [] if layers is None else layers

        # Both are set by compile().
        self.loss = None
        self.optimizer = None

    def add(self, layer):
        """Append ``layer`` to the model; it must be a ``Layer`` instance."""
        assert isinstance(layer, Layer), "PySyft doesn't recognize this kind of layer."
        self.layers.append(layer)

    def compile(self, loss=None, optimizer=None):
        """Connect the layers in order and set the loss and optimizer.

        Parameters
        ----------
        loss : optional
            Loss object providing ``forward`` and ``backward``. A new
            ``BinaryCrossEntropy`` is created when omitted.
        optimizer : optional
            Optimizer providing ``update``. A new ``Adamax`` is created when
            omitted, so optimizer state is never shared between models.
        """
        if self.layers:
            self.layers[0].first_layer = True

        prev_layer = None
        for layer in self.layers:
            layer.connect_to(prev_layer)
            prev_layer = layer

        self.loss = BinaryCrossEntropy() if loss is None else loss
        self.optimizer = Adamax() if optimizer is None else optimizer

    def fit(self, X, Y, max_iter=100, batch_size=64, shuffle=True,
            validation_split=0., validation_data=None):
        """Train the model for ``max_iter`` passes over the data.

        Parameters
        ----------
        X, Y :
            Training inputs and targets, indexed along axis 0.
        max_iter : int
            Number of passes over the training data.
        batch_size : int
            Samples per batch; a trailing partial batch is skipped.
        shuffle : bool
            Shuffle ``X`` and ``Y`` in place, in unison, before every pass.
        validation_split : float
            Fraction in (0, 1) of the trailing samples held out for validation.
        validation_data : tuple, optional
            ``(valid_X, valid_Y)``; only used when ``validation_split`` is not
            in (0, 1).
        """
        # prepare data
        train_X = X
        train_Y = Y

        valid_X, valid_Y = None, None
        if 1. > validation_split > 0.:
            split = int(train_Y.shape[0] * validation_split)
            # A zero-sized split must be skipped: slicing with -0 would move
            # every sample to validation and leave nothing to train on.
            if split > 0:
                valid_X, valid_Y = train_X[-split:], train_Y[-split:]
                train_X, train_Y = train_X[:-split], train_Y[:-split]
        elif validation_data is not None:
            valid_X, valid_Y = validation_data

        iter_idx = 0
        while iter_idx < max_iter:
            iter_idx += 1

            # shuffle
            if shuffle:
                # Re-seeding with the same value before each shuffle applies
                # the same permutation to X and Y. This resets the global
                # NumPy random state.
                seed = np.random.randint(111, 1111111)
                np.random.seed(seed)
                np.random.shuffle(train_X)
                np.random.seed(seed)
                np.random.shuffle(train_Y)

            # train
            train_losses, train_predicts, train_targets = [], [], []
            for b in range(train_Y.shape[0] // batch_size):
                batch_begin = b * batch_size
                batch_end = batch_begin + batch_size
                x_batch = train_X[batch_begin:batch_end]
                y_batch = train_Y[batch_begin:batch_end]

                # forward propagation
                y_pred = self.predict(x_batch)

                # backward propagation
                next_grad = self.loss.backward(y_pred, y_batch)
                for layer in reversed(self.layers):
                    next_grad = layer.backward(next_grad)

                # get parameter and gradients
                params = []
                grads = []
                for layer in self.layers:
                    params += layer.params
                    grads += layer.grads

                # update parameters
                self.optimizer.update(params, grads)

                # got loss and predict
                train_losses.append(self.loss.forward(y_pred, y_batch))
                train_predicts.extend(y_pred)
                train_targets.extend(y_batch)

            # output train status
            runout = "iter %d, train-[loss %.4f, acc %.4f]; " % (
                iter_idx, float(np.mean(train_losses)), float(self.accuracy(train_predicts, train_targets)))

            if valid_X is not None and valid_Y is not None:
                # valid
                valid_losses, valid_predicts, valid_targets = [], [], []
                for b in range(valid_X.shape[0] // batch_size):
                    batch_begin = b * batch_size
                    batch_end = batch_begin + batch_size
                    x_batch = valid_X[batch_begin:batch_end]
                    y_batch = valid_Y[batch_begin:batch_end]

                    # forward propagation
                    y_pred = self.predict(x_batch)

                    # got loss and predict
                    valid_losses.append(self.loss.forward(y_pred, y_batch))
                    valid_predicts.extend(y_pred)
                    valid_targets.extend(y_batch)

                # output valid status
                runout += "valid-[loss %.4f, acc %.4f]; " % (
                    float(np.mean(valid_losses)), float(self.accuracy(valid_predicts, valid_targets)))

            # Report the status line; it used to be built and then dropped.
            print(runout)

    def predict(self, X):
        """ Calculate an output Y for the given input X. """
        x_next = X
        for layer in self.layers:
            x_next = layer.forward(x_next)
        return x_next

    def accuracy(self, outputs, targets):
        """Fraction of rows whose arg-max over axis 1 agrees in both inputs."""
        y_predicts = np.argmax(outputs, axis=1)
        y_targets = np.argmax(targets, axis=1)
        acc = y_predicts == y_targets
        return np.mean(acc)

    def evaluate(self, X, Y):
        """Not implemented yet."""
        raise NotImplementedError()

In [None]:
# Construct an empty model (no layers) to check that instantiation works.
Model()