# Exercise 2
## 3 Application
In the following cell we complete the code from network.py

In [104]:
import numpy as np
from sklearn import datasets
    
####################################

class ReLULayer(object):
    def forward(self, input):
        # remember the input for later backpropagation
        self.input = input
        # return the ReLU of the input
        relu = np.max(np.array([np.zeros(np.shape(self.input)), self.input]), axis=0) #We compute max([0,input]) componentwise
        return relu

    def backward(self, upstream_gradient):
        # compute the derivative of ReLU from upstream_gradient and the stored input
        downstream_gradient = np.heaviside(self.input) * upstream_gradient # your code here                                         ###########################
        return downstream_gradient

    def update(self, learning_rate):
        pass # ReLU is parameter-free

####################################

class OutputLayer(object):
    def __init__(self, n_classes):
        self.n_classes = n_classes

    def forward(self, input):
        # remember the input for later backpropagation
        self.input = input
        # return the softmax of the input
        softmax = np.exp(self.input) / np.sum(np.exp(self.input), axis=1).reshape((self.input).shape[0], 1) # We compute the softmax function componentwise
        return softmax

    def backward(self, predicted_posteriors, true_labels):
        # return the loss derivative with respect to the stored inputs
        # (use cross-entropy loss and the chain rule for softmax,
        #  as derived in the lecture)
                  
        #compute the predicted label assuming the class labels starting from 0, then compute where it is equal to the training set
        M,N = predicted_posteriors.shape
        classes = np.repeat([np.arange(N)], M, axis=0)
        true_labels = np.repeat([true_labels], N, axis=0).T
        equal_labels = np.array(true_labels==classes,dtype=int)            
                   
        downstream_gradient = (predicted_posteriors-1) * equal_labels + predicted_posteriors * (1 - equal_labels)  # your code here                                     #################################
        return downstream_gradient

    def update(self, learning_rate):
        pass # softmax is parameter-free

####################################

class LinearLayer(object):
    def __init__(self, n_inputs, n_outputs):
        self.n_inputs  = n_inputs
        self.n_outputs = n_outputs
        # randomly initialize weights and intercepts
        self.B = np.random.normal(loc=0, scale=np.sqrt(2/(n_input+1)), size=(n_output, n_input)) # We initialize the weights and bias according to the weight initialization for ReLU networks in the lecture notes
        self.b = np.random.normal(loc=0, scale=np.sqrt(2/(n_input+1)), size=(n_output,))

    def forward(self, input):
        # remember the input for later backpropagation
        self.input = input
        # compute the scalar product of input and weights
        # (these are the preactivations for the subsequent non-linear layer)
        preactivations = (self.B@self.input.T).T + self.b #We compute the scalar product of weights and input and add the bias vector
        return preactivations

    def backward(self, upstream_gradient):
        # compute the derivative of the weights from
        # upstream_gradient and the stored input
                      
        self.grad_b = np.mean(upstream_gradient, axis=0) # your code here
        self.grad_B = ((self.input).T @ upstream_gradient) / (self.input).shape[0] #we compute the mean over all instaces in the mini-batch of the loss derivatives according to the lecture notes
        # compute the downstream gradient to be passed to the preceding layer
        downstream_gradient = upstream_gradient @ (self.B).T #we compute the downstream gradient, which corresponds to \delta_{l-1} in the lecture notes
        return downstream_gradient

    def update(self, learning_rate):
        # update the weights by batch gradient descent
        self.B = self.B - learning_rate * self.grad_B
        self.b = self.b - learning_rate * self.grad_b

####################################

class MLP(object):
    def __init__(self, n_features, layer_sizes):
        # constuct a multi-layer perceptron
        # with ReLU activation in the hidden layers and softmax output
        # (i.e. it predicts the posterior probability of a classification problem)
        #
        # n_features: number of inputs
        # len(layer_size): number of layers
        # layer_size[k]: number of neurons in layer k
        # (specifically: layer_sizes[-1] is the number of classes)
        self.n_layers = len(layer_sizes)
        self.layers   = []

        # create interior layers (linear + ReLU)
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            self.layers.append(LinearLayer(n_in, n_out))
            self.layers.append(ReLULayer())
            n_in = n_out

        # create last linear layer + output layer
        n_out = layer_sizes[-1]
        self.layers.append(LinearLayer(n_in, n_out))
        self.layers.append(OutputLayer(n_out))

    def forward(self, X):
        # X is a mini-batch of instances
        batch_size = X.shape[0]
        # flatten the other dimensions of X (in case instances are images)
        X = X.reshape(batch_size, -1)

        # compute the forward pass
        # (implicitly stores internal activations for later backpropagation)
        result = X
        for layer in self.layers:
            result = layer.forward(result)
        return result

    def backward(self, predicted_posteriors, true_classes):
        # perform backpropagation w.r.t. the prediction for the latest mini-batch X
        ... # your code here

    def update(self, X, Y, learning_rate):
        posteriors = self.forward(X)
        self.backward(posteriors, Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        N = len(x)
        n_batches = N // batch_size
        for i in range(n_epochs):
            # print("Epoch", i)
            # reorder data for every epoch
            # (i.e. sample mini-batches without replacement)
            permutation = np.random.permutation(N)

            for batch in range(n_batches):
                # create mini-batch
                start = batch * batch_size
                x_batch = x[permutation[start:start+batch_size]]
                y_batch = y[permutation[start:start+batch_size]]

                # perform one forward and backward pass and update network parameters
                self.update(x_batch, y_batch, learning_rate)

##################################

if __name__=="__main__":

    # set training/test set size
    N = 2000

    # create training and test data
    X_train, Y_train = datasets.make_moons(N, noise=0.05)
    X_test,  Y_test  = datasets.make_moons(N, noise=0.05)
    n_features = 2
    n_classes  = 2

    # standardize features to be in [-1, 1]
    offset  = X_train.min(axis=0)
    scaling = X_train.max(axis=0) - offset
    X_train = ((X_train - offset) / scaling - 0.5) * 2.0
    X_test  = ((X_test  - offset) / scaling - 0.5) * 2.0

    # set hyperparameters (play with these!)
    layer_sizes = [5, 5, n_classes]
    n_epochs = 5
    batch_size = 200
    learning_rate = 0.05

    # create network
    network = MLP(n_features, layer_sizes)

    # train
    network.train(X_train, Y_train, n_epochs, batch_size, learning_rate)

    # test
    predicted_posteriors = network.forward(X_test)
    # determine class predictions from posteriors by winner-takes-all rule
    predicted_classes = ... # your code here
    # compute and output the error rate of predicted_classes
    error_rate = ... # your code here
    print("error rate:", error_rate)



NameError: name 'n_input' is not defined

In [100]:
import numpy as np


X = np.array([[0,100,0],[0,100,0]])
true_labels = np.array([1,0])

predicted_posteriors = np.exp(X) / np.sum(np.exp(X), axis=1).reshape((X).shape[0], 1)

M,N = predicted_posteriors.shape
classes = np.repeat([np.arange(N)], M, axis=0)
true_labels = np.repeat([true_labels], N, axis=0).T
equal_labels = np.array(true_labels==classes,dtype=int)

downstream_gradient = (predicted_posteriors-1) * equal_labels + predicted_posteriors * (1 - equal_labels)  
print(downstream_gradient)
print(np.array([np.mean(downstream_gradient, axis=0)]))

[[ 3.72007598e-44  0.00000000e+00  3.72007598e-44]
 [-1.00000000e+00  1.00000000e+00  3.72007598e-44]]
[[-5.00000000e-01  5.00000000e-01  3.72007598e-44]]


Annahme: input hat shape (6,)
