## Number 1

## Number 3

In [351]:
import numpy as np
from sklearn import datasets
# ###################################
class ReLULayer ( object ):
    def forward (self , input ):
        # remember the input for later backpropagation
        self.input = input
        # return the ReLU of the inpu't
        # basic relu implementation
        relu = np.maximum(0, input)
        return relu
    def backward (self , upstream_gradient ) :
        # compute the derivative of ReLU from upstream_gradient and the stored input
        # look for which input relu was called. if the input was <= 0, relu was not aplied, and hence derivative needs also be zero
        # if input > 0, we need to set the derivative to 1
        derivative = np.where(self.input > 0, 1, 0)
        downstream_gradient = upstream_gradient * derivative
        return downstream_gradient
    def update (self , learning_rate ):
        pass # ReLU is parameter - free

In [449]:
class OutputLayer( object ):
    def __init__ (self , n_classes ):
        self.n_classes = n_classes
    def forward (self, input ):
        # remember the input for later backpropagation
        self.input = input
        # return the softmax of the input
        softmax = np.exp(input) / np.sum(np.exp(input))
        return softmax
    def backward(self , predicted_posteriors , true_labels ) :
        # return the loss derivative with respect to the stored inputs
        # (use cross - entropy loss and the chain rule for softmax ,
        # as derived in the lecture )
        
        #calculate cross entropy loss derivative
        loss_derivative = []
        for row,label in enumerate(true_labels):
            sample_loss = []
            for class_idx in range(2):
                if class_idx == label:
                    sample_loss.append(-1/predicted_posteriors[row][class_idx])
                else:
                    sample_loss.append(0)
            loss_derivative.append(sample_loss)        
            
        # calculate softmaxe derivative of the inputs 
        softmax_derivative = []
        for row,label in enumerate(true_labels):
            sample_softmax_derivative = []
            for class_idx in range(2):
                if class_idx == label:
                    sample_softmax_derivative.append(self.input[row][class_idx]-1)
                else:
                    sample_softmax_derivative.append(self.input[row][class_idx])
            softmax_derivative.append(sample_softmax_derivative)
        # mltiply
        softmax_derivative = np.asarray(softmax_derivative)
        loss_derivative = np.asarray(loss_derivative)
        # I do think i need to multiply the derivatives and I also believe that i implemented them according to the lecture.
        # However if I multiply them, i receive a 2 x 2 matrix which is incompatible with the next gradient to compute...
        # Either this is here is false or the LinearLayer gradient calculation
        return softmax_derivative # @ loss_derivative

    def update(self , learning_rate ):
        pass # softmax is parameter - free

In [450]:
class LinearLayer( object ):
    def __init__ (self , n_inputs , n_outputs ):
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        # randomly initialize weights and intercepts for the feature dimensions
        self.B = np.random.normal(size=(n_inputs, n_outputs)) 
        self.b = np.random.normal(size=(n_outputs))
    def forward(self , input ):
        # remember the input for later backpropagation
        self.input = np.asarray(input)
        # compute the scalar product of input and weights
        # ( these are the preactivations for the subsequent non - linear layer )
        # the linear preactivation claclulation of the weights multiplied by the inputs and added the biases
        preactivations = np.matmul(self.input, self.B) + self.b 
        return preactivations
    def backward(self , upstream_gradient ) :
        # compute the derivative of the weights from
        # upstream_gradient and the stored input
        self.grad_b = np.sum(upstream_gradient, axis=0)
        self.grad_B = np.matmul(self.input.T, upstream_gradient)
        # compute the downstream gradient to be passed to the preceding layer
        downstream_gradient = np.matmul(upstream_gradient, self.B.T)
        return downstream_gradient
    def update(self , learning_rate ):
        # update the weights by batch gradient descent
        self.B = self.B - learning_rate * self.grad_B
        self.b = self.b - learning_rate * self.grad_b


In [451]:
class MLP( object ):
    def __init__ (self , n_features , layer_sizes ):
        # constuct a multi - layer perceptron
        # with ReLU activation in the hidden layers and softmax output
        # (i.e. it predicts the posterior probability of a classification problem )
        # n_features : number of inputs
        # len ( layer_size ): number of layers
        # layer_size [k]: number of neurons in layer k
        # ( specifically : layer_sizes [ -1] is the number of classes )
        self.n_layers = len( layer_sizes )
        self.layers = []
        # create interior layers ( linear + ReLU )
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            self.layers.append( LinearLayer(n_in , n_out ))
            self.layers.append( ReLULayer() )
            n_in = n_out
        # create last linear layer + output layer
        n_out = layer_sizes[ -1]
        self.layers.append( LinearLayer(n_in , n_out ))
        self.layers.append( OutputLayer( n_out ))
    def forward (self , X):
        # X is a mini - batch of instances
        batch_size = X.shape[0]
        # flatten the other dimensions of X (in case instances are images )
        X = X.reshape( batch_size , -1)
        # compute the forward pass
        # ( implicitly stores internal activations for later backpropagation )
        result = X
        for layer in self.layers:
            result = layer.forward( result )
        return result
    def backward(self , predicted_posteriors , true_classes ):
        # init result of the last layer
        result = self.layers[-1].backward(predicted_posteriors , true_classes)
        for layer in reversed(self.layers[:-1]):
            # set the result to the according backpropagation function
            result = layer.backward(result)
        return result
    def update(self , X, Y, learning_rate ):
        posteriors = self.forward(X)
        self.backward(posteriors,Y)
        for layer in self.layers :
            layer.update( learning_rate )
    def train(self , x, y, n_epochs , batch_size , learning_rate ):
        N = len (x)
        n_batches = N // batch_size
        for i in range( 2 ):
            print (" Epoch ", i)
            # reorder data for every epoch
            # (i.e. sample mini - batches without replacement )
            permutation = np. random.permutation(N)
            for batch in range( n_batches ):
                # create mini - batch
                start = batch * batch_size
                x_batch = x[ permutation [ start:start + batch_size ]]
                y_batch = y[ permutation [ start:start + batch_size ]]
                # perform one forward and backward pass and update network parameters
                self.update( x_batch , y_batch , learning_rate )

In [452]:
# set training / test set size
N = 2000
# create training and test data
X_train , Y_train = datasets.make_moons (N, noise =0.05)
X_test , Y_test = datasets.make_moons (N, noise =0.05)
n_features = 2
n_classes = 2
# standardize features to be in [ -1 , 1]
offset = X_train.min(axis =0)
scaling = X_train.max(axis =0) - offset
X_train = (( X_train - offset ) / scaling - 0.5) * 2.0
X_test = (( X_test - offset ) / scaling - 0.5) * 2.0
# set hyperparameters ( play with these !)
layer_sizes = [5 , 5, n_classes ]
n_epochs = 5
batch_size = 200
learning_rate = 0.05
# create network
network = MLP( n_features , layer_sizes )
# train
network.train( X_train , Y_train , n_epochs , batch_size , learning_rate )
# test
predicted_posteriors = network.forward( X_test )
print(f"Predicted Posteriors: {predicted_posteriors}")
# determine class predictions from posteriors by winner -takes - all rule

predicted_classes = [ np.argmax(sample) for sample in predicted_posteriors]
# compute and output the error rate of predicted_classes
error_rate = 1 - (len(np.where(Y_test == predicted_classes)) / len(Y_test))
print (" error rate :", error_rate )


 Epoch  0
 Epoch  1
Predicted Posteriors: [[nan  0.]
 [nan  0.]
 [nan  0.]
 ...
 [nan  0.]
 [nan  0.]
 [nan  0.]]
 error rate : 0.9995


  sample_loss.append(-1/predicted_posteriors[row][class_idx])
  softmax = np.exp(input) / np.sum(np.exp(input))
  softmax = np.exp(input) / np.sum(np.exp(input))


In [455]:
# test the implementation
test_cases = [  MLP( n_features , [2 , 2 , n_classes ]),
                MLP( n_features , [3 , 3 , n_classes ]),
                MLP( n_features , [5 , 5 , n_classes ]),
                MLP( n_features , [30 , 30 , n_classes ])]

for test_nn in test_cases:
    print(50*"-")
    test_nn.train(X_train , Y_train , n_epochs , batch_size , learning_rate)
    predicted_posteriors = test_nn.forward( X_test )
    print(f"Predicted Posteriors: {predicted_posteriors}")
    predicted_classes = [ np.argmax(sample) for sample in predicted_posteriors]
    error_rate = 1 - (len(np.where(Y_test == predicted_classes)) / len(Y_test))
    print (" error rate :", error_rate )
# the test results are bad because of the wrong implementation of the backpropagation in the outputlayer.

--------------------------------------------------
 Epoch  0
 Epoch  1
Predicted Posteriors: [[nan nan]
 [nan nan]
 [nan nan]
 ...
 [nan nan]
 [nan nan]
 [nan nan]]
 error rate : 0.9995
--------------------------------------------------
 Epoch  0
 Epoch  1
Predicted Posteriors: [[ 0. nan]
 [ 0. nan]
 [ 0. nan]
 ...
 [ 0. nan]
 [ 0. nan]
 [ 0. nan]]
 error rate : 0.9995
--------------------------------------------------
 Epoch  0
 Epoch  1
Predicted Posteriors: [[ 0. nan]
 [ 0. nan]
 [ 0. nan]
 ...
 [ 0. nan]
 [ 0. nan]
 [ 0. nan]]
 error rate : 0.9995
--------------------------------------------------
 Epoch  0
 Epoch  1


  softmax = np.exp(input) / np.sum(np.exp(input))
  softmax = np.exp(input) / np.sum(np.exp(input))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_loss.append(-1/predicted_posteriors[row][class_idx])


Predicted Posteriors: [[ 0. nan]
 [ 0. nan]
 [ 0. nan]
 ...
 [ 0. nan]
 [ 0. nan]
 [ 0. nan]]
 error rate : 0.9995
