In [1]:
import numpy as np
import math

In [2]:
class Activation(object):
    """
    Interface for activation functions.

    Subclasses override ``forward`` and ``derivative``. Calling an instance
    is shorthand for ``forward``. ``self.state`` is a slot in which a
    subclass may cache whatever its ``derivative`` needs from the last
    forward pass; it starts out as ``None``.
    """

    def __init__(self):
        self.state = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """Apply the activation to ``x``. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError instead.
        raise NotImplementedError

    def derivative(self):
        """Return the derivative for the last forward pass. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

In [4]:
class Identity(Activation):
    """Pass-through activation: the output is the input itself."""

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        """Remember ``x`` in ``self.state`` and hand it back unchanged."""
        self.state = x
        return self.state

    def derivative(self):
        """Return the constant slope of the identity map, ``1.0``."""
        slope = 1.0
        return slope

In [5]:
class Sigmoid(Activation):
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``."""

    def __init__(self):
        super(Sigmoid, self).__init__()

    def forward(self, x):
        """Compute the sigmoid of ``x`` and cache the result in ``self.state``."""
        decay = np.exp(-x)
        activated = 1 / (1 + decay)
        self.state = activated
        return activated

    def derivative(self):
        """Return ``s * (1 - s)`` where ``s`` is the cached forward output."""
        s = self.state
        return s * (1 - s)

In [7]:
class Tanh(Activation):
    """Hyperbolic tangent activation."""

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """Compute ``tanh(x)`` and cache the result in ``self.state``."""
        activated = np.tanh(x)
        self.state = activated
        return activated

    def derivative(self):
        """Return ``1 - t**2`` where ``t`` is the cached forward output."""
        t = self.state
        return 1 - t**2

In [8]:
class ReLU(Activation):
    """Rectified linear unit, ``max(0, x)`` element-wise."""

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        """Clamp negative entries of ``x`` to zero and cache the result."""
        rectified = np.maximum(0, x)
        self.state = rectified
        return rectified

    def derivative(self):
        """Return 1 where the cached output is positive and 0 elsewhere."""
        is_active = self.state > 0
        return np.where(is_active, 1, 0)

In [1]:
class Criterion(object):
    """
    Interface for loss functions.

    Subclasses override ``forward`` and ``derivative``. Calling an instance
    is shorthand for ``forward``. The ``logits``, ``labels`` and ``loss``
    attributes are slots for a subclass to fill in; all start as ``None``.
    """

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        return self.forward(x, y)

    def forward(self, x, y):
        """Compute the loss for inputs ``x`` against targets ``y``. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError instead.
        raise NotImplementedError

    def derivative(self):
        """Return the loss gradient for the last forward pass. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

In [8]:
class SoftmaxCrossEntropy():
    """
    Softmax followed by cross-entropy, computed per row.

    The class is declared without a base class, so the callable interface
    (``criterion(x, y)``) and the cached attributes are provided here
    directly.

    Attributes set by ``forward``:
        logits: the ``x`` passed in.
        labels: the ``y`` passed in.
        elogits: ``exp(x - rowmax(x))``.
        sm: row-wise softmax of ``x``.
        loss: the per-row loss that ``forward`` returned.
    """

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        return self.forward(x, y)

    def forward(self, x, y):
        """
        Argument:
            x (np.array): (batch size, classes) raw scores
            y (np.array): (batch size, classes) targets; the formula below
                equals cross-entropy when every row of ``y`` sums to 1
        Return:
            out (np.array): (batch size,) loss per row
        """
        self.logits = x
        self.labels = y

        # Subtract the row maximum before exponentiating so that large
        # logits cannot overflow; the shift is added back in log space.
        row_max = np.max(self.logits, axis=1).reshape(-1, 1)
        shifted = self.logits - row_max
        self.elogits = np.exp(shifted)
        esum = self.elogits.sum(axis=1).reshape(-1, 1)
        self.sm = self.elogits / esum

        # loss = logsumexp(x) - sum(x * y)
        picked = (self.logits * self.labels).sum(axis=1)
        log_norm = (row_max + np.log(esum)).reshape(-1)
        self.loss = log_norm - picked
        return self.loss

    def derivative(self):
        """Return ``softmax(x) - y`` for the last ``forward`` call."""
        return self.sm - self.labels

       
     

In [1]:
class Linear():
    """Fully connected layer computing ``x @ W + b``."""

    def __init__(self, in_feature, out_feature, weight_init_fn, bias_init_fn):

        """
        Argument:
            in_feature (int): passed to ``weight_init_fn``
            out_feature (int): passed to both initialisers
            weight_init_fn: called as ``weight_init_fn(in_feature, out_feature)``
            bias_init_fn: called as ``bias_init_fn(out_feature)``

        Attributes:
            W (np.array): (in feature, out feature)
            dW (np.array): (in feature, out feature)
            momentum_W (np.array): (in feature, out feature)

            b (np.array): (1, out feature)
            db (np.array): (1, out feature)
            momentum_b (np.array): (1, out feature)
        """

        self.W = weight_init_fn(in_feature, out_feature)
        self.b = bias_init_fn(out_feature)

        # Gradient and momentum buffers mirror the parameter shapes.
        self.dW = np.zeros(self.W.shape)
        self.db = np.zeros(self.b.shape)

        self.momentum_W = np.zeros(self.W.shape)
        self.momentum_b = np.zeros(self.b.shape)

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, out feature)
        """
        # Keep the input: backward() needs it to form dW.
        self.x = x
        return np.matmul(self.x, self.W) + self.b

    def backward(self, delta):

        """
        Stores the batch-averaged parameter gradients in ``dW`` and ``db``
        and returns the gradient with respect to the layer input.

        Argument:
            delta (np.array): (batch size, out feature)
        Return:
            out (np.array): (batch size, in feature)
        """
        batch_size = delta.shape[0]
        self.dW = np.dot(self.x.T, delta) / batch_size
        self.db = np.sum(delta, axis=0, keepdims=True) / batch_size
        return np.dot(delta, self.W.T)

In [2]:
class MLP(object):

    """
    A simple multilayer perceptron

    Stacks ``len(hiddens) + 1`` linear layers, each followed by the
    activation at the same index in ``activations``. Layers are expected
    to expose ``W``, ``b``, ``dW`` and ``db`` attributes.
    """

    def __init__(self, input_size, output_size, hiddens, activations, weight_init_fn,
                 bias_init_fn, criterion, lr):
        """
        Argument:
            input_size (int): width of the network input
            output_size (int): width of the network output
            hiddens (sequence of int): widths of the hidden layers
            activations (list): one activation object per linear layer
            weight_init_fn, bias_init_fn: forwarded to every linear layer
            criterion: callable loss object that also provides ``derivative()``
            lr (float): step size used by ``step``
        """

        # Don't change this -->
        self.train_mode = True
        self.nlayers = len(hiddens) + 1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.criterion = criterion
        self.lr = lr
        # <---------------------

        # Result of the most recent forward pass; read by backward(),
        # error() and total_loss().
        self.output = None

        # Consecutive pairs of widths give each layer's (in, out) sizes.
        # list(...) lets hiddens be any sequence, not only a list.
        widths = [self.input_size] + list(hiddens) + [self.output_size]
        self.linear_layers = [
            Linear(inf, outf, weight_init_fn, bias_init_fn)
            for inf, outf in zip(widths[:-1], widths[1:])
        ]

    def forward(self, x):
        """Run ``x`` through every layer/activation pair and cache the result."""
        for i, layer in enumerate(self.linear_layers):
            x = self.activations[i](layer(x))

        self.output = x
        return x

    def zero_grads(self):
        """Reset ``dW`` and ``db`` of every linear layer to zero, in place."""
        for layer in self.linear_layers:
            layer.dW.fill(0.0)
            layer.db.fill(0.0)

    def step(self):
        """Apply one plain gradient-descent update to every linear layer."""
        for layer in self.linear_layers:
            layer.W = layer.W - self.lr * layer.dW
            layer.b = layer.b - self.lr * layer.db

    def backward(self, labels):
        """
        Backpropagate the criterion's gradient through all layers.

        Evaluates the criterion on the cached forward output so that its
        ``derivative()`` is available, then walks the layers last to first,
        scaling by each activation's derivative before the linear backward.
        Requires a prior call to ``forward``.
        """
        self.criterion(self.output, labels)
        delta = self.criterion.derivative()

        for i in range(self.nlayers - 1, -1, -1):
            delta = delta * self.activations[i].derivative()
            delta = self.linear_layers[i].backward(delta)

    def error(self, labels):
        """Count rows whose arg-max prediction differs from the arg-max label."""
        predicted = np.argmax(self.output, axis=1)
        expected = np.argmax(labels, axis=1)
        return (predicted != expected).sum()

    def total_loss(self, labels):
        """Sum the criterion over the cached forward output."""
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        return self.forward(x)

    def train(self):
        self.train_mode = True

    def eval(self):
        self.train_mode = False
