In [2]:
import numpy as np

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [3]:
class Module(object):
    """
    Abstract base class for every layer / container of the network.

    Basically, you can think of a module as a black box which can process
    `input` data and produce `output` data. This is like applying a function
    which is called `forward`:

        output = module.forward(input)

    The module should be able to perform a backward pass: to differentiate
    the `forward` function. Moreover, it should be able to differentiate it
    when it is a part of a chain (chain rule). The latter implies there is a
    gradient coming from the previous step of the chain rule:

        gradInput = module.backward(input, gradOutput)
    """

    def __init__(self):
        self.output = None
        self.gradInput = None
        # Mode flag, switched by `train()` / `evaluate()`.
        self.training = True

    def forward(self, input):
        """
        Takes an input object, and computes the corresponding output of the module.
        """
        return self.updateOutput(input)

    def backward(self, input, gradOutput):
        """
        Performs a backpropagation step through the module, with respect to the given input.

        This includes
         - computing a gradient w.r.t. `input` (is needed for further backprop),
         - computing a gradient w.r.t. parameters (to update parameters while optimizing).
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Computes the output using the current parameter set of the class and input.
        This function returns the result which is stored in the `output` field.

        Make sure to both store the data in `output` field and return it.
        """

        # The easiest case:

        # self.output = input
        # return self.output

        pass

    def updateGradInput(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own input.
        This is returned in `gradInput`. Also, the `gradInput` state variable is updated accordingly.

        The shape of `gradInput` is always the same as the shape of `input`.

        Make sure to both store the gradients in `gradInput` field and return it.
        """

        # The easiest case:

        # self.gradInput = gradOutput
        # return self.gradInput

        pass

    def accGradParameters(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own parameters.
        No need to override if module has no parameters (e.g. ReLU).
        """
        pass

    def zeroGradParameters(self):
        """
        Zeroes `gradParams` variable if the module has params.
        """
        pass

    def getParameters(self):
        """
        Returns a list with its parameters.
        If the module does not have parameters return empty list.
        """
        return []

    def getGradParameters(self):
        """
        Returns a list with gradients with respect to its parameters.
        If the module does not have parameters return empty list.
        """
        return []

    def train(self):
        """
        Sets training mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = True

    def training(self):
        """
        Legacy spelling of `train()`, kept only for backward compatibility.

        `__init__` stores a boolean instance attribute with this very name,
        which shadows the method on every initialised instance, so
        `module.training()` raises TypeError. Call `train()` instead.
        """
        self.train()

    def evaluate(self):
        """
        Sets evaluation mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every module if you want
        to have a readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [4]:
class Sequential(Module):
    """
    This class implements a container, which processes `input` data sequentially.

    `input` is processed by each module (layer) in self.modules consecutively.
    The resulting array is called `output`.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Adds a module to the end of the container.
        """
        self.modules.append(module)

    def pop(self):
        """
        Removes the last module from the container and returns it.
        """
        return self.modules.pop()

    def updateOutput(self, input):
        """
        Basic workflow of FORWARD PASS:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        An empty container acts as the identity.
        """
        intermediate_output = input
        for module in self.modules:
            intermediate_output = module.forward(intermediate_output)
        self.output = intermediate_output
        return self.output

    def backward(self, input, gradOutput):
        """
        Workflow of BACKWARD PASS:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module receives the input it saw during the forward pass: for the
        `i`-th module that is the stored `output` of `module[i-1]` (and the
        container's own `input` only for `module[0]`). `forward` must therefore
        have been called before `backward`.

        An empty container passes `gradOutput` through unchanged, mirroring
        the identity behaviour of the forward pass.
        """
        if not self.modules:
            self.gradInput = gradOutput
            return self.gradInput

        intermediate_grad = gradOutput
        # Walk from the last module down to module[1]; module[0] is handled
        # separately because its forward input is `input`, not a stored output.
        for idx in range(len(self.modules) - 1, 0, -1):
            layer_input = self.modules[idx - 1].output
            intermediate_grad = self.modules[idx].backward(layer_input, intermediate_grad)

        self.gradInput = self.modules[0].backward(input, intermediate_grad)
        return self.gradInput

    def zeroGradParameters(self):
        """
        Zeroes the accumulated parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Returns one entry per contained module: that module's own
        `getParameters()` result (so the outer list is nested per module).
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Returns one entry per contained module: that module's own
        `getGradParameters()` result (nested the same way as `getParameters`).
        """
        return [x.getGradParameters() for x in self.modules]

    def train(self):
        """
        Switches the container and every contained module to training mode.
        """
        self.training = True
        for module in self.modules:
            child_train = getattr(module, "train", None)
            if callable(child_train):
                child_train()
            else:
                # Child offers no `train()` method: set the flag directly.
                module.training = True

    def evaluate(self):
        """
        Switches the container and every contained module to evaluation mode.

        Without this propagation only the container's own flag would change
        and mode-dependent children would keep their training behaviour.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self, x):
        return self.modules[x]

# Layers

- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [49]:
class Linear(Module):
    """
    A module which applies a linear transformation: output = input . W^T + b.
    A common name is fully-connected layer, InnerProductLayer in caffe.

    self.W : n_out x n_in   (i.e. n_feats2 x n_feats1)
    self.b : n_out
    The module should work with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # This is a nice initialization
        stdv = 1./np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size = (n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size = n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

        # Not read anywhere inside this class; kept because external code may use it.
        self.lr = 1e-4

    def updateOutput(self, input):
        """
        Computes `input . W^T + b`; `b` is broadcast over the rows of the product.
        """
        self.output = np.add(input.dot(self.W.T), self.b)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input: `gradOutput . W`.
        """
        self.gradInput = gradOutput.dot(self.W)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulates the parameter gradients into `gradW` / `gradb`.

        The gradients are added in place, so
          - call `zeroGradParameters()` before each fresh backward pass, and
          - arrays previously returned by `getGradParameters()` stay valid,
            because the attributes are never rebound to new arrays.
        """
        self.gradW += gradOutput.T.dot(input)
        self.gradb += gradOutput.sum(axis=0)

    def zeroGradParameters(self):
        """
        Resets the accumulated gradients to zero in place.
        """
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        s = self.W.shape
        q = 'Linear %d -> %d' %(s[1],s[0])
        return q

This one is probably the hardest but, like the others, it takes only about 5 lines of code in total.
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [73]:
class SoftMax(Module):
    """
    Row-wise softmax over a 2D input (axis 1 is normalised).
    """
    def __init__(self):
        super(SoftMax, self).__init__()

    def _stable_softmax(self, input):
        """
        Row-wise softmax of a 2D array; the row maximum is subtracted first
        for numerical stability (this does not change the result).
        """
        shifted_exp = np.exp(np.subtract(input, input.max(axis=1, keepdims=True)))
        return np.divide(shifted_exp, np.sum(shifted_exp, axis=1, keepdims=True))

    def softmax_func_for_grad(self, y):
        """
        Normalises `y` by the sum of all its elements.
        """
        return np.array(np.divide(y, np.sum(y, keepdims=True)))

    def jacobian_one_vector(self, x):
        """
        Softmax Jacobian for a single 1D vector `x` of un-normalised
        (already exponentiated) weights.

        With s = x / sum(x) the result is the matrix whose diagonal is
        s_i * (1 - s_i) and whose off-diagonal entries are -s_i * s_j.
        Not used by `updateGradInput`; kept as a public helper.
        """
        s = self.softmax_func_for_grad(x)
        jacobian = -np.outer(s, s)
        np.fill_diagonal(jacobian, np.multiply(s, 1.0 - s))
        return jacobian

    def updateOutput(self, input):
        self.output = self._stable_softmax(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input, computed in closed form.

        For every row, gradOutput . J with J = diag(s) - s s^T simplifies to
            s * gradOutput - s * sum(s * gradOutput)
        so the per-sample Jacobian matrices never have to be materialised.
        The softmax is recomputed from `input` rather than read from
        `self.output`, so the method does not depend on a preceding forward call.
        """
        probs = self._stable_softmax(input)
        weighted = np.multiply(gradOutput, probs)
        self.gradInput = weighted - np.multiply(probs, weighted.sum(axis=1, keepdims=True))
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

In [75]:
# x = np.array([[1, 2], [30, 40]]).astype(float)
# y = np.array([[100, 200], [3000, 4000]])
# a = SoftMax()
# b = SoftMaxS()
# print(a.forward(x))
# print(b.forward(x))

# print(a.updateGradInput(x,y))
# print(b.updateGradInput(x,y))

[[  2.68941421e-01   7.31058579e-01]
 [  4.53978687e-05   9.99954602e-01]]
[[  2.68941421e-01   7.31058579e-01]
 [  4.53978687e-05   9.99954602e-01]]
[[-19.66119332  19.66119332]
 [ -0.04539581   0.04539581]]
[[-19.66119332  19.66119332]
 [ -0.04539581   0.04539581]]


5.8466440680388885e-09

Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(p)$ mask. 

This is a very cool regularizer. In fact, when you see your net is overfitting try to add more dropout.

While training (`self.training == True`), it should sample a new mask on each iteration (for every batch). When testing, this module should implement the identity transform, i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [84]:
class Dropout(Module):
    """
    Dropout regulariser: multiplies the input by a freshly sampled
    Bernoulli(p) mask while training and is the identity while evaluating.

    p       : probability of KEEPING a unit (the mask is 1 with probability p).
    rescale : when True, the training mask is divided by `p` ("inverted
              dropout") so that the expected training output equals the
              evaluation output. The default (False) keeps the plain
              Bernoulli mask, whose training output has expectation p * input.
    """
    def __init__(self, p=0.5, rescale=False):
        super(Dropout, self).__init__()
        if rescale and not p > 0:
            raise ValueError("rescale=True requires p > 0, got %r" % (p,))
        self.p = p
        self.rescale = rescale
        self.mask = None

    def updateOutput(self, input):
        if self.training:
            # A new mask for every call, i.e. for every batch.
            self.mask = np.random.binomial(1, self.p, input.shape)
            if self.rescale:
                self.mask = self.mask / float(self.p)
        else:
            # All-ones mask => identity in both forward and backward pass.
            self.mask = np.ones_like(input)

        self.output = np.multiply(self.mask, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Applies the mask stored by the last forward call to the incoming gradient.
        """
        self.gradInput = np.multiply(gradOutput, self.mask)
        return self.gradInput

    def __repr__(self):
        return "Dropout"

# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**): 

In [78]:
class ReLU(Module):
    """
    Rectified Linear Unit: element-wise max(input, 0).
    """
    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """
        Stores and returns the element-wise maximum of `input` and 0.
        """
        rectified = np.maximum(input, 0)
        self.output = rectified
        return rectified

    def updateGradInput(self, input, gradOutput):
        """
        Lets the gradient through only where the input was strictly positive.
        """
        is_active = input > 0
        self.gradInput = np.multiply(gradOutput, is_active)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope. 

In [9]:
class LeakyReLU(Module):
    """
    Leaky Rectified Linear Unit: `input` where it is positive,
    `slope * input` elsewhere.
    """
    def __init__(self, slope = 0.03):
        super(LeakyReLU, self).__init__()

        self.slope = slope

    def updateOutput(self, input):
        """
        Element-wise leaky rectification.

        The branch is chosen with `np.where` instead of multiplying by boolean
        masks: a mask product evaluates inf * 0 for infinite inputs and turns
        them into NaN, whereas selection keeps them as +/-inf.
        """
        self.output = np.where(input > 0, input, np.multiply(input, self.slope))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input: `gradOutput` where input > 0,
        `slope * gradOutput` where input < 0, and 0 where input == 0.
        """
        grad_positive = np.multiply(gradOutput, input > 0)
        grad_negative = np.multiply(np.multiply(gradOutput, input < 0), self.slope)
        self.gradInput = np.add(grad_positive, grad_negative)
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

# Criterions

Criterions are used to score the model's answers. 

In [10]:
class Criterion(object):
    """
    Base class for loss functions.

    Subclasses put their logic into `updateOutput` (loss value) and
    `updateGradInput` (gradient of the loss w.r.t. `input`); `forward` and
    `backward` are thin entry points that dispatch to them.
    """

    def __init__(self):
        self.gradInput = None
        self.output = None

    def forward(self, input, target):
        """
        Given an input and a target, compute the loss function
        associated to the criterion and return the result.

        For consistency this function should not be overridden,
        all the code goes in `updateOutput`.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
        Given an input and a target, compute the gradients of the loss function
        associated to the criterion and return the result.

        For consistency this function should not be overridden,
        all the code goes in `updateGradInput`.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Function to override. The base version just returns the stored `output`.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version just returns the stored `gradInput`.
        """
        return self.gradInput

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every criterion if you want
        to have a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is basic L2 norm usually used for regression, is implemented here for you.

In [11]:
class MSECriterion(Criterion):
    """
    Squared-error loss: sum of squared differences divided by the size of
    the first axis of `input`.
    """
    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Stores and returns sum((input - target)^2) / input.shape[0].
        """
        residual = input - target
        n_rows = input.shape[0]
        self.output = np.sum(np.power(residual, 2)) / n_rows
        return self.output

    def updateGradInput(self, input, target):
        """
        Stores and returns the gradient 2 * (input - target) / input.shape[0].
        """
        residual = input - target
        n_rows = input.shape[0]
        self.gradInput = residual * 2 / n_rows
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula, 
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note that criterions are the only places where you divide by the batch size. 

In [12]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood (multiclass log loss).

    The loss is -sum(target * log(input)) / target.shape[0]; `input` is
    clamped to [EPS, 1 - EPS] before the log / the division.
    """

    # Clamp bound that keeps log() and the division away from 0.
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def _clamp(self, input):
        """Clips `input` to [EPS, 1 - EPS] to avoid numerical errors."""
        return np.clip(input, self.EPS, 1 - self.EPS)

    def updateOutput(self, input, target):
        """
        Stores and returns the loss averaged over the first axis of `target`.
        """
        log_likelihood = np.multiply(np.log(self._clamp(input)), target)
        per_sample = np.sum(log_likelihood, axis=1)
        self.output = -np.sum(per_sample) / target.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """
        Stores and returns the gradient -target / input / target.shape[0],
        using the same clamped `input` as `updateOutput`.
        """
        self.gradInput = -(np.divide(target, self._clamp(input)) / target.shape[0])
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"