### History

This was an attempt to build an LSTM using the `ParamOperation` material from the first three chapters of `lincoln`.

# Notebook

In [1]:
# Add Lincoln to system path
import sys
sys.path.append("/Users/seth/development/lincoln/")

In [2]:
# imports
from torch import Tensor
import torch

from typing import Tuple, Dict, List

In [395]:
# Operation with multiple inputs, gradients
class Operation(object):
    '''
    Base class for a differentiable operation with one or more inputs.

    Subclasses implement `_outputs` (reads `self.inputs`) and `_input_grads`
    (receives the tuple of output gradients). `forward` caches inputs and
    outputs; `backward` checks shapes with `assert_same_shapes` (defined in a
    later cell of this notebook) and returns the input gradients.
    '''

    def __init__(self):
        pass

    def forward(self, *inputs) -> Tuple[Tensor]:
        '''
        Store the inputs, compute the outputs and return them.

        A single input is stored bare (so single-input subclasses such as
        ReLU can use `self.inputs` directly as a Tensor); several inputs are
        stored as the tuple they were passed in.
        '''
        # Previously nothing was stored in the multi-input case, leaving
        # `self.inputs` missing (or stale from an earlier call).
        self.inputs = inputs[0] if len(inputs) == 1 else inputs

        self.outputs = self._outputs()

        return self.outputs

    def backward(self, *output_grads) -> Tuple[Tensor]:
        '''
        Propagate `output_grads` back through the operation.

        Asserts that the gradients match the cached outputs in shape, computes
        the input gradients, asserts that those match the cached inputs in
        shape, and returns them.
        '''
        assert_same_shapes(self.outputs, output_grads)

        self._compute_grads(output_grads)

        assert_same_shapes(self.inputs, self.input_grads)
        return self.input_grads

    def _compute_grads(self, output_grads: Tuple[Tensor]) -> Tensor:
        '''
        Compute and cache `self.input_grads` from the tuple of output gradients.
        '''
        self.input_grads = self._input_grads(output_grads)

        return self.input_grads

    def _outputs(self) -> Tuple[Tensor]:
        '''Compute the outputs from `self.inputs`; must be overridden.'''
        raise NotImplementedError()

    def _input_grads(self, *output_grads) -> Tuple[Tensor]:
        '''Compute the input gradients; must be overridden.'''
        raise NotImplementedError()

In [396]:
# Testing old Operation definition with new, "multiple input" definition
class ReLU(Operation):
    '''
    Rectified linear unit: passes positive values through and zeroes the rest.
    '''
    def __init__(self):
        super().__init__()

    def _outputs(self) -> Tensor:
        '''
        Apply max(x, 0) element-wise to the stored input and cache the result.
        '''
        # Only a lower bound: the former upper bound of 1e5 silently capped
        # large activations while the gradient below still treated them as
        # un-clipped.
        self.output = torch.clamp(self.inputs, min=0)
        return self.output

    def _input_grads(self, output_grad: Tensor) -> Tensor:
        '''
        Gradient w.r.t. the input: the upstream gradient where the output was
        positive, zero elsewhere.
        '''
        # Operation.backward forwards its *output_grads as a tuple; unwrap a
        # lone gradient so the element-wise product below gets a Tensor.
        if isinstance(output_grad, (tuple, list)) and len(output_grad) == 1:
            output_grad = output_grad[0]
        relu_backward = (self.output > 0).type(self.output.dtype)
        return relu_backward * output_grad

    def __repr__(self):
        return "ReLU"

In [398]:
m1 = torch.rand(2,2)
r = ReLU()
r.forward(m1)
r.backward(torch.ones_like(m1))

tensor([[1., 1.],
        [1., 1.]])

In [4]:
# Multiply class
class Multiply2(Operation):
    '''
    Element-wise product of exactly two inputs.
    '''

    def __init__(self):
        pass

    def _outputs(self) -> Tensor:
        '''
        Element-wise multiplication
        '''
        assert len(self.inputs) == 2

        return self.inputs[0] * self.inputs[1]

    def _input_grads(self, output_grads: Tuple[Tensor]) -> Tuple[Tensor]:
        '''
        Product rule: each input's gradient is the other input times the
        (single) upstream gradient.
        '''
        return self.inputs[1] * output_grads[0],\
               self.inputs[0] * output_grads[0]


# Other cells in this notebook instantiate this operation as `Multiply()`;
# keep that name available so they run on a fresh kernel.
Multiply = Multiply2
     

In [5]:
# assert_same_shapes function
def assert_same_shapes(tensors: Tuple[Tensor],
                       tensor_grads: Tuple[Tensor]):
    '''
    Assert that two collections of tensors match pairwise in shape.

    Either argument may be a bare Tensor, which is treated as a collection
    holding just that tensor. Raises AssertionError if the collections differ
    in length or if any pair differs in shape. Returns None.
    '''
    # A bare Tensor would otherwise be measured by its first dimension and
    # compared row by row, which hides mismatches in that dimension.
    if isinstance(tensors, Tensor):
        tensors = (tensors,)
    if isinstance(tensor_grads, Tensor):
        tensor_grads = (tensor_grads,)

    assert len(tensors) == len(tensor_grads)

    # Check every pair (the early `return` inside the loop used to stop after
    # the first one). Message arguments now follow the parameter order.
    for tensor, tensor_grad in zip(tensors, tensor_grads):
        assert tensor.shape == tensor_grad.shape, \
        '''
        Two tensors should have the same shape; instead, first Tensor's shape is {0}
        and second Tensor's shape is {1}.
        '''.format(tuple(tensor.shape), tuple(tensor_grad.shape))

    return None

In [6]:
# define inputs to Multiply Operation
m = Multiply2()
# fixed seed so the two random operands are reproducible
torch.manual_seed(102218)
m1, m2 = torch.rand(2,2), torch.rand(2,2)

In [7]:
# test Multiply (Operation with multiple inputs) forward
m.forward(m1, m2) # tuple of length 1

tensor([[0.8841, 0.2608],
        [0.1206, 0.4546]])

In [8]:
# test Multiply (Operation with multiple inputs) backward
m_grad = torch.ones_like(m1)
m.backward(m_grad)

(tensor([[0.9344, 0.4609],
         [0.4948, 0.5119]]), tensor([[0.9462, 0.5658],
         [0.2438, 0.8881]]))

## PyTorch `Multiply`

### Without the class

In [9]:
# m1 initially
print(m1)
print(m1.requires_grad)

tensor([[0.9462, 0.5658],
        [0.2438, 0.8881]])
False


In [10]:
# detach to copy values
m1_g = m1.detach()
m2_g = m2.detach()

In [11]:
# require gradients
m1_g.requires_grad = True
m2_g.requires_grad = True

In [12]:
# perform operation
out = m1_g * m2_g

In [13]:
# define grad
mul_grad = torch.ones_like(out)

In [14]:
# send gradient backward
out.backward(gradient=mul_grad)

In [15]:
# same gradients as above!
print(m1_g.grad)
print(m2_g.grad)

tensor([[0.9344, 0.4609],
        [0.4948, 0.5119]])
tensor([[0.9462, 0.5658],
        [0.2438, 0.8881]])


### `PyTorchOperation` base class

In [16]:
# PyTorch Operation class
class PyTorchOperation(Operation):
    '''
    Operation whose input gradients are obtained from PyTorch autograd.

    `forward` stores the inputs as a tuple, builds detached copies that
    require gradients (`self.inputs_with_grad`) and computes the outputs from
    them; subclasses implement `_outputs` in terms of `self.inputs_with_grad`
    and return a tuple of tensors.
    '''

    def __init__(self) -> None:
        super().__init__()


    def forward(self, *inputs) -> Tuple[Tensor]:
        '''
        Cache the inputs, create their autograd-tracked counterparts and
        return the outputs computed by `_outputs`.
        '''
        self.inputs = inputs

        self.inputs_with_grad = self._inputs_autograd()

        self.outputs = self._outputs()

        return self.outputs


    def _outputs(self) -> Tuple[Tensor]:
        '''Compute the outputs from `self.inputs_with_grad`; must be overridden.'''
        raise NotImplementedError


    def _inputs_autograd(self) -> Tuple[Tensor]:
        '''
        Return detached versions of the inputs with `requires_grad` switched
        on, so gradients are recorded on them rather than on the caller's
        tensors.
        '''
        inputs_with_grad = tuple(inp.detach() for inp in self.inputs)
        for inp in inputs_with_grad:
            inp.requires_grad = True
        return inputs_with_grad


    def _input_grads(self, output_grads: Tensor) -> Tensor:
        '''
        Back-propagate each output gradient through its output and return the
        gradients accumulated on `self.inputs_with_grad`, as a tuple.
        '''
        # Outputs may share parts of one graph; retain it so that every
        # output after the first can still be back-propagated.
        for out, grad in zip(self.outputs, output_grads):
            out.backward(gradient=grad, retain_graph=True)

        return tuple(inp.grad for inp in self.inputs_with_grad)

In [17]:
# Multiply_PyTorch class
class Multiply_PyTorch(PyTorchOperation):
    '''
    Element-wise product of two inputs, differentiated by autograd.
    '''

    def __init__(self):
        super().__init__()

    def _outputs(self) -> Tuple[Tensor]:
        '''
        Multiply the two autograd-tracked inputs; the result is returned as a
        one-element tuple.
        '''
        assert len(self.inputs) == 2

        left = self.inputs_with_grad[0]
        right = self.inputs_with_grad[1]
        product = left * right

        return (product,)

In [18]:
# Initialize Multiply stuff
m = Multiply_PyTorch()
mp1, mp2 = torch.rand(2,2), torch.rand(2,2)

In [19]:
# Print initial tensors
print(mp1, mp2)

tensor([[0.3988, 0.1799],
        [0.6886, 0.0705]]) tensor([[0.3258, 0.4976],
        [0.6935, 0.2254]])


In [20]:
# Forward
m.forward(mp1, mp2)

(tensor([[0.1299, 0.0895],
         [0.4775, 0.0159]], grad_fn=<ThMulBackward>),)

In [21]:
# Initialize grad and backward
m.backward(torch.ones_like(mp1))

(tensor([[0.3258, 0.4976],
         [0.6935, 0.2254]]), tensor([[0.3988, 0.1799],
         [0.6886, 0.0705]]))

## More complicated example

* Divide two inputs into:
    * `A`, `B`
    * `A` gets copied into `A1` and `A2`
    * `A1` and `A2` each get multiplied, bias added, activation, to create `C1` and `C2`.
    * These get added together to create `D`.
    * `B` and `D` get multiplied to create `E`. 
    
Then the question is: what is the gradient of `E` with respect to `A`.

### Without class

In [22]:
from lincoln.operations.operations import WeightMultiply
from lincoln.operations.operations import BiasAdd

#### Define base operations

In [111]:
# Copy class - 1 to many
class Copy(Operation):
    '''
    Fan-out operation: hands the same input to `num` downstream consumers.
    '''

    def __init__(self, num=2):
        self.num = num

    def _outputs(self) -> Tuple[Tensor]:
        '''
        Return a tuple that repeats the input `num` times.
        '''
        # `self.inputs` is a bare Tensor when Operation.forward unwraps a lone
        # input, and a tuple otherwise; tuple + Tensor would fail, so
        # normalise to a tuple before repeating.
        if isinstance(self.inputs, Tensor):
            sources = (self.inputs,)
        else:
            sources = tuple(self.inputs)

        return sources * self.num

    def _input_grads(self, output_grads: Tuple[Tensor]) -> Tensor:
        '''
        The input feeds every copy, so its gradient is the sum of the
        gradients flowing back from all copies.
        '''
        input_grad = torch.zeros_like(output_grads[0])
        for grad in output_grads:
             input_grad = input_grad + grad
        return input_grad

In [112]:
# Add2 Operation
class Add2(Operation):
    '''
    Sum of exactly two inputs.
    '''

    def __init__(self):
        pass

    def _outputs(self) -> Tuple[Tensor]:
        '''
        Element-wise addition
        '''
        assert len(self.inputs) == 2

        first, second = self.inputs[0], self.inputs[1]
        return first + second

    def _input_grads(self, output_grads: Tuple[Tensor]) -> Tuple[Tensor]:
        '''
        Addition passes the upstream gradient unchanged to both inputs.
        '''
        upstream = output_grads[0]
        return upstream, upstream
     

In [113]:
# Concat operation - two to one
class Concat2(Operation):
    '''
    Joins two inputs side by side along dim 1 and splits the gradient back
    into the two original widths on the way back.
    '''

    def __init__(self):
        pass

    def _outputs(self) -> Tensor:
        '''
        Concatenate the two inputs along dim 1
        '''
        assert len(self.inputs) == 2

        # remember each input's size along dim 1 for the backward split
        widths = []
        for inp in self.inputs:
            widths.append(inp.shape[1])
        self.input_shapes = widths

        return torch.cat(list(self.inputs), dim=1)

    def _input_grads(self, output_grads: Tuple[Tensor]) -> Tuple[Tensor]:
        '''
        Cut the (single) upstream gradient into pieces matching the inputs.
        '''
        upstream = output_grads[0]
        return torch.split(upstream, self.input_shapes, dim=1)

In [114]:
# printing "a" and "b" tensors
# define the two small example tensors explicitly so this cell (and the
# Concat2 demo after it) runs on a fresh kernel; values match the recorded output
a = torch.tensor([[1., 2.],
                  [3., 4.]])
b = torch.tensor([[5., 6.],
                  [7., 8.]])
print(a)
print(b)

tensor([[1., 2.],
        [3., 4.]])
tensor([[5., 6.],
        [7., 8.]])


In [116]:
c = Concat2()
out = c.forward(a,b)
print(out)
c.backward(torch.ones_like(out))

tensor([[1., 2., 5., 6.],
        [3., 4., 7., 8.]])


(tensor([[1., 1.],
         [1., 1.]]), tensor([[1., 1.],
         [1., 1.]]))

**TODO**: incorporate these into Lincoln somehow

In [117]:
# initialize tensors
torch.manual_seed(102218)
a, b = torch.rand(2,2), torch.rand(2,2)
w1, w2 = torch.rand(2,2), torch.rand(2,2)
b1, b2 = torch.rand(1,2), torch.rand(1,2)

In [118]:
# set requires_grad = True for all tensors
for t in [a, b, w1, w2, b1, b2]:
    t.requires_grad = True

In [119]:
# define operations
c = Copy()
add2 = Add2()
# the element-wise multiply class defined in this notebook is `Multiply2`
mul = Multiply2()
# one weight-multiply / bias-add pair per branch of the copied input
wm1 = WeightMultiply(w1)
ba1 = BiasAdd(b1)
wm2 = WeightMultiply(w2)
ba2 = BiasAdd(b2)

In [120]:
# define forward path
a1, a2 = c.forward(a)
c1 = ba1.forward(wm1.forward(a1))
c2 = ba2.forward(wm2.forward(a2))
d = add2.forward(c1, c2)
e = mul.forward(b, d)

In [121]:
e # output

tensor([[2.4230, 1.1577],
        [1.2516, 1.0909]], grad_fn=<ThMulBackward>)

In [122]:
torch.sum(e) # sum

tensor(5.9232, grad_fn=<SumBackward0>)

In [123]:
a.grad # None

In [124]:
# passing gradient backwards
e.backward(gradient=torch.ones_like(e))

In [125]:
# all grads work
print(a.grad)

print(b.grad)

print(w1.grad)

print(b1.grad)

tensor([[0.9893, 1.4278],
        [0.7053, 0.8353]])
tensor([[2.5932, 2.5116],
        [2.5298, 2.1312]])
tensor([[1.0048, 0.5609],
        [0.9681, 0.7154]])
tensor([[1.4291, 0.9728]])


This implies that increasing `a[0][0]` by `0.1` will increase the sum from `5.9232` to `5.9232 + 0.1 * 0.9893`.

In [126]:
# define what sum with "a_new" should be: 
5.9232 + 0.1 * 0.9893

6.02213

In [127]:
# define helper function
def sum_with_a(a):
    '''
    Re-run the toy forward path with `a` as the first input and return the
    sum of the final output. Uses the notebook-level operations (`c`, `wm1`,
    `ba1`, `wm2`, `ba2`, `add2`, `mul`) and the notebook-level tensor `b`.
    '''
    branch1, branch2 = c.forward(a)

    # first branch: weight multiply, then bias add
    affine1 = wm1.forward(branch1)
    affine1 = ba1.forward(affine1)

    # second branch: same pattern with its own parameters
    affine2 = wm2.forward(branch2)
    affine2 = ba2.forward(affine2)

    combined = add2.forward(affine1, affine2)
    product = mul.forward(b, combined)
    return torch.sum(product)

In [128]:
# define new version of a with one value incremented
a_new = a.clone()
a_new[0][0] += 0.1
print(a_new)

tensor([[1.0462, 0.5658],
        [0.2438, 0.8881]], grad_fn=<CopySlices>)


In [129]:
# test that sum equals expected i.e. that gradients are correct
print(sum_with_a(a))
print(sum_with_a(a_new))

tensor(5.9232, grad_fn=<SumBackward0>)
tensor(6.0222, grad_fn=<SumBackward0>)


Works!

### Turning this into a class

`Layer` class:

* Has forward and backward methods. These simply loop through the operations, passing things forwards or backwards.
* On setup, requires a `num_in`

`AutogradBlock` class:

* Has forward method, which calls the `_output` method and `_setup` if it is the first iteration.
* On setup, requires we define all ops in a `Dict[Operation]`. In addition, we define weights and give them a gradient.
* On `_output`, we actually compute the output from the inputs.
* We get the `param_grads` in much the same way as we get them from `Layer`s, except that we now loop through a dictionary instead of a list.

Trouble is: this solution begs the question: why not write an autograd library and do everything that way?

Options for autograd from scratch:

* Could invest a week and unpack the example at `autodiff` in README.
   * He makes heavy use of `einsum`, which is its own overhead.
* Just use PyTorch (bad - why the operations? Might be ok to illustrate LSTM). 

Point is to illustrate how things work. Period.

In [130]:
# Import lincoln operations
from lincoln.operations.operations import WeightMultiply
from lincoln.operations.operations import BiasAdd
from lincoln.operations.base import ParamOperation
from lincoln.operations.activations import Sigmoid, Tanh

In [254]:
# AutogradBlock: block of operations with multiple inputs and outputs
class AutogradBlock(object):
    '''
    Block of operations with multiple inputs and outputs whose gradients are
    computed by PyTorch autograd.

    Subclasses fill `self.ops` / `self.params` in `_setup_block` and compute
    the outputs from `self.inputs_with_grad` in `_outputs`. On every forward
    pass the inputs and params are replaced by detached, gradient-tracking
    versions, and each ParamOperation in `self.ops` is pointed at its tracked
    parameter.
    '''

    def __init__(self) -> None:
        super().__init__()
        self.params: Dict[str, Tensor] = {}
        self.param_grads: List[Tensor] = []
        self.ops: Dict[str, Operation] = {}
        # True until the first forward pass has run `_setup_block`
        self.first: bool = True


    def _setup_block(self) -> None:
        '''Hook for subclasses: create operations and parameters.'''
        pass


    def forward(self, *inputs) -> Tuple[Tensor]:
        '''
        Run the block on `inputs` and return what `_outputs` produces.

        Calls `_setup_block` on the first invocation only.
        '''
        if self.first:
            self._setup_block()
            self.first = False

        self.inputs = inputs

        self.inputs_with_grad = self._inputs_autograd()

        self.params_with_grad = self._params_autograd()
        self._gradify_operations()

        self.outputs = self._outputs()

        return self.outputs

    def _inputs_autograd(self) -> Tuple[Tensor]:
        '''Detached versions of the inputs with `requires_grad` switched on.'''
        inputs_with_grad = tuple(inp.detach() for inp in self.inputs)
        for inp in inputs_with_grad:
            inp.requires_grad = True
        return inputs_with_grad


    def _params_autograd(self) -> Tuple[Tensor]:
        '''Detached versions of the params (in dict order) with `requires_grad` on.'''
        params_with_grad = tuple(param.detach() for param in self.params.values())
        for param in params_with_grad:
            param.requires_grad = True
        return params_with_grad


    def _gradify_operations(self) -> None:
        '''
        Point each ParamOperation at its gradient-tracking parameter.

        Pairing is positional: the i-th ParamOperation in `self.ops` gets the
        i-th entry of `self.params`, so both dicts must be filled in
        matching order.
        '''
        param_ops = [op for op in self.ops.values()
                     if issubclass(op.__class__, ParamOperation)]
        for op, tensor in zip(param_ops, self.params_with_grad):
            setattr(op, "param", tensor)


    def backward(self, *output_grads) -> Tuple[Tensor]:
        '''
        Back-propagate `output_grads` (one per output) and return the input
        gradients; also refreshes `self.param_grads` when the block owns
        parameters.
        '''
        assert_same_shapes(self.outputs, output_grads)

        self.input_grads = self._input_grads(output_grads)

        if self.params:
            self.param_grads = self._param_grads()

        assert_same_shapes(self.inputs, self.input_grads)
        return self.input_grads


    def _outputs(self) -> Tuple[Tensor]:
        '''Compute the outputs from `self.inputs_with_grad`; must be overridden.'''
        raise NotImplementedError()


    def _input_grads(self, output_grads: Tuple[Tensor]) -> Tuple[Tensor]:
        '''
        Send each output gradient back through its output and collect the
        gradients recorded on `self.inputs_with_grad`.
        '''
        # `_outputs` may return a bare Tensor or a tuple of tensors; treat
        # both uniformly. (The single-output path used to call `.backward`
        # on `self.outputs` itself and pass the whole gradient tuple.)
        outputs = self.outputs
        if isinstance(outputs, Tensor):
            outputs = (outputs,)

        # With several outputs the graph must survive each backward call;
        # a single output keeps the default of freeing it.
        keep_graph = len(output_grads) != 1
        for out, grad in zip(outputs, output_grads):
            out.backward(gradient=grad, retain_graph=keep_graph)

        return tuple(inp.grad for inp in self.inputs_with_grad)


    def _param_grads(self) -> Tuple[Tensor]:
        '''Gradients recorded on the tracked parameters, in param order.'''
        return tuple(param.grad for param in self.params_with_grad)


    def _params(self) -> Tuple[Tensor]:
        '''Raw data of the tracked parameters, in param order.'''
        return tuple(param.data for param in self.params_with_grad)

In [255]:
# ToyExample class, inheriting from AutogradBlock. 
# Analogous to a custom Layer
class ToyExample(AutogradBlock):
    '''
    Toy two-input block: the first input is copied into two branches, each
    branch goes through a weight multiply and a bias add, the branches are
    summed, and the sum is multiplied element-wise with the second input.
    '''

    def __init__(self,
                 seed=12345):
        super().__init__()
        self.seed = seed


    def _setup_block(self) -> None:
        '''
        Create the operations and the randomly initialised parameters.
        The global torch RNG is seeded with `self.seed` first.
        '''
        torch.manual_seed(self.seed)

        self.ops['cp'] = Copy()
        self.ops['add2'] = Add2()
        # `Multiply2` is the element-wise multiply defined in this notebook
        self.ops['mul'] = Multiply2()

        # Params and their ParamOperations are registered in the same
        # relative order (w1, w2, b1, b2 / wm1, wm2, ba1, ba2) because
        # AutogradBlock._gradify_operations pairs them positionally.
        self.params['w1'] = torch.rand(2,2)
        self.ops['wm1'] = WeightMultiply(self.params['w1'])

        self.params['w2'] = torch.rand(2,2)
        self.ops['wm2'] = WeightMultiply(self.params['w2'])

        self.params['b1'] = torch.rand(1,2)
        self.ops['ba1'] = BiasAdd(self.params['b1'])

        self.params['b2'] = torch.rand(1,2)
        self.ops['ba2'] = BiasAdd(self.params['b2'])


    def _outputs(self) -> Tuple[Tensor]:
        '''
        Compute the block output from the two gradient-tracking inputs.
        '''
        a, b = self.inputs_with_grad
        a1, a2 = self.ops['cp'].forward(a)
        c1 = self.ops['ba1'].forward(self.ops['wm1'].forward(a1))
        c2 = self.ops['ba2'].forward(self.ops['wm2'].forward(a2))
        d = self.ops['add2'].forward(c1, c2)

        return self.ops['mul'].forward(b, d)

In [256]:
# instantiate
t = ToyExample(seed=102218)

In [257]:
# new input tensors
torch.manual_seed(102218)
a = torch.rand(2,2)
b = torch.rand(2,2)

In [258]:
# result
out = t.forward(a, b)
print(out)
print(torch.sum(out))

tensor([[3.0692, 0.9283],
        [1.0894, 0.8928]], grad_fn=<ThMulBackward>)
tensor(5.9798, grad_fn=<SumBackward0>)


In [259]:
a.grad # no grad

In [260]:
# sending gradient backwards
t.backward(torch.ones_like(out))

(tensor([[2.2304, 1.3354],
         [1.4560, 1.0821]]), tensor([[3.2847, 2.0140],
         [2.2020, 1.7442]]))

In [261]:
a.grad # still no grad

#### Testing gradients

In [262]:
# new version of a
a_new = a.clone()
a_new[0][0] += 0.1
print(a_new)

tensor([[1.0462, 0.5658],
        [0.2438, 0.8881]])


In [263]:
# printing what output should be
out_new = t.forward(a_new, b)
print(out_new)
print(torch.sum(out_new))

tensor([[3.2449, 0.9756],
        [1.0894, 0.8928]], grad_fn=<ThMulBackward>)
tensor(6.2028, grad_fn=<SumBackward0>)


In [264]:
# predicted value
# old_sum + 0.1 * gradient
5.9798 + 0.1 * 2.2304

6.20284

Works!

#### Testing `param_grads`

In [265]:
# we have grads
t.param_grads

(tensor([[1.0048, 0.5609],
         [0.9681, 0.7154]]), tensor([[1.0048, 0.5609],
         [0.9681, 0.7154]]), tensor([[1.4291, 0.9728]]), tensor([[1.4291, 0.9728]]))

In [266]:
# we have grads
for val in t.params.values():
    print(val)

tensor([[0.9462, 0.5658],
        [0.2438, 0.8881]])
tensor([[0.9344, 0.4609],
        [0.4948, 0.5119]])
tensor([[0.3988, 0.1799]])
tensor([[0.6886, 0.0705]])


### LSTMs and beyond

LSTM Node will be an `AutogradBlock` with three inputs and three outputs. 

LSTM Layer, on the other hand...

## LSTM using `AutogradBlock`

#### `LSTMLayer`
* Will have params.
* Instead of `operations` will have `AutogradBlocks`

#### `LSTMNode`
* `AutogradBlock`
* `forward` method will take in `params` as an argument
* `backward` method will produce `input_grads`, and also need to increment the overall `param_grads`
    * Latter should be ok as long as we have `requires_grad = True` for the params passed into the `Node`
    

In [365]:
# LSTMNode series of operations
class LSTMNode(AutogradBlock):
    '''
    One time step of an LSTM, expressed as an AutogradBlock.

    Inputs to `forward` (after the shared parameter dict) are the step input,
    the previous hidden state and the previous cell state; outputs are the
    step prediction, the new hidden state and the new cell state.
    The node owns no entries in `self.params`: its ParamOperations keep
    referring to the tensors of the `lstm_params` dict given on the first
    forward call.
    '''

    def __init__(self,
                 hidden_size: int,
                 vocab_size: int,
                 seed: int=12345):

        super().__init__()
        self.seed = seed
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size


    def forward(self,
                lstm_params: Dict[str, Tensor],
                *inputs) -> Tuple[Tensor]:
        '''
        Run the step. `lstm_params` is only read on the first call, to build
        the operations; the rest is the base-class forward pass.
        '''
        if self.first:
            self._setup_block(lstm_params)
            self.first = False

        # `self.first` is now False, so the base class skips its own setup
        # call and just performs the shared bookkeeping.
        return super().forward(*inputs)


    def backward(self,
                 lstm_params: Dict[str, Tensor],
                 *output_grads) -> Tuple[Tensor]:
        '''
        Back-propagate the three output gradients and return the three input
        gradients. `lstm_params` is accepted for symmetry with `forward` and
        is not used.
        '''
        return super().backward(*output_grads)


    def _setup_block(self,
                     lstm_params: Dict[str, Tensor]) -> None:
        '''
        Create every operation of the step, wiring the weight/bias operations
        to the tensors in `lstm_params` (keys Wf/Bf, Wi/Bi, Wc/Bc, Wo/Bo, Wv/Bv).
        '''
        # NOTE(review): nothing random is created here, so this only resets
        # the global torch RNG as a side effect - confirm it is still wanted.
        torch.manual_seed(self.seed)

        self.ops['con'] = Concat2()
        self.ops['copy'] = Copy(4)
        self.ops['sig1'] = Sigmoid()
        self.ops['sig2'] = Sigmoid()
        self.ops['sig3'] = Sigmoid()
        self.ops['tan1'] = Tanh()
        self.ops['tan2'] = Tanh()
        # `Multiply2` is the element-wise multiply defined in this notebook
        self.ops['mul1'] = Multiply2()
        self.ops['mul2'] = Multiply2()
        self.ops['mul3'] = Multiply2()
        self.ops['add1'] = Add2()
        self.ops['add2'] = Add2()

        self.ops['Wf'] = WeightMultiply(lstm_params['Wf'])
        self.ops['Bf'] = BiasAdd(lstm_params['Bf'])

        self.ops['Wi'] = WeightMultiply(lstm_params['Wi'])
        self.ops['Bi'] = BiasAdd(lstm_params['Bi'])

        self.ops['Wc'] = WeightMultiply(lstm_params['Wc'])
        self.ops['Bc'] = BiasAdd(lstm_params['Bc'])

        self.ops['Wo'] = WeightMultiply(lstm_params['Wo'])
        self.ops['Bo'] = BiasAdd(lstm_params['Bo'])

        self.ops['Wv'] = WeightMultiply(lstm_params['Wv'])
        self.ops['Bv'] = BiasAdd(lstm_params['Bv'])


    def _outputs(self) -> Tuple[Tensor]:
        '''
        Compute (X_out, H_out, C_new) from the gradient-tracking inputs
        (X_in, H_in, C_in).
        '''
        X_in, H_in, C_in = self.inputs_with_grad

        # every gate reads the same concatenation of input and hidden state
        Z = self.ops['con'].forward(X_in, H_in)
        z1, z2, z3, z4 = self.ops['copy'].forward(Z)

        # forget gate
        F = self.ops['Wf'].forward(z1)
        F = self.ops['Bf'].forward(F)
        F_out = self.ops['sig1'].forward(F)

        # input gate
        I = self.ops['Wi'].forward(z2)
        I = self.ops['Bi'].forward(I)
        I_out = self.ops['sig2'].forward(I)

        # candidate cell values
        C = self.ops['Wc'].forward(z3)
        C = self.ops['Bc'].forward(C)
        C_bar = self.ops['tan1'].forward(C)

        # new cell state: keep part of the old state, add part of the candidate
        c1 = self.ops['mul1'].forward(F_out, C_in)
        c2 = self.ops['mul2'].forward(I_out, C_bar)

        C_new = self.ops['add1'].forward(c1, c2)

        # output gate
        O = self.ops['Wo'].forward(z4)
        O = self.ops['Bo'].forward(O)
        O_out = self.ops['sig3'].forward(O)

        # the hidden state sees a squashed view of the cell state ...
        C_act = self.ops['tan2'].forward(C_new)

        H_out = self.ops['mul3'].forward(O_out, C_act)

        # projection of the hidden state to the output size
        X = self.ops['Wv'].forward(H_out)
        X_out = self.ops['Bv'].forward(X)

        # ... but the state carried to the next step is the un-squashed
        # C_new; returning tanh(C_new) here would squash it again at every step.
        return X_out, H_out, C_new

In [376]:
# LSTMLayer class - series of operations
class LSTMLayer(object):
    '''
    Sequence of `max_len` LSTMNodes that share one set of parameters.

    The parameters are created on the first forward call (their sizes depend
    on `hidden_size` and `vocab_size`; the starting states depend on the batch
    size of the first input). The final hidden and cell state of a forward
    pass become the starting states of the next one.

    Parameter gradients accumulate on `self.params` across `backward` calls;
    call `_zero_param_grads()` to reset them.
    '''

    def __init__(self,
                 max_len: int,
                 vocab_size: int,
                 hidden_size: int = 100):
        super().__init__()
        self.nodes = [LSTMNode(hidden_size, vocab_size) for _ in range(max_len)]
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        # True until the first forward pass has created the parameters
        self.first: bool = True
        self.start_H: Tensor = None
        self.start_C: Tensor = None
        self.params: Dict[str, Tensor] = {}


    def _init_params(self, input_: Tensor) -> None:
        '''
        First dimension of input_ will be batch size
        '''
        batch_size = input_.shape[0]
        self.start_H = torch.zeros(batch_size, self.hidden_size)
        self.start_C = torch.zeros(batch_size, self.hidden_size)

        # The four gates (forget, input, candidate, output) read the
        # concatenated [input, hidden] vector; creation order is kept as
        # Wf, Bf, Wi, Bi, Wc, Bc, Wo, Bo so seeded runs stay reproducible.
        gate_in = self.hidden_size + self.vocab_size
        for gate in ('f', 'i', 'c', 'o'):
            self.params['W' + gate] = torch.rand(gate_in, self.hidden_size)
            self.params['B' + gate] = torch.rand(1, self.hidden_size)

        # projection from the hidden state to the output size
        self.params['Wv'] = torch.rand(self.hidden_size,
                                       self.vocab_size)
        self.params['Bv'] = torch.rand(1, self.vocab_size)

        for param in self.params.values():
            param.requires_grad = True


    def _zero_param_grads(self) -> None:
        '''Reset, in place, every parameter gradient that exists.'''
        for param in self.params.values():
            if param.grad is not None:
                param.grad.data.zero_()


    def _params(self) -> Tuple[Tensor]:
        '''The parameters, in creation order.'''
        return tuple(self.params.values())


    def _param_grads(self) -> Tuple[Tensor]:
        '''The parameter gradients (None where absent), in creation order.'''
        return tuple(param.grad for param in self.params.values())


    def forward(self, input_: Tensor) -> Tensor:
        '''
        Run the sequence through the nodes, one node per time step.

        input_: shape batch size by sequence length by vocab_size.
        Returns a tensor of the same shape as `input_`.
        Raises IndexError if the sequence is longer than `max_len`.
        '''
        seq_len = input_.shape[1]
        if seq_len > len(self.nodes):
            raise IndexError(
                "sequence length {0} exceeds max_len {1} of this layer"
                .format(seq_len, len(self.nodes)))

        if self.first:
            self._init_params(input_)
            self.first = False

        # shape: batch size by sequence length by vocab_size
        self.input_ = input_

        H_in = torch.clone(self.start_H)
        C_in = torch.clone(self.start_C)

        self.output = torch.zeros_like(self.input_)

        for i in range(seq_len):

            # pass info forward through the nodes
            elem_out, H_in, C_in = self.nodes[i].forward(self.params, self.input_[:, i, :],
                                                         H_in, C_in)

            self.output[:, i, :] = elem_out

        # Carry only the values over to the next call: the nodes detach their
        # inputs anyway, so keeping the graph would just hold on to memory.
        self.start_H = H_in.detach()
        self.start_C = C_in.detach()

        return self.output


    def backward(self, output_grad: Tensor) -> Tensor:
        '''
        Back-propagate through time, from the last step to the first.

        output_grad: gradient w.r.t. the output of the last forward call
        (same shape). Returns the gradient w.r.t. that call's input.
        Parameter gradients are accumulated, not reset, by this method.
        '''
        dH_next = torch.zeros_like(self.start_H)
        dC_next = torch.zeros_like(self.start_C)

        self.input_grad = torch.zeros_like(self.input_)

        for i in reversed(range(self.input_.shape[1])):

            # pass gradients backward through the nodes; each node also
            # hands back the gradients of its incoming hidden and cell state
            grad_out, dH_next, dC_next = \
                self.nodes[i].backward(self.params, output_grad[:, i, :],
                                       dH_next, dC_next)

            self.input_grad[:, i, :] = grad_out

        return self.input_grad

#### Testing

In [377]:
# define input, shape: (batch_size, sequence_length, vocab_size)
batch_size = 10
sequence_length = 20
vocab_size = 62
lstm_in = torch.rand(batch_size, sequence_length, vocab_size)
lstm_in.shape

torch.Size([10, 20, 62])

In [378]:
lstm_in[:, 0, :].shape

torch.Size([10, 62])

In [379]:
# instantiate objects
lay = LSTMLayer(sequence_length, vocab_size)

In [380]:
# test passing forward
lstm_out = lay.forward(lstm_in)
print(lstm_out.shape)

torch.Size([10, 20, 62])


In [381]:
# testing backward "retain_graph = True"
x = torch.ones(2, 2, requires_grad=True)
x1 = x + 2
x2 = x + 3
y = x1 + 2
z = x2 + 2
y.backward(torch.ones(2, 2))
print(x.grad)

tensor([[1., 1.],
        [1., 1.]])


In [383]:
# test backward
lstm_grad = torch.ones_like(lstm_out)
lstm_in_grad = lay.backward(lstm_grad)

In [387]:
# params and param grads
lp = lay._params()
lg = lay._param_grads()

print(len(lp))
print(len(lg))
for p, g in zip(lp, lg):
    print(p.shape == g.shape)

10
10
True
True
True
True
True
True
True
True
True
True


TODO: 

* Put this into Lincoln
* Make a demo