The key thing I'll work towards here is a `Model` that has a dictionary of `params` and a `forward` method.

In [1]:
import inspect
from typing import List, NamedTuple, Callable, Optional, Union, Iterator, Dict, Tuple

import numpy as np
np.set_printoptions(precision=4)

In [2]:
Arrayable = Union[float, list, np.ndarray]

Tensorable = Union['Tensor', float, np.ndarray]

def ensure_tensor(tensorable: Tensorable) -> 'Tensor':
    if isinstance(tensorable, Tensor):
        return tensorable
    else:
        return Tensor(tensorable)
    
def ensure_array(arrayable: Arrayable) -> np.ndarray:
    if isinstance(arrayable, np.ndarray):
        return arrayable
    else:
        return np.array(arrayable)

class Dependency(NamedTuple):
    tensor: 'Tensor'
    grad_fn: Callable[[np.ndarray], np.ndarray]
        
def collapse_sum(grad: np.ndarray,
                 t: 'Tensor') -> np.ndarray:

    # Sum out added dims
    ndims_added = grad.ndim - t.data.ndim
    for _ in range(ndims_added):
        grad = grad.sum(axis=0)

    # Sum across broadcasted (but non-added dims)
    for i, dim in enumerate(t.shape):
        if dim == 1:
            grad = grad.sum(axis=i, keepdims=True)
    
    return grad

In [3]:
class Tensor:
    
    def __init__(self,
                 data: np.ndarray,
                 depends_on: List[Dependency] = None,
                 no_grad: bool = False) -> None:
        self.data = ensure_array(data)
        self.depends_on = depends_on or []
        self.no_grad = no_grad
        self.shape = self.data.shape
        self.grad: Optional['Tensor'] = None
        if not self.no_grad:
            self.zero_grad()

    def __repr__(self) -> str:
        return f"Tensor({np.round(self.data, 4)})"
            
    def __add__(self, other: Tensorable) -> 'Tensor':
        """gets called if I do t + other"""
        return _add(self, ensure_tensor(other))

    def __radd__(self, other: Tensorable) -> 'Tensor':
        """gets called if I do other + t"""
        return _add(ensure_tensor(other), self)

    def __iadd__(self, other: Tensorable) -> 'Tensor':
        """when we do t += other"""
        self.data = self.data + ensure_tensor(other).data
        return self    

    def __isub__(self, other: Tensorable) -> 'Tensor':
        """when we do t -= other"""
        self.data = self.data - ensure_tensor(other).data
        return self
    
    def __imul__(self, other: Tensorable) -> 'Tensor':
        """when we do t *= other"""
        self.data = self.data * ensure_tensor(other).data
        return self

    def __mul__(self, other: Tensorable) -> 'Tensor':
        return _mul(self, ensure_tensor(other))

    def __rmul__(self, other: Tensorable) -> 'Tensor':
        return _mul(ensure_tensor(other), self)

    def __matmul__(self, other: Tensorable) -> 'Tensor':
        return _matmul(self, other)

    def __neg__(self) -> 'Tensor':
        return _neg(self)

    def __sub__(self, other: Tensorable) -> 'Tensor':
        return _sub(self, ensure_tensor(other))

    def __rsub__(self, other: Tensorable) -> 'Tensor':
        return _sub(ensure_tensor(other), self)
    
    def __getitem__(self, idxs) -> 'Tensor':
        return _slice(self, idxs)
    
    def concat(self, other: Tensorable) -> 'Tensor':
        return _concat(self, ensure_tensor(other))
    
    def repeat(self, repeats: int) -> 'Tensor':
        return _repeat(self, repeats)

    def expand_dims_axis_1(self) -> 'Tensor':
        return _expand_dims_axis_1(self)

    def append_axis_1(self, other: Tensorable) -> 'Tensor':
        return _append_axis_1(self, ensure_tensor(other))

    def select_index_axis_1(self, ind: int) -> 'Tensor':
        return _select_index_axis_1(self, ind)
    
    def zero_grad(self) -> None:
        self.grad = Tensor(np.zeros_like(self.data, dtype=np.float64),
                           no_grad = True)
    
    def sum(self) -> 'Tensor':
        return tensor_sum(self)
    
    def backward(self, grad: 'Tensor' = None) -> None:
        # backward mostly going to be called on the loss after calling "sum"
        if self.no_grad:
            return
        
        if self.shape == ():
            grad = Tensor(np.array(1.0))
        
        self.grad.data = self.grad.data + grad.data

        for dependency in self.depends_on:
            backward_grad = dependency.grad_fn(grad.data)
            dependency.tensor.backward(Tensor(backward_grad))

#### Backward functions

In [4]:
def _add(t1: Tensor, t2: Tensor) -> Tensor:

    def _forward(t1: Tensor, t2: Tensor) -> np.ndarray:
        return t1.data + t2.data

    def t1_grad(grad: np.ndarray) -> np.ndarray:

        grad = collapse_sum(grad, t1)

        return grad

    def t2_grad(grad: np.ndarray) -> np.ndarray:

        grad = collapse_sum(grad, t2)

        return grad

    data = _forward(t1, t2)
    depends_on = [
        Dependency(t1, t1_grad),
        Dependency(t2, t2_grad)
    ]
    return Tensor(data, depends_on)

def _mul(t1: Tensor, t2: Tensor) -> Tensor:

    def _forward(t1: Tensor, t2: Tensor) -> np.ndarray:
        return t1.data * t2.data

    def t1_grad(grad: np.ndarray) -> np.ndarray:
        grad = grad * t2.data
        grad = collapse_sum(grad, t1)

        return grad

    def t2_grad(grad: np.ndarray) -> np.ndarray:
        grad = grad * t1.data
        grad = collapse_sum(grad, t2)

        return grad

    data = _forward(t1, t2)
    depends_on = [
        Dependency(t1, t1_grad),
        Dependency(t2, t2_grad)
    ]
    return Tensor(data, depends_on)

def _matmul(t1: Tensor, t2: Tensor) -> Tensor:

    assert t1.shape[1] == t2.shape[0]

    def _forward(t1: Tensor, t2: Tensor) -> np.ndarray:
        return t1.data @ t2.data

    def t1_grad(grad: np.ndarray) -> np.ndarray:
        grad = grad @ t2.data.T

        return grad

    def t2_grad(grad: np.ndarray) -> np.ndarray:
        grad = t1.data.T @ grad

        return grad

    data = _forward(t1, t2)
    depends_on = [
        Dependency(t1, t1_grad),
        Dependency(t2, t2_grad)
    ]
    return Tensor(data, depends_on)

def _neg(t: Tensor) -> Tensor:

    def _forward(t: Tensor) -> np.ndarray:
        return -t.data

    def t_grad(grad: np.ndarray) -> np.ndarray:
        return -grad

    data = _forward(t)
    depends_on = [
        Dependency(t, t_grad),
    ]
    return Tensor(data, depends_on)

def _sub(t1: Tensor, t2: Tensor) -> Tensor:
    return t1 + -t2

def _slice(t: Tensor, idxs: slice) -> Tensor:

    def _forward(t: Tensor, idxs):
        return t.data[idxs]

    data = _forward(t, idxs)    
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        bigger_grad = np.zeros_like(t.data)
        bigger_grad[idxs] = grad
        return bigger_grad

    depends_on = [
        Dependency(t, t_grad),
    ]

    return Tensor(data, depends_on, t.no_grad)

def tensor_sum(t: Tensor) -> Tensor:

    def _forward(t: Tensor):
        return t.data.sum()

    def t_grad(grad: np.ndarray) -> np.ndarray:
        return grad * np.ones_like(t.data)

    data = _forward(t)
    depends_on = [
        Dependency(t, t_grad),
    ]

    return Tensor(data, depends_on)        
        

#### Extra functions

In [5]:
def _concat(t1: Tensor, t2: Tensor) -> Tensor:
    
    assert t1.shape[0] == t2.shape[0],\
    "Concatenated Tensors must have the same shape along first dimension"
    
    def _forward(t1: Tensor, t2: Tensor):
        return np.concatenate([t1.data, t2.data], axis=1)    
    
    def t1_grad(grad: np.ndarray) -> np.ndarray:
        return grad[:,:t1.shape[1]]
    
    def t2_grad(grad: np.ndarray) -> np.ndarray:
        return grad[:,t1.shape[1]:]

    data = _forward(t1, t2)
    
    depends_on = [
        Dependency(t1, t1_grad),
        Dependency(t2, t2_grad)
    ]

    return Tensor(data, depends_on)

In [6]:
def _repeat(t: Tensor, repeats: int) -> Tensor:

    assert t.shape[0] == 1,\
    "Repeat operation should only be used on rows"
    
    def _forward(t: Tensor, repeats: int) -> np.ndarray:
        return np.repeat(t.data, repeats, axis=0)    
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        return grad.sum(axis=0)

    data = _forward(t, repeats)
    
    depends_on = [
        Dependency(t, t_grad)
    ]

    return Tensor(data, depends_on)

In [7]:
def _stack(t1: Tensor, t2: Tensor) -> Tensor:
    '''
    Stacks two 2d Tensors along axis 1, creating a 
    '''
    assert t.shape[0] == 1,\
    "Repeat operation should only be used on rows"
    
    def _forward(t: Tensor, repeats: int) -> np.ndarray:
        return np.repeat(t.data, repeats, axis=0)    
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        return grad.sum(axis=0)

    data = _forward(t, repeats)
    
    depends_on = [
        Dependency(t, t_grad)
    ]

    return Tensor(data, depends_on)

In [8]:
def _expand_dims_axis_1(t: Tensor) -> Tensor:
    
    assert t.data.ndim == 2
    
    def _forward(t: Tensor) -> np.ndarray:
        return np.expand_dims(t.data, axis=1)    
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        
        assert grad.ndim == 3
        
        return grad[:, 0, :]

    data = _forward(t)
    
    depends_on = [
        Dependency(t, t_grad)
    ]

    return Tensor(data, depends_on)

In [9]:
def _append_axis_1(t1: Tensor, t2: Tensor) -> Tensor:
    
    assert t1.data.ndim == t2.data.ndim == 3
    
    def _forward(t1: Tensor, t2: Tensor) -> np.ndarray:
        return np.append(t1.data, t2.data, axis=1)    
    
    def t1_grad(grad: np.ndarray) -> np.ndarray:
        
        assert grad.ndim == 3
        
        return grad[:, :t1.shape[1], :]
    
    def t2_grad(grad: np.ndarray) -> np.ndarray:
        
        assert grad.ndim == 3
        
        return grad[:, t1.shape[1]:, :]

    data = _forward(t1, t2)
    
    depends_on = [
        Dependency(t1, t1_grad),
        Dependency(t2, t2_grad)
    ]

    return Tensor(data, depends_on)

In [10]:
def _select_index_axis_1(t: Tensor, ind: int) -> Tensor:
    
    assert t.data.ndim == 3
    
    def _forward(t: Tensor, ind: int) -> np.ndarray:
        return t.data[:, ind, :]    

    data = _forward(t, ind)
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        
        assert grad.ndim == 2
        
        bigger_grad = np.zeros_like(t.data)
        bigger_grad[:, ind, :] = grad
        return bigger_grad

    depends_on = [
        Dependency(t, t_grad)
    ]

    return Tensor(data, depends_on, t.no_grad)

#### Test `concat`

In [11]:
# testing concat forward

a = np.random.randn(2, 3)
b = np.random.randn(2, 5)

c = np.concatenate([a, b], axis=1)
c.shape

(2, 8)

In [12]:
# concat grad calculations
a_grad = c[:,:a.shape[1]]
b_grad = c[:,a.shape[1]:]

In [13]:
print(a_grad.shape)
print(b_grad.shape)

(2, 3)
(2, 5)


In [14]:
# testing concat forward

aT = Tensor(a)
bT = Tensor(b)
c = aT.concat(bT)

d = c.sum()

d.backward()

In [15]:
print(aT.grad)
print(bT.grad)

Tensor([[1. 1. 1.]
 [1. 1. 1.]])
Tensor([[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]])


#### Test `repeat`

In [16]:
# testing repeat forward

a = np.random.randn(1, 3)

np.repeat(a, 2, axis=0)

array([[-0.8143,  0.8007, -0.0606],
       [-0.8143,  0.8007, -0.0606]])

In [17]:
# repeat backward

a = np.random.randn(1, 3)

a2 = np.repeat(a, 2, axis=0)

a2.sum(axis=0)

array([ 1.2144, -4.5926, -0.6135])

In [18]:
# repeat with tensors

a = np.random.randn(1, 3)
b = np.random.randn(3, 3)

aT = Tensor(a)
bT = Tensor(b)
a2 = aT.repeat(3)
c = (a2 + bT).sum()
c.backward()

In [19]:
aT.grad

Tensor([[3. 3. 3.]])

#### Testing `slice`

In [20]:
a = Tensor(np.random.randn(3,3,4))
b = a[1:2, :, :]
print(b.shape)
c = b.sum()

(1, 3, 4)


In [21]:
print(c)
c.backward()

Tensor(-0.3911)


In [22]:
a.grad

Tensor([[[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]])

#### Testing `select_index_i`

In [23]:
a = Tensor(np.random.randn(3,3,4))
b = a.select_index_axis_1(1)
print(b.shape)
c = b.sum()

(3, 4)


In [24]:
c.backward()

In [25]:
a.grad

Tensor([[[0. 0. 0. 0.]
  [1. 1. 1. 1.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [1. 1. 1. 1.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [1. 1. 1. 1.]
  [0. 0. 0. 0.]]])

#### Other classes

In [26]:
class Parameter(Tensor):
    def __init__(self, *shape) -> None:
        data = np.random.randn(*shape)
        super().__init__(data)

In [27]:
class Model:
    def parameters(self) -> Iterator[Parameter]:
        for name, value in inspect.getmembers(self):
            if isinstance(value, Parameter):
                yield value
            elif isinstance(value, Model):
                yield from value.parameters()

    def zero_grad(self):
        for parameter in self.parameters():
            parameter.zero_grad()

In [28]:
class SGD:
    def __init__(self, lr: float = 0.01) -> None:
        self.lr = lr

    def step(self, model: Model) -> None:
        for parameter in model.parameters():
            parameter -= parameter.grad * self.lr


#### Activations

In [29]:
def tanh(t: Tensor) -> Tensor:
    def _forward(t: Tensor):
        return np.tanh(t.data)

    data = _forward(t)
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        return grad * (1 - data * data)

    depends_on = [
        Dependency(t, t_grad)
    ]
    
    return Tensor(data, depends_on)

In [30]:
def sigmoid(t: Tensor) -> Tensor:
    
    def _forward(t: Tensor) -> np.ndarray:
        return 1.0 / (1.0 + np.exp(-(t.data)))

    data = _forward(t)
    
    def t_grad(grad: np.ndarray) -> np.ndarray:
        return grad * data * (1.0 - data)

    depends_on = [
        Dependency(t, t_grad)
    ]
    
    return Tensor(data, depends_on)

### Boston data

In [31]:
from sklearn.datasets import load_boston

boston = load_boston()

data = boston.data
target = boston.target
features = boston.feature_names

from sklearn.preprocessing import StandardScaler
s = StandardScaler()
data = s.fit_transform(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=80718)

y_train, y_test = y_train.reshape((-1,1)), y_test.reshape((-1,1))

X_train, X_test, y_train, y_test = (Tensor(X_train, no_grad=True),
                                    Tensor(X_test, no_grad=True),
                                    Tensor(y_train, no_grad=True),
                                    Tensor(y_test, no_grad=True))

In [32]:
class BostonModel(Model):
    def __init__(self, 
                 num_hidden: int = 13,
                 seed: int = 1) -> None:
        np.random.seed(seed)
        self.w1 = Parameter(13, num_hidden)
        self.b1 = Parameter(num_hidden)

        self.w2 = Parameter(num_hidden, 1)
        self.b2 = Parameter(1)

    def predict(self, inputs: Tensor) -> Tensor:
        # inputs will be (batch_size, 13)
        x1 = inputs @ self.w1 + self.b1  # (batch_size, num_hidden)
        x2 = tanh(x1)                    # (batch_size, num_hidden)
        x3 = x2 @ self.w2 + self.b2      # (batch_size, 4)

        return x3

### Train model

In [33]:
optimizer = SGD(lr=0.001)
batch_size = 32
model = BostonModel(seed=112418)
train_size = X_train.shape[0]

for epoch in range(10):
    epoch_loss = 0.0

    for start in range(0, train_size, batch_size):
        end = start + batch_size

        model.zero_grad()

        inputs = X_train[start:end]
#         inputs.no_grad = True

        predicted = model.predict(inputs)
        actual = y_train[start:end]
#         actual.no_grad = True

        errors = predicted - actual
        loss = (errors * errors).sum()
        loss.backward()
        optimizer.step(model)
        
        
        # test predictions
    predicted = model.predict(X_test)
    errors = predicted - y_test
    loss = (errors * errors).sum()
        
    print(epoch, loss)

0 Tensor(10929.9298)
1 Tensor(5931.3808)
2 Tensor(5032.0439)
3 Tensor(4765.6974)
4 Tensor(4596.3881)
5 Tensor(4585.5644)
6 Tensor(4554.1706)
7 Tensor(4486.7719)
8 Tensor(4400.3648)
9 Tensor(4251.6124)


# Simple Tensor Examples 

### Simple forward with addition

```
a 
 \
  c - s
 /
b
```

In [34]:
np.random.seed(112518)
a = Tensor(np.random.randn(2,2))
b = Tensor(np.random.randn(2,2))
c = a + b
s = c.sum()

print(a.grad)
s.backward()
print(a.grad)

Tensor([[0. 0.]
 [0. 0.]])
Tensor([[1. 1.]
 [1. 1.]])


### Simple forward with branching

```
a 
 \
  c1 
 /  \
b
      s
a 
 \   /
  c2 
 /
b 
```

In [35]:
np.random.seed(112518)
a = Tensor(np.random.randn(2,2))
b = Tensor(np.random.randn(2,2))
c1 = a + b
c2 = a * b
s = (c1 + c2).sum()

print(a.grad)
s.backward()
print(a.grad)
print()
print("a:", a)
print("s:", s)

Tensor([[0. 0.]
 [0. 0.]])
Tensor([[0.6462 2.4838]
 [1.0027 1.1719]])

a: Tensor([[ 2.1617 -0.4313]
 [ 0.3818  0.7105]])
s: Tensor(2.8455)


This implies that increasing `a[0][0]` by 0.1 from `2.1617` to `2.2617` would increase `s` from `2.8455` to:

In [36]:
2.8455 + 0.1 * 0.6462

2.91012

Let's check:

In [37]:
def check_sum(a_first_val):
    
    np.random.seed(112518)
    a = np.random.randn(2,2)
    b = np.random.randn(2,2)
    a2 = a.copy()
    a2[0][0] = a_first_val
    c1 = a2 + b
    c2 = a2 * b
    return (c1 + c2).sum()

In [38]:
print(check_sum(2.1617))
print(check_sum(2.2617))

2.8454767470862894
2.910093928197016


Works!

### Using the same array multiple times

In [39]:
w = Tensor(np.random.randn(2,2))
a = Tensor(np.random.randn(2,2))
b = Tensor(np.random.randn(2,2))
c = w + a
s = (w * b).sum()

s.backward()

print("w:", w)
print("s:", s)
print("w.grad:", w.grad)

w: Tensor([[-0.0683  0.6959]
 [ 1.4791  0.5237]])
s: Tensor(-1.2067)
w.grad: Tensor([[-0.5692 -0.3029]
 [-0.0847 -1.7367]])


In [40]:
-0.0683 + 0.1

0.031700000000000006

This implies that increasing `w[0][0]` by 0.1 from `-0.0683` to `0.0317` would increase `s` from `-1.2067` to:

In [41]:
-1.2067 + 0.1 * -0.5692

-1.2636200000000002

In [42]:
def check_sum(w_first_val):
    
    w2 = w.data.copy()
    w2[0][0] = w_first_val
    c.data = w2 + a.data
    return (w2 * b.data).sum()


In [43]:
print(check_sum(-0.0683))
print(check_sum(0.0317))

-1.206689062693263
-1.263613361478857


### Adding bias multiple times

Will mimic adding hidden state multiple times in an LSTM.

### Ultimate goal:

```python
def lstm_node(inputs: Tensor, 
              hiddens: Tensor, 
              cells: Tensor, 
              params: Dict[str, Tensor]):
    
    assert input.shape[0] == hidden.shape[0] == cell.shape[0]
    
    Z = inputs.concat(hidden)
    
    forget = sigmoid(Z @ params['Wf'] + params['Bf'])

    ingate = sigmoid(Z @ params['Wi'] + params['Bi'])
    
    outgate = sigmoid(Z @ params['Wo'] + params['Bo'])
    
    change = tanh(Z @ params['Wc'] + params['Bc'])
    
    cells = cells * forget + ingate * change
    
    hiddens = outgate * tanh(cells)
    
    outputs = hidden @ params['Wv'] + params['Bv']
    
    return outputs, hiddens, cells
```

In [44]:
def init_params(batch_size: int,
                state_size: int,
                vocab_size: int) -> Dict[str, Parameter]:
    
    params = {}
    params['Wf'] = Parameter(state_size + vocab_size, state_size)
    params['Wi'] = Parameter(state_size + vocab_size, state_size)
    params['Wo'] = Parameter(state_size + vocab_size, state_size)
    params['Wc'] = Parameter(state_size + vocab_size, state_size)
    params['Wv'] = Parameter(state_size, vocab_size)
    
    params['Bf'] = Parameter(state_size)
    params['Bi'] = Parameter(state_size)
    params['Bo'] = Parameter(state_size)
    params['Bc'] = Parameter(state_size)
    params['Bv'] = Parameter(vocab_size)
    
    return params

In [45]:
# constants
batch_size = 12
state_size = 20
vocab_size = 30

np.random.seed(112518)

# initial data
h_init = Tensor(np.random.randn(1, state_size))
c_init = Tensor(np.random.randn(1, state_size))
inputs = Tensor(np.random.randn(batch_size, vocab_size))

# initialize params
params = init_params(batch_size, state_size, vocab_size)

# repeat cell and hidden
hiddens = h_init.repeat(batch_size)
cells = c_init.repeat(batch_size)

In [46]:
def lstm_node(inputs: Tensor, 
              hiddens: Tensor, 
              cells: Tensor, 
              params: Dict[str, Parameter]):

    assert inputs.shape[0] == hiddens.shape[0] == cells.shape[0]

    Z = inputs.concat(hiddens)

    forget = sigmoid(Z @ params['Wf'] + params['Bf'])

    ingate = sigmoid(Z @ params['Wi'] + params['Bi'])

    outgate = sigmoid(Z @ params['Wo'] + params['Bo'])

    change = tanh(Z @ params['Wc'] + params['Bc'])

    cells = cells * forget + ingate * change

    hiddens = outgate * tanh(cells)

    outputs = hiddens @ params['Wv'] + params['Bv']

    return outputs, hiddens, cells

In [47]:
o_out, h_out, c_out = lstm_node(inputs, hiddens, cells, params)

In [48]:
o_out.shape # (batch_size, vocab_size)

(12, 30)

In [49]:
s = o_out.sum()
s.backward()

In [50]:
s

Tensor(70.9797)

In [51]:
print(h_init)
print(h_init.grad)

Tensor([[ 2.1617 -0.4313  0.3818  0.7105 -0.3538  1.4838  0.0027  0.1719 -0.0683
   0.6959  1.4791  0.5237  1.0117  0.0821 -0.2638 -1.4929 -0.5692 -0.3029
  -0.0847 -1.7367]])
Tensor([[  5.8457  -3.3987   3.0234   9.0145   7.7679  -2.7426   3.7362 -25.35
   -7.5784 -19.3007  17.2762   8.8289 -21.3072 -16.8012 -21.413   19.6036
    1.4624  10.1015   4.6027   2.511 ]])


This implies that if I increase `h_init[0]` from `2.1617` to `2.2617`, `s` will change to:

In [52]:
70.9797 + 0.1 * 5.8457

71.56427

In [53]:
def lstm_node_np(inputs: np.ndarray, 
              hiddens: np.ndarray, 
              cells: np.ndarray, 
              params: Dict[str, Tensor]):

    assert inputs.shape[0] == hiddens.shape[0] == cells.shape[0]

    def _sigmoid(arr: np.ndarray) -> np.ndarray:
        return 1.0 / (1.0 + np.exp(-arr))
        
    Z = np.concatenate([inputs, hiddens], axis=1)

    forget = _sigmoid(Z @ params['Wf'].data + params['Bf'].data)

    ingate = _sigmoid(Z @ params['Wi'].data + params['Bi'].data)

    outgate = _sigmoid(Z @ params['Wo'].data + params['Bo'].data)

    change = np.tanh(Z @ params['Wc'].data + params['Bc'].data)

    cells = cells * forget + ingate * change

    hiddens = outgate * np.tanh(cells)

    outputs = hiddens @ params['Wv'].data + params['Bv'].data

    return outputs, hiddens, cells

In [54]:
h_init_np = h_init.data
print(h_init_np)
h_init_np[0][0] += 0.1
print(h_init_np)

[[ 2.1617 -0.4313  0.3818  0.7105 -0.3538  1.4838  0.0027  0.1719 -0.0683
   0.6959  1.4791  0.5237  1.0117  0.0821 -0.2638 -1.4929 -0.5692 -0.3029
  -0.0847 -1.7367]]
[[ 2.2617 -0.4313  0.3818  0.7105 -0.3538  1.4838  0.0027  0.1719 -0.0683
   0.6959  1.4791  0.5237  1.0117  0.0821 -0.2638 -1.4929 -0.5692 -0.3029
  -0.0847 -1.7367]]


In [55]:
hiddens = np.repeat(h_init_np, batch_size, axis=0)
hiddens.shape

(12, 20)

In [56]:
out_np, h_np, c_np = lstm_node_np(inputs.data, 
                                  hiddens,
                                  cells.data,
                                  params)

In [57]:
out_np.shape

(12, 30)

In [58]:
out_np.sum()

71.57204981645923

Close!

### Testing weights

Will pass weights through `lstm_node` function twice.

In [59]:
def lstm_node_test_weights(inputs: Tensor, 
              hiddens: Tensor, 
              cells: Tensor, 
              params: Dict[str, Parameter],
              params_Bc: Tensor):

    assert inputs.shape[0] == hiddens.shape[0] == cells.shape[0]

    Z = inputs.concat(hiddens)

    forget = sigmoid(Z @ params['Wf'] + params['Bf'])

    ingate = sigmoid(Z @ params['Wi'] + params['Bi'])

    outgate = sigmoid(Z @ params['Wo'] + params['Bo'])

    change = tanh(Z @ params['Wc'] + params_Bc)

    cells = cells * forget + ingate * change

    hiddens = outgate * tanh(cells)

    outputs = hiddens @ params['Wv'] + params['Bv']

    return outputs, hiddens, cells

In [60]:
# np.random.seed(112518)

# # initial data
# h_init = Tensor(np.random.randn(1, state_size))
# c_init = Tensor(np.random.randn(1, state_size))
# inputs = Tensor(np.random.randn(batch_size, vocab_size))

# # initialize params
# params = init_params(batch_size, state_size, vocab_size)

# # repeat cell and hidden
# hiddens = h_init.repeat(batch_size)
# cells = c_init.repeat(batch_size)

# # params to test changing
# params_Bc = params['Bc'].data

In [61]:
np.random.seed(112518)

# initial data
h_init = Tensor(np.random.randn(1, state_size))
c_init = Tensor(np.random.randn(1, state_size))
inputs = Tensor(np.random.randn(batch_size, vocab_size))

# initialize params
params = init_params(batch_size, state_size, vocab_size)

# repeat cell and hidden
hiddens = h_init.repeat(batch_size)
cells = c_init.repeat(batch_size)

# params to test changing
params_Bc = Parameter(state_size)

In [62]:
# two iterations of lstm_node
outputs_1, hiddens_1, cells_1 = lstm_node_test_weights(inputs, 
                                                       hiddens, 
                                                       cells, 
                                                       params, params_Bc)

outputs_2, hiddens_2, cells_2 = lstm_node_test_weights(outputs_1, 
                                                       hiddens_1, 
                                                       cells_1, 
                                                       params, 
                                                       params_Bc)

In [63]:
# backpropagation
s = outputs_2.sum()
s.backward()

In [64]:
print(s)

Tensor(181.6665)


In [65]:
print(params_Bc)
print(params_Bc.grad)

Tensor([-0.3151 -0.6871  1.6564 -2.2223  0.4094  0.7848 -0.0607  0.4536  0.1576
  0.5422 -1.9662 -0.5593  0.6145  0.3875  0.8967 -0.0577 -0.3317  1.0005
 -1.8615  1.6397])
Tensor([ -0.6133  -2.2566   0.0367  -4.0249  -0.1883  -1.6268   0.0197   0.2878
 -16.2883   0.2083  -7.0469  -0.02    -0.2627 -11.7528  -0.0853  -0.3949
  -0.0702   0.2086   0.0351   5.2966])


This implies that increasing `params_Bc[0]` from `-0.3151` to `-0.2151` will decrease `s` to:

In [66]:
181.6665 + 0.1 * -0.6133

181.60517000000002

#### Testing initial weight change

In [67]:
def lstm_node_test_weights_np(inputs: np.ndarray, 
              hiddens: np.ndarray, 
              cells: np.ndarray,
              params: Dict[str, Tensor],
                           params_Bc: np.ndarray) -> Tuple[Tensor]:

    assert inputs.shape[0] == hiddens.shape[0] == cells.shape[0]

    def _sigmoid(arr: np.ndarray) -> np.ndarray:
        return 1.0 / (1.0 + np.exp(-arr))
        
    Z = np.concatenate([inputs, hiddens], axis=1)

    forget = _sigmoid(Z @ params['Wf'].data + params['Bf'].data)

    ingate = _sigmoid(Z @ params['Wi'].data + params['Bi'].data)

    outgate = _sigmoid(Z @ params['Wo'].data + params['Bo'].data)

    change = np.tanh(Z @ params['Wc'].data + params_Bc)

    cells = cells * forget + ingate * change

    hiddens = outgate * np.tanh(cells)

    outputs = hiddens @ params['Wv'].data + params['Bv'].data

    return outputs, hiddens, cells

In [68]:
params_Bc_2 = params_Bc.data.copy()
params_Bc_2[0] += 0.1

In [69]:
# two iterations of lstm_node
outputs_1, hiddens_1, cells_1 = lstm_node_test_weights_np(inputs.data, 
                                                       hiddens.data, 
                                                       cells.data, 
                                                       params, params_Bc.data)

outputs_2, hiddens_2, cells_2 = lstm_node_test_weights_np(outputs_1, 
                                                       hiddens_1, 
                                                       cells_1, 
                                                       params, 
                                                       params_Bc.data)

In [70]:
outputs_2.sum()

181.66652035171722

In [71]:
# two iterations of lstm_node
outputs_1, hiddens_1, cells_1 = lstm_node_test_weights_np(inputs.data, 
                                                       hiddens.data, 
                                                       cells.data, 
                                                       params, params_Bc_2)

outputs_2, hiddens_2, cells_2 = lstm_node_test_weights_np(outputs_1, 
                                                       hiddens_1, 
                                                       cells_1, 
                                                       params, 
                                                       params_Bc_2)

outputs_2.sum()

181.60007968835646

Good! Passing weights twice worked.

### 3D Input -> 3D Output

Will need to build output up incrementally (as in example from autograd package).

In [72]:
def lstm_node(inputs: Tensor, 
              hiddens: Tensor, 
              cells: Tensor, 
              params: Dict[str, Parameter]):

    assert inputs.shape[0] == hiddens.shape[0] == cells.shape[0]

    Z = inputs.concat(hiddens)
    
#     import pdb; pdb.set_trace()
    forget = sigmoid(Z @ params['Wf'] + params['Bf'])

    ingate = sigmoid(Z @ params['Wi'] + params['Bi'])

    outgate = sigmoid(Z @ params['Wo'] + params['Bo'])

    change = tanh(Z @ params['Wc'] + params['Bc'])

    cells = cells * forget + ingate * change

    hiddens = outgate * tanh(cells)

    outputs = hiddens @ params['Wv'] + params['Bv']

    return outputs, hiddens, cells

In [73]:
np.random.seed(112518)

sequence_length = 1

# initial data
h_init = Tensor(np.random.randn(1, state_size))
c_init = Tensor(np.random.randn(1, state_size))

# 3d input
inputs = Tensor(np.random.randn(batch_size, sequence_length, vocab_size))

# initialize params
params = init_params(batch_size, state_size, vocab_size)

# repeat cell and hidden
hiddens = h_init.repeat(batch_size)
cells = c_init.repeat(batch_size)

In [74]:
inputs[:, 0, :].shape

(12, 30)

In [75]:
for i in range(inputs.shape[1]):
    if i == 0:
        outputs_single, hiddens, cells = lstm_node(inputs.select_index_axis_1(i), 
                                                   hiddens, cells, params)
        outputs = outputs_single.expand_dims_axis_1()
        print(outputs.shape)
    else:
        output_single, hiddens, cells = lstm_node(inputs.select_index_axis_1(i), 
                                                  hiddens, cells, params)
        output = output_single.expand_dims_axis_1()
        outputs = outputs.append_axis_1(output)

(12, 1, 30)


In [76]:
outputs.shape
s = outputs.sum()
s.backward()

In [77]:
s

Tensor(70.9797)

In [78]:
inputs.grad.shape

(12, 1, 30)

In [79]:
print(inputs[1,0,1])
print(inputs.grad[1,0,1])

Tensor(-0.1656)
Tensor(-1.4009)


This implies that if we increase `inputs[1][0][1]` by 0.1, `s` will increase to:

In [80]:
70.9797 + -1.4009 * 0.1

70.83961

#### Testing

In [81]:
def lstm_node_test_input(inputs: np.ndarray, 
              hiddens: Tensor, 
              cells: Tensor, 
              params: Dict[str, Parameter]):

    Z = Tensor(np.concatenate([inputs, hiddens.data], axis=1))
    
    forget = sigmoid(Z @ params['Wf'] + params['Bf'])

    ingate = sigmoid(Z @ params['Wi'] + params['Bi'])

    outgate = sigmoid(Z @ params['Wo'] + params['Bo'])

    change = tanh(Z @ params['Wc'] + params['Bc'])

    cells = cells * forget + ingate * change

    hiddens = outgate * tanh(cells)

    outputs = hiddens @ params['Wv'] + params['Bv']

    return outputs, hiddens, cells

In [82]:
def sum_input_val(input_val):
    np.random.seed(112518)

    sequence_length = 1

    # initial data
    h_init = Tensor(np.random.randn(1, state_size))
    c_init = Tensor(np.random.randn(1, state_size))

    # 3d input
    inputs = Tensor(np.random.randn(batch_size, sequence_length, vocab_size))

    # initialize params
    params = init_params(batch_size, state_size, vocab_size)

    # repeat cell and hidden
    hiddens = h_init.repeat(batch_size)
    cells = c_init.repeat(batch_size)
    
    inputs_data = inputs.data
    inputs_data_2 = inputs_data.copy()
    inputs_data_2[1][0][1] = input_val
    
    for i in range(inputs_data.shape[1]):
        if i == 0:
            outputs_single, hiddens, cells = lstm_node_test_input(inputs_data_2[:, i, :], 
                                                       hiddens, cells, params)
            outputs = outputs_single.expand_dims_axis_1()
        else:
            output_single, hiddens, cells = lstm_node_test_input(inputs_data_2[:, i, :], 
                                                      hiddens, cells, params)
            output = output_single.expand_dims_axis_1()
            outputs = outputs.append_axis_1(output)
        
    return outputs.sum().data.item()

In [83]:
[sum_input_val(x) for x in np.arange(-0.4, 0.1, 0.1)]

[71.33558110395718,
 71.17716269928641,
 71.02855643442268,
 70.89009869126181,
 70.76228974043892]

In [84]:
(70.89009869126181 - 71.02855643442268) * 10

-1.3845774316087045

Grad is correct!

### Sequence length 2

In [116]:
np.random.seed(112518)

sequence_length = 6 

# initial data
h_init = Tensor(np.random.randn(1, state_size))
c_init = Tensor(np.random.randn(1, state_size))

# 3d input
inputs = Tensor(np.random.randn(batch_size, sequence_length, vocab_size), no_grad=True)

# initialize params
params = init_params(batch_size, state_size, vocab_size)

# repeat cell and hidden
hiddens = h_init.repeat(batch_size)
cells = c_init.repeat(batch_size)

In [117]:
for i in range(inputs.shape[1]):
    if i == 0:
        outputs_single, hiddens, cells = lstm_node(inputs.select_index_axis_1(i), 
                                                   hiddens, cells, params)
        outputs = outputs_single.expand_dims_axis_1()
    else:
        output_single, hiddens, cells = lstm_node(inputs.select_index_axis_1(i), 
                                                  hiddens, cells, params)
        output = output_single.expand_dims_axis_1()
        outputs = outputs.append_axis_1(output)

In [118]:
s = outputs.sum()
print(s)

Tensor(405.8718)


In [119]:
import time
start = time.time()
s.backward()
print({time.time() - start})

{1.9740800857543945}


### Test time vs. sequence length

In [89]:
def time_vs_sequence_length(seq_len):
    
    return time

In [90]:
np.random.seed(112518)

sequence_length = 5

# initial data
h_init = Tensor(np.random.randn(1, state_size))
c_init = Tensor(np.random.randn(1, state_size))

# 3d input
inputs = Tensor(np.random.randn(batch_size, sequence_length, vocab_size))

# initialize params
params = init_params(batch_size, state_size, vocab_size)

# repeat cell and hidden
hiddens = h_init.repeat(batch_size)
cells = c_init.repeat(batch_size)

In [91]:
def sum_input_val(input_val, sequence_length):
    np.random.seed(112518)

    # initial data
    h_init = Tensor(np.random.randn(1, state_size))
    c_init = Tensor(np.random.randn(1, state_size))

    # 3d input
    inputs = Tensor(np.random.randn(batch_size, sequence_length, vocab_size))

    # initialize params
    params = init_params(batch_size, state_size, vocab_size)

    # repeat cell and hidden
    hiddens = h_init.repeat(batch_size)
    cells = c_init.repeat(batch_size)
    
    inputs_data = inputs.data
    inputs_data_2 = inputs_data.copy()
    inputs_data_2[1][sequence_length-1][1] = input_val
    
    for i in range(inputs_data.shape[1]):
        if i == 0:
            outputs_single, hiddens, cells = lstm_node_test_input(inputs_data_2[:, i, :], 
                                                       hiddens, cells, params)
            outputs = outputs_single.expand_dims_axis_1()
        else:
            output_single, hiddens, cells = lstm_node_test_input(inputs_data_2[:, i, :], 
                                                      hiddens, cells, params)
            output = output_single.expand_dims_axis_1()
            outputs = outputs.append_axis_1(output)
    
    s = outputs.sum()    
    
    s.backward()
    
    return s.data.item(), inputs

In [92]:
[sum_input_val(x, 2) for x in np.arange(1.4, 1.9, 0.1)]

[(11.052709279981144,
  Tensor([[[-4.9570e-01 -1.2251e+00  9.8840e-01  2.7400e-02  8.6630e-01
     -5.2580e-01  1.0630e-01  2.7198e+00 -1.1489e+00 -3.6740e-01
      4.4880e-01 -7.3370e-01 -2.0529e+00 -2.0540e-01 -1.1125e+00
      6.1160e-01  6.1490e-01  1.3727e+00 -8.8930e-01  4.7830e-01
     -1.0350e+00 -8.4850e-01 -2.5330e-01  7.3520e-01  1.5818e+00
     -1.0196e+00  1.3350e-01 -7.7300e-02  1.1130e+00  1.5340e-01]
    [ 1.2619e+00 -1.6560e-01  9.8200e-02  1.4630e+00 -3.2620e-01
     -2.6550e-01 -1.1930e-01 -1.0223e+00 -5.6000e-03  1.0546e+00
      9.7970e-01 -9.3050e-01 -5.0980e-01  1.0708e+00  2.3520e-01
     -8.4260e-01  4.9080e-01  8.9230e-01 -2.8100e-02 -2.3800e-02
     -5.5800e-02  7.1210e-01 -2.7930e-01 -1.0040e-01 -5.5520e-01
      6.8150e-01  1.4822e+00 -4.3980e-01  2.0110e-01 -4.3610e-01]]
  
   [[ 2.6810e-01  3.8080e-01 -1.2532e+00  1.2541e+00 -2.3490e-01
     -8.6770e-01  7.5060e-01 -1.9595e+00  8.5710e-01 -7.4740e-01
      1.5610e-01  5.8700e-02  6.5660e-01 -6.8740e-01 -4

In [93]:
(11.546474090837279 - 11.37184627929096) * 10

1.7462781154631912

Good! Gradients are correct.

In [94]:
[sum_input_val(x, 11) for x in np.arange(-1.2, -0.8, 0.1)]

[(-416.0378316897645,
  Tensor([[[-0.4957 -1.2251  0.9884 ... -0.0773  1.113   0.1534]
    [ 1.2619 -0.1656  0.0982 ... -0.4398  0.2011 -0.4361]
    [ 0.2681  0.3808 -1.2532 ... -0.2891  0.5531 -1.1989]
    ...
    [-2.1346 -0.9519 -0.1124 ...  0.0515 -1.2297  0.2146]
    [ 1.2736  0.7889 -1.5609 ...  0.4688  0.308  -0.1725]
    [ 0.3723 -0.9119  1.0292 ...  0.8877  0.2504 -0.2204]]
  
   [[ 0.6355 -0.007   1.5405 ... -0.3527 -0.1827  0.8867]
    [-1.8614 -0.4224 -0.2571 ... -0.8535 -0.0968  0.9494]
    [ 0.4398 -2.0366  1.0579 ...  0.8555 -0.29   -0.1914]
    ...
    [-0.1639  0.4193 -0.9304 ...  0.3736 -0.0844 -0.8258]
    [ 1.907  -0.4342 -2.1665 ...  0.0076 -1.2245  1.0287]
    [-0.1237 -0.1498 -1.3146 ... -2.3476  0.6635 -0.2686]]
  
   [[ 0.766   1.2944 -0.2433 ...  0.5502 -0.2871 -1.0069]
    [ 0.0664 -0.4838 -0.1887 ... -0.9835 -0.4388 -0.041 ]
    [ 2.2724 -0.1125  1.1493 ... -0.4063  1.4525 -0.0086]
    ...
    [ 2.0698  0.2094 -0.8956 ...  0.178  -0.2295  1.0218]
    [ 1.234

In [95]:
(188.85741144545887 - 188.79192644280812) * 10

0.6548500265074608