# Setup

In [1]:
from torch import Tensor
import torch
from numpy import ndarray as array
from numpy import random

from helper import (to_2d,
                    one_col_to_two)

from typing import List, Tuple

### Assertions

In [2]:
def assert_same_shape(output: Tensor, 
                      output_grad: Tensor) -> None:
    '''
    Check that two tensors have identical shapes.

    Parameters:
    * output: the first tensor (for example a layer's stored output or input).
    * output_grad: the second tensor (for example the gradient paired with it).

    Raises an AssertionError whose message lists the first tensor's shape
    followed by the second tensor's shape when the two differ.
    '''
    # The shapes are formatted in argument order so that "first" and "second"
    # in the message match the call site.
    assert output.shape == output_grad.shape, \
    '''
    Two tensors should have the same shape; instead, first Tensor's shape is {0}
    and second Tensor's shape is {1}.
    '''.format(tuple(output.shape), tuple(output_grad.shape))
    return None

In [3]:
def assert_dim(tensor: Tensor, 
               dim: int):
    '''
    Verify that `tensor` has exactly `dim` dimensions.

    Raises an AssertionError naming the expected and the actual number of
    dimensions when they disagree; otherwise returns None.
    '''
    n_dims = tensor.dim()
    assert n_dims == dim, \
    '''
    Tensor should have dimension {0}, instead it has dimension {1}
    '''.format(dim, n_dims)

## `Layer` base class

In [4]:
class Layer(object):
    '''
    Abstract interface for network layers: every subclass is expected to
    override both `forward` and `backward`.
    '''

    def forward(self, input):
        '''Forward pass on `input`; must be overridden by subclasses.'''
        raise NotImplementedError

    def backward(self, output_grad):
        '''Backward pass on `output_grad`; must be overridden by subclasses.'''
        raise NotImplementedError

## `Loss` base class

In [5]:
class Loss(object):
    '''
    Abstract interface for loss functions: every subclass is expected to
    override `loss_grad`.
    '''

    def loss_grad(self, input):
        """
        Contract for subclasses - an implementation should:
        * Return the loss as a number.
        * Store the gradient of the loss as self.grad
        """
        raise NotImplementedError

## Individual layers

Here's a concept I wish had been explained to me when I started learning about neural nets; it makes understanding how to implement them much clearer:

* Each layer has a `forward` and `backward` method as before.
* The `forward` method receives `input` as (you guessed it) input and returns `self.output`. It stores as instance attributes:
    * `input` as `self.last_input`.
    * the computed result as `self.output`.
* The `backward` method receives `output_grad` as input and returns `input_grad` as its output. Along the way, it checks that:
    * `output_grad` has the same shape as `self.output`.
    * `input_grad` has the same shape as `self.last_input`.
    
When you try to trace what is going on in neural nets, it can often get confusing what layers are sending to and receiving from each other. This should make it clearer.

This also gives us a template for `Layer`s in general. They should all look like:

```python
def forward(self, input: Tensor) -> Tensor:
    
    self.last_input = input
    
    ###############
    # stuff happens
    ###############
    
    return self.output
```

```python
def backward(self, output_grad: Tensor) -> Tensor:
    
    assert_same_shape(self.output, output_grad)
    
    ###############
    # stuff happens
    ###############
    
    assert_same_shape(self.last_input, input_grad)    
    return input_grad
```

Writing batch norm, convolutions (messy but already done), transformers etc. can be done using this structure!

**Question**: is there a way to do this using decorators?

Finally, there's a similar assertion with the `loss_grad` function in the `Loss` class. This function will take in `prediction` as input and calculate `self.grad`, and it must assert that these are the same shape.

```python
def loss_grad(self, prediction: Tensor) -> float:
    
    
    ###############
    # stuff happens
    ###############
    
    assert_same_shape(prediction, self.grad)
    return loss
```

I think introducing all of these concepts will help students generalize from implementing the "basic, fully connected" neural nets from below, to the more complicated stuff like convolutions.

### `Linear` layer

In [6]:
class Linear(Layer):
    '''
    Fully connected layer computing `input.mm(W) + B`.

    The parameters `W` and `B` are created lazily on the first call to
    `forward`, once the number of input features is known.

    NOTE(review): the `bias` flag is stored but never consulted - `B` is
    always created, added in `forward` and updated in `update_params`, even
    with the default `bias=False`. Confirm the intended behavior before
    relying on the flag.
    '''
    def __init__(self, 
                 n_out: int, 
                 bias: bool = False) -> None:
        '''
        Parameters:
        * n_out: number of output features (columns of `W`).
        * bias: stored as `self.bias`; currently unused (see class note).
        '''
        self.n_out = n_out
        # True until the first forward pass has created the weights.
        self.first = True
        self.bias = bias

    def forward(self, input: Tensor) -> Tensor:
        '''
        Compute and store `self.output = input.mm(W) + B` for a 2D `input`.

        Raises an AssertionError if `input` is not 2-dimensional.
        '''

        assert_dim(input, 2)

        self.last_input = input
        
        # Sets up the weights on the first iteration
        if self.first:
            n_input = input.size()[1]
            self.W = torch.randn((n_input, self.n_out))
            self.B = torch.randn((1, self.n_out))
                
            self.first = False

        self.output = torch.mm(self.last_input, self.W) + self.B
        
        return self.output

    def backward(self, output_grad: Tensor) -> Tensor:
        '''
        Store the parameter gradients `self.dW` and `self.dB` and return the
        gradient with respect to the layer's input.

        Raises an AssertionError if `output_grad` does not match the shape of
        `self.output`, or if the computed input gradient does not match the
        shape of `self.last_input`.
        '''

        # Key assertion
        assert_same_shape(self.output, output_grad)
        
        self.dW = torch.mm(self.last_input.transpose(0, 1), output_grad)
        # Column sums of output_grad; to_2d(..., "row") presumably reshapes
        # them into a single row matching B - confirm against the helper.
        self.dB = to_2d(torch.sum(output_grad, dim=0), "row")
        
        input_grad = output_grad.mm(self.W.transpose(0, 1))
        
        # Key assertion        
        assert_same_shape(self.last_input, input_grad)
        
        return input_grad
    
    def update_params(self,  
                      method: str, 
                      learning_rate: float) -> None:
        '''
        Apply one parameter update using the gradients stored by `backward`.

        Parameters:
        * method: name of the update rule; only "sgd" is supported.
        * learning_rate: step size multiplying the stored gradients.

        Raises an AssertionError for an unsupported `method`.
        '''
        
        methods = ["sgd"]
        
        assert method in methods,\
        "Method must be one of {0}".format(methods)

        if method == "sgd":
            self.W = self.W - learning_rate * self.dW
            self.B = self.B - learning_rate * self.dB

#### With bias

In [7]:
# class Linear(ParamLayer):
#     def __init__(self, n_out: int) -> None:
#         self.n_out = n_out
#         self.first = True

#     def forward(self, input: Tensor) -> Tensor:
        
#         assert input.dim() == 2, \
#         "Input to Linear layer must have dimension 2, instead input has dimension {0}".\
#         format(input.dim())
        
#         if self.first:
#             n_input = input.size()[1]
#             self.W = torch.randn((n_input, self.n_out))
#             self.b = torch.ones(self.n_out)
#             self.first = False
#         self.last_input = input
#         self.output = torch.mm(self.last_input, self.W) + self.b
        
#         return self.output

#     def backward(self, output_grad: Tensor) -> Tensor:
        
#         assert_grad_shape(self.output, output_grad)
        
#         n = output_grad.shape[0]
        
#         self.dW = torch.mm(self.last_input.T, output_grad)/n
#         self.db = torch.mean(output_grad, axis=0)
        
#         return output_grad.mm(self.W.T)
    
#     def params(self) -> Tuple[Tensor]:
#         return self.W, self.b
    
#     def param_derivs(self) -> Tuple[Tensor]:
#         return self.dW, self.db

In [8]:
class Sigmoid(Layer):
    '''
    Element-wise logistic (sigmoid) activation layer.
    '''
    def __init__(self):
        '''The layer has no parameters, so there is nothing to set up.'''

    def forward(self, input: Tensor) -> Tensor:
        '''Store `input`, then compute, store and return 1 / (1 + exp(-input)).'''
        self.last_input = input

        exp_neg = torch.exp(-1.0 * input)
        self.output = 1.0/(1.0+exp_neg)
        return self.output

    def backward(self, output_grad: Tensor) -> Tensor:
        '''
        Scale `output_grad` by output * (1 - output), the sigmoid derivative
        expressed through the stored output, and return the result.
        '''
        assert_same_shape(self.output, output_grad)

        local_slope = self.output*(1.0-self.output)
        input_grad = local_slope * output_grad

        assert_same_shape(self.last_input, input_grad)
        return input_grad

In [9]:
class SoftmaxLogistic(Layer):
    '''
    This layer is just for logistic regression:
    The forward pass transforms the one column output of logistic regression into two 
    columns, with the second column as "1 minus" the first, and applies a softmax
    across each row.
    The backward pass - in keeping with the forward-backward structure established above -
    receives two dimensional input from the loss function above it and passes one
    dimensional input to the layer below it.
    '''
    def __init__(self):
        pass
        
    def forward(self, input: Tensor) -> Tensor:
        '''
        Store `input`, expand it with `one_col_to_two` and return the
        row-wise softmax of the expanded tensor (also kept as `self.output`).
        '''
        
        self.last_input = input
        
        # assumes one_col_to_two returns a 2D (rows x columns) tensor - the
        # softmax below normalizes along dim=1, i.e. within each row.
        input_2d = one_col_to_two(input)        

        # One vectorized call replaces the per-row Python loop: it avoids the
        # .item() round trip per row, copes with an empty batch and uses the
        # numerically stable built-in softmax.
        self.output = torch.softmax(input_2d, dim=1)
        
        return self.output

    def backward(self, output_grad: Tensor) -> Tensor:
        '''
        For the purposes of this logistic regression, we'll select just the first 
        column of the output_grad.

        Raises an AssertionError if `output_grad` does not match the shape of
        `self.output`, or if the selected column does not match the shape of
        `self.last_input`.
        '''
        
        assert_same_shape(self.output, output_grad)
        
        input_grad = to_2d(output_grad[:, 0], "col")
    
        assert_same_shape(self.last_input, input_grad)
        
        return input_grad

### Cross entropy loss

In [10]:
class CrossEntropy(Loss):
    '''
    Receives the two column output of a logistic regression where the output has been
    fed through softmax.
    '''
    def __init__(self):
        pass
        
    def loss_grad(self, 
                  prediction: Tensor, 
                  actual: Tensor) -> float:
        '''
        Return the mean log loss of `prediction` against `actual` and store
        the gradient `prediction - actual` as `self.grad`.

        Parameters:
        * prediction: 2D tensor of predicted probabilities.
        * actual: tensor of targets with the same shape as `prediction`.

        Raises an AssertionError if `prediction` is not 2-dimensional or if
        the two tensors differ in shape.
        '''
        
        assert_dim(prediction, 2)
        # Without this check a mismatched target would silently broadcast.
        assert_same_shape(prediction, actual)
        
        # Keep probabilities strictly inside (0, 1) so that log() never sees 0,
        # which would turn the loss into inf or NaN. Values already inside the
        # interval are left untouched.
        eps = 1e-7
        safe_prediction = torch.clamp(prediction, eps, 1.0 - eps)
        
        # Compute log loss
        log_loss = -1.0 * actual * torch.log(safe_prediction) - (1.0 - actual) * torch.log(1.0 - safe_prediction)
        log_loss_val = torch.mean(log_loss).item()
        
        # Compute derivative - see calculations in book chapter
        self.grad = prediction - actual
        assert_same_shape(prediction, self.grad)
        
        return log_loss_val

## `NeuralNetwork` class

In [11]:
class NeuralNetwork:
    '''
    A sequential stack of layers trained against a single loss.

    Attributes set by the constructor:
    * layers: the layers, applied in order during the forward pass.
    * loss: the Loss object used to score predictions and seed the backward pass.
    * learning_rate, update_rule: passed to each layer's `update_params`.
    '''
    def __init__(self, layers: List[Layer], 
                 loss: Loss, 
                 learning_rate: float = 0.01, 
                 update_rule: str = "sgd") -> None:
        self.layers = layers
        self.loss = loss
        self.learning_rate = learning_rate
        self.update_rule = update_rule
        
    def forward(self, x_batch: Tensor) -> Tensor:
        '''
        Feed a 2D batch through every layer in order and return the final output.

        Raises an AssertionError if `x_batch` is not 2-dimensional.
        '''
        
        assert_dim(x_batch, 2)
        
        x_out = x_batch
        for layer in self.layers:
            x_out = layer.forward(x_out)

        return x_out
    
    def batch_loss(self, 
                   prediction: Tensor, 
                   y_batch: Tensor) -> float:
        '''
        Return the loss of `prediction` against `y_batch`, also stored as
        `self.loss_val`. Calling this makes the loss object compute its gradient.

        Raises an AssertionError if the two tensors differ in shape.
        '''
        
        # Key assertion
        assert_same_shape(prediction, y_batch)
        self.loss_val = self.loss.loss_grad(prediction, y_batch)
        return self.loss_val
    
    def backward(self, loss_grad: Tensor) -> None:
        '''Propagate `loss_grad` through the layers from last to first.'''
    
        grad = loss_grad
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
            
        return None
    
    def train_batch(self, 
                    x_batch: Tensor,
                    y_batch: Tensor) -> float:
        '''
        Run one training step on a batch - forward pass, loss, backward pass
        and parameter update - and return the batch loss.
        '''
        
        predictions = self.forward(x_batch)
        
        # Go through batch_loss so the prediction/target shape check also
        # guards training batches, not only evaluation calls.
        loss = self.batch_loss(predictions, y_batch)
        
        self.backward(self.loss.grad)
        
        self.update_params()
        
        return loss
    
    def _get_param_layers(self) -> List[Layer]:
        '''Return the layers that expose an `update_params` method.'''
        return [layer for layer in self.layers if hasattr(layer, "update_params")]
            
    def update_params(self) -> None:
        '''Ask every parameterized layer to apply one update step.'''
        param_layers = self._get_param_layers()
        for layer in param_layers:
            layer.update_params(self.update_rule, 
                                self.learning_rate)

## Training

### Prep breast cancer data

#### `sklearn` loading

In [12]:
# Load the scikit-learn breast cancer dataset and pull out the pieces used below.
from sklearn.datasets import load_breast_cancer

breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target
features = breast_cancer.feature_names

#### Changing to `Tensor`

In [13]:
# Convert the NumPy arrays to Tensors. The labels become a single column;
# -1 lets the row count follow the data instead of hard-coding 569 rows.
data_tensor = Tensor(data)
target_tensor = Tensor(target).reshape(-1, 1)
# Expand the single label column with the project helper (presumably to the
# two-column layout the loss expects - confirm against helper.one_col_to_two).
target_tensor = one_col_to_two(target_tensor)

#### Generate random batches

Need to clean these functions up but they'll do for now.

In [14]:
Batch = Tuple[Tensor, Tensor]

def generate_batch(x: Tensor, 
                   y: Tensor, 
                   batch_size: int = 10) -> Batch:
    '''
    Draw one random batch: the same randomly chosen rows are taken from both
    `x` and `y` and returned as an (x_batch, y_batch) pair.

    Raises an AssertionError unless both inputs are 2-dimensional.
    '''
    both_2d = (x.dim() == 2) and (y.dim() == 2)
    assert both_2d, \
    "X and Y must be 2 dimensional"

    # One set of row indices is shared so features and targets stay aligned.
    row_ids = choose_random_row_indices(x, batch_size)
    return select_rows(x, row_ids), select_rows(y, row_ids)

In [15]:
def choose_random_row_indices(x: Tensor, 
                              num: int = 10) -> array:
    '''
    Pick `num` distinct row indices of the 2D tensor `x` at random
    (sampling without replacement via numpy's global random state).

    Raises an AssertionError if `x` is not 2-dimensional.
    '''
    assert x.dim() == 2, \
    'x must be a 2D array'

    candidate_rows = range(x.shape[0])
    chosen = random.choice(candidate_rows, num, replace=False)
    return chosen

In [16]:
def select_rows(data: Tensor, 
                indices: array) -> Tensor:
    '''
    Return a new Tensor holding the rows of `data` at the given `indices`,
    in the order the indices are listed.
    '''
    as_numpy = data.numpy()
    picked = as_numpy[indices, :]
    return Tensor(picked)

#### Standardize data

In [17]:
def standardize_data(arr: array) -> array:
    '''
    Standardize each column of a 2D array to zero mean and unit standard
    deviation.

    A constant column has a standard deviation of 0; such a column is only
    centered (giving all zeros) instead of being divided by zero.

    Raises an AssertionError if `arr` is not 2-dimensional.
    '''
    
    assert arr.ndim == 2
    
    means = arr.mean(axis=0)
    stds = arr.std(axis=0)
    # Dividing by 1 leaves the centered constant column at exactly zero and
    # avoids the NaN/inf that 0/0 would otherwise produce.
    stds[stds == 0] = 1.0
    
    return (arr - means) / stds

In [19]:
# Standardize the raw features and convert the result to a Tensor in one step.
data_standard = Tensor(standardize_data(data))

### Training loop

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
def train(nn: NeuralNetwork,
          X: Tensor,
          y: Tensor,
          num_iter: int = 1000, 
          print_every: int = 200,
          batch_size: int = 64) -> NeuralNetwork:
    '''
    Train `nn` on random batches drawn from `X` and `y` and return it.

    Parameters:
    * nn: the network to train (updated in place).
    * X, y: 2D feature and target tensors with the same number of rows.
    * num_iter: number of batches to train on.
    * print_every: report the loss on the full `X`, `y` every this many
      iterations; a falsy value disables reporting.
    * batch_size: number of rows per batch.

    Both the numpy and torch random seeds are reset on every call.

    Raises an AssertionError if `X` or `y` is not 2-dimensional or if their
    row counts differ.
    '''

    assert_dim(X, 2)
    assert_dim(y, 2)
    
    assert X.shape[0] == y.shape[0]
    
    random.seed(63018)
    torch.manual_seed(63018)
    for i in range(num_iter):        
        x_batch, y_batch = generate_batch(X, y, batch_size)
        nn.train_batch(x_batch, y_batch)
        if print_every and i % print_every == 0:
            # Report progress on the data passed to this function rather than
            # on module-level globals, so any dataset gets a correct readout.
            predictions = nn.forward(X)
            loss = nn.batch_loss(predictions, y)
            print("Loss after iteration {0} is {1}".\
                  format(i, round(loss, 3)))
    return nn

In [22]:
def eval_accuracy(nn: NeuralNetwork, 
                  X: Tensor, 
                  y: Tensor) -> float:
    '''
    Return the accuracy of `nn` on `X` against `y`, rounded to 3 decimals.

    Only the first column is compared: a prediction counts as positive when
    the first output column exceeds 0.5, and it is scored against the first
    column of `y`.

    Raises an AssertionError if `X` or `y` is not 2-dimensional or if their
    row counts differ.
    '''
    assert_dim(X, 2)
    assert_dim(y, 2)

    assert X.shape[0] == y.shape[0]

    first_col_output = nn.forward(X)[:, 0]
    predicted = first_col_output > 0.5
    actual = y[:, 0]

    accuracy = accuracy_score(actual, predicted)
    return round(accuracy, 3)

In [23]:
# Training constants: iteration count, reporting interval and batch size.
num_iter, print_every, batch_size = 500, 100, 128

Learning rate of 0.01.

In [24]:
# Logistic regression expressed in the framework: one Linear unit followed by
# Sigmoid and the SoftmaxLogistic output layer, trained with cross entropy.
nn = NeuralNetwork(
    layers=[
        Linear(n_out=1),
        Sigmoid(),
        SoftmaxLogistic(),
    ],
    loss=CrossEntropy(),
    learning_rate=0.01,
)

# Train with the explicit iteration, reporting and batch-size settings, then
# evaluate accuracy on the same data.
nn = train(
    nn,
    data_standard,
    target_tensor,
    num_iter=num_iter,
    print_every=print_every,
    batch_size=batch_size,
)
eval_accuracy(nn, data_standard, target_tensor)

Loss after iteration 0 is 0.603
Loss after iteration 100 is 0.345
Loss after iteration 200 is 0.339
Loss after iteration 300 is 0.337
Loss after iteration 400 is 0.335


0.986

Learning rate of 1.

In [25]:
# Same logistic regression architecture, now with a learning rate of 1.
nn2 = NeuralNetwork(
    layers=[
        Linear(n_out=1),
        Sigmoid(),
        SoftmaxLogistic(),
    ],
    loss=CrossEntropy(),
    learning_rate=1,
)

# Train with train()'s default settings, then evaluate on the same data.
nn2 = train(nn2, data_standard, target_tensor)
eval_accuracy(nn2, data_standard, target_tensor)

Loss after iteration 0 is 0.375
Loss after iteration 200 is 0.326
Loss after iteration 400 is 0.324
Loss after iteration 600 is 0.323
Loss after iteration 800 is 0.323


0.991

## Real neural network with hidden layer

In [26]:
# One hidden layer of 10 units with a Sigmoid activation, feeding the same
# single-unit output stack as the logistic regression runs.
nn3 = NeuralNetwork(
    layers=[
        Linear(n_out=10),
        Sigmoid(),
        Linear(n_out=1),
        Sigmoid(),
        SoftmaxLogistic(),
    ],
    loss=CrossEntropy(),
    learning_rate=1,
)

# Train with train()'s default settings, then evaluate on the same data.
nn3 = train(nn3, data_standard, target_tensor)
eval_accuracy(nn3, data_standard, target_tensor)

Loss after iteration 0 is 0.474
Loss after iteration 200 is 0.322
Loss after iteration 400 is 0.321
Loss after iteration 600 is 0.321
Loss after iteration 800 is 0.32


0.993

A neural network with one hidden layer of 10 neurons and a learning rate of 1 does slightly better than a simple logistic regression.

But more importantly - the code works! We have a framework we can use for Deep Learning! I hope it isn't hard to see how you could add in Dropout, learning rate momentum, etc.