# Sections

Frameworks:

* PyTorch first pass
    * `Model` with `forward` method.
    * Manual training loop
* PyTorch second pass
    * `Model` with `forward` method.
    * `Trainer` class that takes in:
        * `Model`
        * `Optimizer`
        * `_Loss`

Models:

* Boston dataset (used for testing)
* Fashion MNIST
* AE
* GAN
* Transformer
* NTM

In [1]:
# imports
from typing import Tuple, List

import torch
import torch.optim as optim
from torch.optim import Optimizer

import numpy as np
from torch import Tensor

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss

from lincoln.utils import permute_data, assert_dim

from lincoln.pytorch.model import PyTorchModel
from lincoln.pytorch.train import PyTorchTrainer

In [2]:
# %load_ext autoreload
# %autoreload 2

# Boston dataset

In [3]:
from sklearn.datasets import load_boston

boston = load_boston()

data = boston.data
target = boston.target
features = boston.feature_names

from sklearn.preprocessing import StandardScaler
s = StandardScaler()
data = s.fit_transform(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=80718)

X_train, X_test, y_train, y_test = Tensor(X_train), Tensor(X_test), Tensor(y_train), Tensor(y_test)

### Defining Boston model

In [4]:
class BostonModel(PyTorchModel):

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(13, 13)
        self.fc2 = nn.Linear(13, 1)

    def forward(self, x):
        
        assert_dim(x, 2)
        
        assert x.shape[1] == 13
        
        x = torch.tanh(self.fc1(x))
        x = self.fc2(x)
        return x

net = BostonModel()
print(net)

BostonModel(
  (fc1): Linear(in_features=13, out_features=13, bias=True)
  (fc2): Linear(in_features=13, out_features=1, bias=True)
)


In [5]:
# model, optimizer, loss
optimizer = optim.SGD(net.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [6]:
# constants
epochs = 20
batch_size = 32

In [7]:
def generate_batches(X: Tensor,
                      y: Tensor,
                      size: int = 32) -> Tuple[Tensor]:

    N = X.shape[0]

    for ii in range(0, N, size):
        X_batch, y_batch = X[ii:ii+size], y[ii:ii+size]

        yield X_batch, y_batch

In [8]:
# constants
for e in range(epochs):
    X_train, y_train = permute_data(X_train, y_train)  

    batch_generator = generate_batches(X_train, y_train,
                                       batch_size)

    for ii, (X_batch, y_batch) in enumerate(batch_generator):

        optimizer.zero_grad()   
        output = net(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()    
    
    optimizer.zero_grad() 
    output = net(X_test)
    loss = criterion(output, y_test)
    print(e, loss.item())

0 524.85205078125
1 478.5111389160156
2 420.73828125
3 353.9407958984375
4 289.292724609375
5 233.33729553222656
6 189.20748901367188
7 154.6373748779297
8 127.28923034667969
9 112.42050170898438
10 103.9015884399414
11 97.83414459228516
12 94.86502838134766
13 92.8217544555664
14 92.03717803955078
15 91.60162353515625
16 91.26419830322266
17 91.21491241455078
18 91.09342956542969
19 91.04370880126953


## `Trainer` class

In [9]:
from sklearn.datasets import load_boston

boston = load_boston()

data = boston.data
target = boston.target
features = boston.feature_names

from sklearn.preprocessing import StandardScaler
s = StandardScaler()
data = s.fit_transform(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=80718)

X_train, X_test, y_train, y_test = Tensor(X_train), Tensor(X_test), Tensor(y_train), Tensor(y_test)

In [10]:
net = BostonModel()
optimizer = optim.SGD(net.parameters(), lr=0.001)
criterion = nn.MSELoss()

trainer = PyTorchTrainer(net, optimizer, criterion)

trainer.fit(X_train, y_train, X_test, y_test,
            epochs=10,
            eval_every=1)

0 tensor(559.4208, grad_fn=<MseLossBackward>)
1 tensor(526.6153, grad_fn=<MseLossBackward>)
2 tensor(489.2494, grad_fn=<MseLossBackward>)
3 tensor(441.0061, grad_fn=<MseLossBackward>)
4 tensor(378.1341, grad_fn=<MseLossBackward>)
5 tensor(309.1067, grad_fn=<MseLossBackward>)
6 tensor(247.6548, grad_fn=<MseLossBackward>)
7 tensor(204.3143, grad_fn=<MseLossBackward>)
8 tensor(154.8956, grad_fn=<MseLossBackward>)
9 tensor(129.8301, grad_fn=<MseLossBackward>)


### MNIST data

In [11]:
from torchvision.datasets import MNIST
mnist_trainset = MNIST(root="../exploratory/data/", train=True, download=True, transform=None)
mnist_testset = MNIST(root="../exploratory/data/", train=False, download=True, transform=None)

In [12]:
data = mnist_trainset
num_labels = len(data.train_labels)
train_labels = torch.zeros(num_labels, 10)
for i in range(num_labels):
    train_labels[i][data.train_labels[i]] = 1
train_labels.shape

torch.Size([60000, 10])

In [13]:
data = mnist_testset
num_labels = len(data.test_labels)
test_labels = torch.zeros(num_labels, 10)
for i in range(num_labels):
    test_labels[i][data.test_labels[i]] = 1
test_labels.shape

torch.Size([10000, 10])

In [14]:
mnist_train = mnist_trainset.train_data.type(torch.float32).unsqueeze(3) / 255.0
mnist_test = mnist_testset.test_data.type(torch.float32).unsqueeze(3) / 255.0

In [15]:
X_train = mnist_train
X_test = mnist_test
y_train = train_labels
y_test = test_labels

In [16]:
X_train_perm = X_train.permute(0, 3, 1, 2)
X_test_perm = X_test.permute(0, 3, 1, 2)

In [17]:
class MNIST_ConvNet(PyTorchModel):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        assert_dim(x, 4)
        
        # num channgels
        assert x.shape[1] == 1        
        
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.shape[0], x.shape[1] * x.shape[2] * x.shape[3])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [18]:
model = MNIST_ConvNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [20]:
trainer = PyTorchTrainer(model, optimizer, criterion)

trainer.fit(X_train_perm, mnist_trainset.train_labels, 
            X_test_perm, mnist_testset.test_labels,
            epochs=1,
            eval_every=1)

0 tensor(0.1087, grad_fn=<NllLossBackward>)


In [23]:
out = model.forward(X_train_perm)


## AEs

Writing a custom trainer.

In [18]:
def permute_data(X: Tensor, seed=1):
    perm = torch.randperm(X.shape[0])
    return X[perm]

In [19]:
class AutoEncoderTrainer(PyTorchTrainer):
    def __init__(self,
                 model: PyTorchModel,
                 optim: Optimizer,
                 criterion: _Loss):
        super().__init__(model, optim, criterion)
        self._check_optim_net_aligned()
        
    def _generate_batches(self,
                          X: Tensor,
                          size: int = 32) -> Tuple[Tensor]:

        N = X.shape[0]

        for ii in range(0, N, size):
            X_batch = X[ii:ii+size]

            yield X_batch


    def fit(self, X_train: Tensor,
            X_test: Tensor,
            epochs: int=100,
            eval_every: int=10,
            batch_size: int=32):

        for e in range(epochs):
            X_train = permute_data(X_train)

            batch_generator = self._generate_batches(X_train, batch_size)

            for ii, X_batch in enumerate(batch_generator):
                
                self.optim.zero_grad()   # zero the gradient buffers
                encoding, output = self.model(X_batch)
                loss = self.loss(output, X_batch)
                loss.backward()
                self.optim.step()    # Does the update

            self.optim.zero_grad()
            _, output = self.model(X_test)
            loss = self.loss(output, X_test)
            print(e, loss)


Model follows [here](https://github.com/L1aoXingyu/pytorch-beginner/blob/master/08-AutoEncoder/conv_autoencoder.py)

In [20]:
class AutoEncoder(PyTorchModel):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=3, padding=1),  # b, 16, 10, 10
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),  # b, 16, 5, 5
            nn.Conv2d(16, 8, 3, stride=2, padding=1),  # b, 8, 3, 3
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=1)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 16, 3, stride=2),  # b, 16, 5, 5
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 5, stride=3, padding=1),  # b, 8, 15, 15
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 1, 2, stride=2, padding=1),  # b, 1, 28, 28
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [21]:
class AutoEncoderSplit(PyTorchModel):
    def __init__(self):
        super().__init__()
        self.conv2d1 = nn.Conv2d(1, 16, 3, stride=3, padding=1)
        self.maxpool1 = nn.MaxPool2d(2, stride=2)
        self.conv2d2 = nn.Conv2d(16, 8, 3, stride=2, padding=1)
        self.maxpool2 = nn.MaxPool2d(2, stride=1)
        
        self.conv2dT1 = nn.ConvTranspose2d(8, 16, 3, stride=2)
        self.conv2dT2 = nn.ConvTranspose2d(16, 8, 5, stride=3, padding=1)
        self.conv2dT3 = nn.ConvTranspose2d(8, 1, 2, stride=2, padding=1)  # b, 1, 28, 28

        self.relu = nn.ReLU(True)
        self.tanh = nn.Tanh()

    def forward(self, x):
        x = self.conv2d1(x)
        x = self.relu(x)
        x = self.maxpool1(x)
        x = self.conv2d2(x)
        x = self.relu(x)
        x = self.maxpool2(x)
        encoded = x.view(x.shape[0], -1)
        
        x = self.conv2dT1(x)
        x = self.relu(x)
        x = self.conv2dT2(x)
        x = self.relu(x)
        x = self.conv2dT3(x)
        decoded = self.tanh(x)
        return encoded, decoded

In [24]:
learning_rate = 0.01

model = AutoEncoderSplit()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                             weight_decay=1e-5)

In [25]:
trainer = AutoEncoderTrainer(model, optimizer, criterion)

trainer.fit(X_train_perm, X_train_perm, 
            epochs=1,
            eval_every=1)

0 tensor(0.0405, grad_fn=<MseLossBackward>)


In [26]:
encoded, _ = model(X_test_perm)

In [27]:
encoded.shape

torch.Size([10000, 32])

Seems to work! GANs could be done similarly. Will examine later.

To really do AE or GAN, you'll need to write custom trainer.

## LSTM

Working backwards:

* Want a character level model - predict next char.

How to do it?

Pass in sequences. 

### New classes: `NextCharacterModel` and `LSTMTrainer`

In [22]:
a = torch.randn(1, 1, 5)
print(a)
a.repeat(1, 3, 1).shape
a.repeat(1, 3, 1).mean(dim=1)

tensor([[[ 1.3714, -0.1111,  0.7676, -0.9703,  0.0697]]])


tensor([[ 1.3714, -0.1111,  0.7676, -0.9703,  0.0697]])

In [23]:
class NextCharacterModel(PyTorchModel):
    def __init__(self,
                 vocab_size: int,
                 neurons: int = 256,
                 sequence_length: int = 25,
                 reset_every: int = 100):
        super().__init__()
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.hidden_size = neurons
        self.lstm = nn.LSTM(vocab_size, neurons,
                            batch_first=True)
        self.fc_last = nn.Linear(neurons, vocab_size)
        self.count = 0
        self.reset_every = reset_every

    def forward(self,
                inputs: Tensor):
        assert_dim(inputs, 3) # batch_size, sequence_length, vocab_size

        if self.count % self.reset_every == 0:
            self.hidden, self.cells = torch.randn(1, 1, self.hidden_size),\
                torch.randn(1, 1, self.hidden_size)
        
        self.count += 1
        
        hidden, cells = self.hidden.repeat(1, inputs.shape[0], 1),\
            self.cells.repeat(1, inputs.shape[0], 1)

        out, (hidden_out, cells_out) = self.lstm(inputs, (hidden.data, cells.data))
        
        self.hidden.data, self.cells.data = hidden_out.data.mean(dim=1), cells_out.data.mean(dim=1)
        
        out = self.fc_last(out)
        
        return out

In [24]:
class LSTMTrainer(PyTorchTrainer):
    def __init__(self,
                 model: NextCharacterModel,
                 optim: Optimizer,
                 criterion: _Loss):
        super().__init__(model, optim, criterion)
        self.vocab_size = self.model.vocab_size
        self.max_len = self.model.sequence_length
        
    def fit(self,
            data: str,
            epochs: int=10,
            eval_every: int=1,
            batch_size: int=32,
            seed: int = 121718)-> None:
        
        self.data = data
        self.train_data, self.test_data = self._train_test_split_text()
        self.chars = list(set(self.data))
        self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)}

        torch.manual_seed(seed)

        for e in range(epochs):

            batch_generator = self.generate_batches_next_char(batch_size)

            for ii, (X_batch, y_batch) in enumerate(batch_generator):
                print(ii)
#                 if ii == 1:
#                     import pdb; pdb.set_trace()
                self.optim.zero_grad()                
                outputs = self.model(X_batch)
                loss = self.loss(outputs, y_batch)
                print(loss)
                loss.backward()
                self.optim.step()    # Does the update

            if (e+1) % eval_every == 0:

                X_test, y_test = self.generate_test_data()
            
                test_preds = self.net.forward(X_test)
                loss = self.net.loss.forward(test_preds, y_test)
                print(f"Validation loss after {e+1} epochs is {loss:.3f}")

    def _train_test_split_text(self, pct=0.8) -> Tuple[str]:

        n = len(self.data)
        return self.data[:int(n * pct)], self.data[int(n * pct):]

    def generate_batches_next_char(self,
                                   batch_size: int) -> Tuple[Tensor]:
        N = len(self.train_data)
        # add batch size
        for ii in range(0, N, batch_size):

            features_tensors = []
            target_tensors = []

            for char in range(batch_size):

                features_str, target_str =\
                 self.train_data[ii+char:ii+char+self.max_len],\
                 self.train_data[ii+char+1:ii+char+self.max_len+1]

                features_array, target_array =\
                    self._string_to_one_hot_array(features_str),\
                    self._string_to_one_hot_array(target_str)

                features_tensors.append(features_array)
                target_tensors.append(target_array)

            yield torch.stack(features_tensors), torch.stack(target_tensors)

    def _string_to_one_hot_array(self, input_string: str) -> Tuple[Tensor]:

        ind = [self.char_to_idx[ch] for ch in input_string]

        array = self._one_hot_text_data(ind)

        return array

    def _one_hot_text_data(self,
                           sequence: List):

        sequence_length = len(sequence)
        batch = torch.zeros(sequence_length, self.vocab_size)
        for i in range(sequence_length):
            batch[i, sequence[i]] = 1.0

        return Tensor(batch)

    def generate_test_data(self) -> Tuple[Tensor]:

        features_str, target_str = self.test_data[:-1], self.test_data[1:]

        X_tensors = []
        y_tensors = []

        N = len(self.test_data)

        for start in range(0, N, self.max_len):

            features_str, target_str =\
             self.test_data[start:start+self.max_len],\
             self.test_data[start+1:start+self.max_len+1]

            features_array, target_array =\
                self._string_to_one_hot_array(features_str),\
                self._string_to_one_hot_array(target_str)

            X_tensors.append(features_array)
            y_tensors.append(target_array)

        return torch.stack(X_tensors), torch.stack(y_tensors)


In [65]:
data = open('data/input.txt', 'r').read()
vocab_size = len(set(data))
model = NextCharacterModel(vocab_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                             weight_decay=1e-5)

In [66]:
lstm_trainer = LSTMTrainer(model, optimizer, criterion)

In [67]:
lstm_trainer.fit(data)

0
tensor(0.0187, grad_fn=<MseLossBackward>)
1
tensor(0.0155, grad_fn=<MseLossBackward>)
2
tensor(0.0160, grad_fn=<MseLossBackward>)
3
tensor(0.0158, grad_fn=<MseLossBackward>)
4
tensor(0.0154, grad_fn=<MseLossBackward>)
5
tensor(0.0150, grad_fn=<MseLossBackward>)
6
tensor(0.0151, grad_fn=<MseLossBackward>)
7
tensor(0.0157, grad_fn=<MseLossBackward>)
8
tensor(0.0148, grad_fn=<MseLossBackward>)
9
tensor(0.0152, grad_fn=<MseLossBackward>)
10
tensor(0.0160, grad_fn=<MseLossBackward>)
11
tensor(0.0148, grad_fn=<MseLossBackward>)
12
tensor(0.0154, grad_fn=<MseLossBackward>)
13
tensor(0.0149, grad_fn=<MseLossBackward>)
14
tensor(0.0144, grad_fn=<MseLossBackward>)
15
tensor(0.0143, grad_fn=<MseLossBackward>)
16
tensor(0.0144, grad_fn=<MseLossBackward>)
17
tensor(0.0148, grad_fn=<MseLossBackward>)
18
tensor(0.0144, grad_fn=<MseLossBackward>)
19
tensor(0.0142, grad_fn=<MseLossBackward>)
20
tensor(0.0140, grad_fn=<MseLossBackward>)
21
tensor(0.0146, grad_fn=<MseLossBackward>)
22
tensor(0.0144, gr

181
tensor(0.0145, grad_fn=<MseLossBackward>)
182
tensor(0.0142, grad_fn=<MseLossBackward>)
183
tensor(0.0143, grad_fn=<MseLossBackward>)
184
tensor(0.0147, grad_fn=<MseLossBackward>)
185
tensor(0.0137, grad_fn=<MseLossBackward>)
186
tensor(0.0142, grad_fn=<MseLossBackward>)
187
tensor(0.0134, grad_fn=<MseLossBackward>)
188
tensor(0.0137, grad_fn=<MseLossBackward>)
189
tensor(0.0140, grad_fn=<MseLossBackward>)
190
tensor(0.0144, grad_fn=<MseLossBackward>)
191
tensor(0.0134, grad_fn=<MseLossBackward>)
192
tensor(0.0137, grad_fn=<MseLossBackward>)
193
tensor(0.0144, grad_fn=<MseLossBackward>)
194
tensor(0.0143, grad_fn=<MseLossBackward>)
195
tensor(0.0143, grad_fn=<MseLossBackward>)
196
tensor(0.0135, grad_fn=<MseLossBackward>)
197
tensor(0.0136, grad_fn=<MseLossBackward>)
198
tensor(0.0150, grad_fn=<MseLossBackward>)
199
tensor(0.0145, grad_fn=<MseLossBackward>)
200
tensor(0.0162, grad_fn=<MseLossBackward>)
201
tensor(0.0138, grad_fn=<MseLossBackward>)
202
tensor(0.0139, grad_fn=<MseLos

KeyboardInterrupt: 

TODO:

* Write code to generate next character from this.
* Write early stopping code.

## Section name: Grokking Advanced Architectures

## Transformer

Same data source as LSTM: 

* TODO: Draw computational graph
* TODO: Describe input and output data

## Neural Turing Machine