In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import requests, tarfile, os

url = "http://files.fast.ai/data/examples/human_numbers.tgz"
filename = "human_numbers.tgz"
folder = "human_numbers"

# Download
response = requests.get(url)
with open(filename, "wb") as f:
    f.write(response.content)

# Extract
with tarfile.open(filename, "r:gz") as tar:
    tar.extractall()

# Read data
train_path = os.path.join(folder, "train.txt")
valid_path = os.path.join(folder, "valid.txt")

with open(train_path) as f:
    train_lines = f.read().splitlines()

with open(valid_path) as f:
    valid_lines = f.read().splitlines()

lines = train_lines + valid_lines

In [3]:
print(len(train_lines), len(valid_lines), len(lines))

7999 1999 9998


In [4]:
text = ' . '.join([l.strip() for l in lines])
print(text[:100])

one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo


In [5]:
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [6]:
vocab = list(dict.fromkeys(tokens))
vocab

['one',
 '.',
 'two',
 'three',
 'four',
 'five',
 'six',
 'seven',
 'eight',
 'nine',
 'ten',
 'eleven',
 'twelve',
 'thirteen',
 'fourteen',
 'fifteen',
 'sixteen',
 'seventeen',
 'eighteen',
 'nineteen',
 'twenty',
 'thirty',
 'forty',
 'fifty',
 'sixty',
 'seventy',
 'eighty',
 'ninety',
 'hundred',
 'thousand']

In [7]:
word2idx = {w:i for i,w in enumerate(vocab)}
nums = [word2idx[i] for i in tokens]
nums[:10]

[0, 1, 2, 1, 3, 1, 4, 1, 5, 1]

# Prediction based on the previous 3 words

In [8]:
sequence_pairs = [(tokens[i:i+3], tokens[i+3]) for i in range(0, len(tokens)-4, 3)]
sequence_pairs[:10]

[(['one', '.', 'two'], '.'),
 (['.', 'three', '.'], 'four'),
 (['four', '.', 'five'], '.'),
 (['.', 'six', '.'], 'seven'),
 (['seven', '.', 'eight'], '.'),
 (['.', 'nine', '.'], 'ten'),
 (['ten', '.', 'eleven'], '.'),
 (['.', 'twelve', '.'], 'thirteen'),
 (['thirteen', '.', 'fourteen'], '.'),
 (['.', 'fifteen', '.'], 'sixteen')]

In [9]:
seqs = [(torch.tensor(nums[i:i+3]), torch.tensor(nums[i+3])) for i in range(0, len(nums)-4, 3)]
seqs[:10]

[(tensor([0, 1, 2]), tensor(1)),
 (tensor([1, 3, 1]), tensor(4)),
 (tensor([4, 1, 5]), tensor(1)),
 (tensor([1, 6, 1]), tensor(7)),
 (tensor([7, 1, 8]), tensor(1)),
 (tensor([1, 9, 1]), tensor(10)),
 (tensor([10,  1, 11]), tensor(1)),
 (tensor([ 1, 12,  1]), tensor(13)),
 (tensor([13,  1, 14]), tensor(1)),
 (tensor([ 1, 15,  1]), tensor(16))]

In [10]:
cut = int(len(seqs) * 0.8)
train_seqs = seqs[:cut]
valid_seqs = seqs[cut:]

# Unzipping
x_train, y_train = zip(*train_seqs)
x_valid, y_valid = zip(*valid_seqs)

# Stacking
x_train = torch.stack(x_train)
y_train = torch.stack(y_train)
x_valid = torch.stack(x_valid)
y_valid = torch.stack(y_valid)

# Wrap into dataset
train_ds = TensorDataset(x_train, y_train)
valid_ds = TensorDataset(x_valid, y_valid)

# Create DataLoaders
bs = 64
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=bs, shuffle=False)

# Unrolled RNN with no inter-sequence memory

In [11]:
class LMModel1(nn.Module) :
    def __init__(self, vocab_sz, n_hidden) :
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x) :
        h = F.relu(self.h_h(self.i_h(x[:, 0])))
        h = F.relu(self.h_h(h + self.i_h(x[:, 1])))
        h = F.relu(self.h_h(h + self.i_h(x[:, 2])))
        return self.h_o(h)

In [12]:
model = LMModel1(len(vocab), 64).to(device)
loss_func = F.cross_entropy
optimizer = optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 4

for epoch in range(n_epochs) :
    model.train()
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.size(0)
        train_count += xb.size(0)

    avg_train_loss = total_train_loss / train_count

    model.eval()
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.size(0)
            predicted = preds.argmax(dim=1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.size(0)

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/4 - Train Loss: 1.7328, Valid Loss: 1.7726, Accuracy: 0.3601
Epoch 2/4 - Train Loss: 1.4059, Valid Loss: 1.7477, Accuracy: 0.4207
Epoch 3/4 - Train Loss: 1.3545, Valid Loss: 1.8860, Accuracy: 0.3551
Epoch 4/4 - Train Loss: 1.3384, Valid Loss: 1.8298, Accuracy: 0.3601


Not particularly impressive. First, let's rewrite the same model in a rolled way.

# Rolled RNN with no inter-sequence memory

In [13]:
class LMModel2(nn.Module) :
    def __init__(self, vocab_sz, n_hidden) :
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x) :
        h = 0
        for i in range(3) :
            h = F.relu(self.h_h(h + self.i_h(x[:, i])))
        return self.h_o(h)

In [14]:
model = LMModel2(len(vocab), 64).to(device)
loss_func = F.cross_entropy
optimizer = optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 4

for epoch in range(n_epochs) :
    model.train()
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.size(0)
        train_count += xb.size(0)

    avg_train_loss = total_train_loss / train_count

    model.eval()
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.size(0)
            predicted = preds.argmax(dim=1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.size(0)

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/4 - Train Loss: 1.7565, Valid Loss: 1.7618, Accuracy: 0.4733
Epoch 2/4 - Train Loss: 1.4184, Valid Loss: 1.8412, Accuracy: 0.3630
Epoch 3/4 - Train Loss: 1.3678, Valid Loss: 1.7521, Accuracy: 0.4271
Epoch 4/4 - Train Loss: 1.3440, Valid Loss: 1.7816, Accuracy: 0.4186


The result is the same which is expected.

# Remembering the hidden state between sequences
Now, let us translate the hidden state between sequences.

In [15]:
class LMModel3(nn.Module) :
    def __init__(self, vocab_sz, n_hidden) :
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x) :
        for i in range(3) :
            self.h = F.relu(self.h_h(self.h + self.i_h(x[:, i])))
        out = self.h_o(self.h)
        self.h = self.h.detach()
        return out

    def reset(self) : self.h = 0

In [16]:
m = len(seqs)//bs
m, bs, len(seqs)

(328, 64, 21031)

In [17]:
def group_chunks(ds, bs) :
    m = len(ds) // bs
    new_ds = []
    for i in range(m) :
        for j in range(bs) :
            new_ds.append(ds[i + m*j])
    return new_ds

In [18]:
cut = int(len(seqs) * 0.8)

train_ds = group_chunks(seqs[:cut], bs)
valid_ds = group_chunks(seqs[cut:], bs)

train_dl = DataLoader(train_ds, batch_size=bs, shuffle=False, drop_last=True)
valid_dl = DataLoader(valid_ds, batch_size=bs, shuffle=False, drop_last=True)

In [19]:
model = LMModel3(len(vocab), 64).to(device)
loss_func = F.cross_entropy
optimizer = optim.Adam(model.parameters(), lr=3e-3)

n_epochs = 10

for epoch in range(n_epochs) :
    model.train()
    model.reset()  # Reset at start of training epoch
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.size(0)
        train_count += xb.size(0)

    avg_train_loss = total_train_loss / train_count

    model.eval()
    model.reset()  # Reset at start of validation
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.size(0)
            predicted = preds.argmax(dim=1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.size(0)

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/10 - Train Loss: 1.5023, Valid Loss: 1.8441, Accuracy: 0.3969
Epoch 2/10 - Train Loss: 1.1535, Valid Loss: 1.6308, Accuracy: 0.4618
Epoch 3/10 - Train Loss: 1.0777, Valid Loss: 1.6461, Accuracy: 0.4995
Epoch 4/10 - Train Loss: 1.0316, Valid Loss: 1.6676, Accuracy: 0.5476
Epoch 5/10 - Train Loss: 0.9905, Valid Loss: 1.8342, Accuracy: 0.5695
Epoch 6/10 - Train Loss: 0.9910, Valid Loss: 1.8305, Accuracy: 0.5656
Epoch 7/10 - Train Loss: 0.9670, Valid Loss: 1.7401, Accuracy: 0.5683
Epoch 8/10 - Train Loss: 0.9510, Valid Loss: 1.8490, Accuracy: 0.5531
Epoch 9/10 - Train Loss: 0.9448, Valid Loss: 1.8022, Accuracy: 0.5666
Epoch 10/10 - Train Loss: 0.9395, Valid Loss: 1.9025, Accuracy: 0.5779


A bit better. Let's increase the length of sequences and predict not just the next token but all the tokens in-between.

In [20]:
sl = 16
seqs = [(torch.tensor(nums[i:i+sl]), torch.tensor(nums[i+1:i+sl+1]))
        for i in range(0, len(nums)-sl-1, sl)]
cut = int(len(seqs) * 0.8)

train_ds = group_chunks(seqs[:cut], bs)
valid_ds = group_chunks(seqs[cut:], bs)

train_dl = DataLoader(train_ds, batch_size=bs, shuffle=False, drop_last=True)
valid_dl = DataLoader(valid_ds, batch_size=bs, shuffle=False, drop_last=True)

In [21]:
[[vocab[o] for o in s] for s in seqs[0]]

[['one',
  '.',
  'two',
  '.',
  'three',
  '.',
  'four',
  '.',
  'five',
  '.',
  'six',
  '.',
  'seven',
  '.',
  'eight',
  '.'],
 ['.',
  'two',
  '.',
  'three',
  '.',
  'four',
  '.',
  'five',
  '.',
  'six',
  '.',
  'seven',
  '.',
  'eight',
  '.',
  'nine']]

# Keeping predictions of each token, not just the last

In [22]:
class LMModel4(nn.Module) :
    def __init__(self, vocab_sz, n_hidden) :
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x) :
        outs = []
        for i in range(sl) :
            self.h = F.relu(self.h_h(self.h + self.i_h(x[:, i])))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)

    def reset(self) :
        self.h = 0

In [23]:
model = LMModel4(len(vocab), 64).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-3)

def loss_func(inp, targ) :
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

n_epochs = 15

for epoch in range(n_epochs) :
    model.train()
    model.reset()  # Reset at start of training epoch
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.numel()
        train_count += xb.numel()

    avg_train_loss = total_train_loss / train_count

    model.eval()
    model.reset()  # Reset at start of validation
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.numel()
            predicted = preds.argmax(dim=-1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.numel()

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/15 - Train Loss: 1.8517, Valid Loss: 1.8244, Accuracy: 0.4530
Epoch 2/15 - Train Loss: 1.4244, Valid Loss: 1.9519, Accuracy: 0.4695
Epoch 3/15 - Train Loss: 1.2804, Valid Loss: 1.8565, Accuracy: 0.5285
Epoch 4/15 - Train Loss: 1.1383, Valid Loss: 1.8024, Accuracy: 0.5619
Epoch 5/15 - Train Loss: 0.9835, Valid Loss: 1.9948, Accuracy: 0.6082
Epoch 6/15 - Train Loss: 0.8950, Valid Loss: 1.9774, Accuracy: 0.6244
Epoch 7/15 - Train Loss: 0.8068, Valid Loss: 2.0698, Accuracy: 0.6212
Epoch 8/15 - Train Loss: 0.7745, Valid Loss: 1.9909, Accuracy: 0.6497
Epoch 9/15 - Train Loss: 0.7305, Valid Loss: 2.1343, Accuracy: 0.6559
Epoch 10/15 - Train Loss: 0.6878, Valid Loss: 2.0868, Accuracy: 0.6768
Epoch 11/15 - Train Loss: 0.6328, Valid Loss: 2.1290, Accuracy: 0.6842
Epoch 12/15 - Train Loss: 0.6430, Valid Loss: 2.2555, Accuracy: 0.6459
Epoch 13/15 - Train Loss: 0.5980, Valid Loss: 2.1459, Accuracy: 0.6689
Epoch 14/15 - Train Loss: 0.5504, Valid Loss: 2.4222, Accuracy: 0.6951
Epoch 15/15 - T

The accuracy is around 60%. Let us stack several layers.

# Implementing multi-layer RNN

In [24]:
class LMModel5(nn.Module) :
    def __init__(self, vocab_sz, n_hidden, num_layers) :
        super().__init__()
        self.num_layers = num_layers
        self.i_h = nn.ModuleList([nn.Embedding(vocab_sz, n_hidden)])
        self.h_h = nn.ModuleList([nn.Linear(n_hidden, n_hidden)])
        for _ in range(1, num_layers) :
            self.i_h.append(nn.Linear(n_hidden, n_hidden))
            self.h_h.append(nn.Linear(n_hidden, n_hidden))
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = [torch.zeros(bs, n_hidden) for _ in range(num_layers)]

    def forward(self, x) :
        outs = []
        for i in range(sl) :
            i_t = self.i_h[0](x[:, i])
            self.h[0] = F.relu(self.h_h[0](self.h[0]) + i_t)
            for n in range(1, self.num_layers) :
                hn_in = self.i_h[n](self.h[n-1])
                self.h[n] = F.relu(self.h_h[n](self.h[n]) + hn_in)
            outs.append(self.h_o(self.h[-1]))
        self.h = [h.detach() for h in self.h]
        return torch.stack(outs, dim=1)

    def reset(self) :
        self.h = [h.detach().zero_() for h in self.h]

In [25]:
model = LMModel5(len(vocab), 64, 2).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-3)

def loss_func(inp, targ) :
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

n_epochs = 15

for epoch in range(n_epochs) :
    model.train()
    model.reset()  # Reset at start of training epoch
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.numel()
        train_count += xb.numel()

    avg_train_loss = total_train_loss / train_count

    model.eval()
    model.reset()  # Reset at start of validation
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.numel()
            predicted = preds.argmax(dim=-1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.numel()

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/15 - Train Loss: 1.8761, Valid Loss: 1.7912, Accuracy: 0.4659
Epoch 2/15 - Train Loss: 1.3987, Valid Loss: 1.7455, Accuracy: 0.4932
Epoch 3/15 - Train Loss: 1.1992, Valid Loss: 1.7987, Accuracy: 0.5452
Epoch 4/15 - Train Loss: 0.9913, Valid Loss: 1.8842, Accuracy: 0.5468
Epoch 5/15 - Train Loss: 0.8194, Valid Loss: 1.8764, Accuracy: 0.6131
Epoch 6/15 - Train Loss: 0.7007, Valid Loss: 2.1711, Accuracy: 0.6082
Epoch 7/15 - Train Loss: 0.5846, Valid Loss: 2.1935, Accuracy: 0.6261
Epoch 8/15 - Train Loss: 0.5048, Valid Loss: 2.2806, Accuracy: 0.6772
Epoch 9/15 - Train Loss: 0.4831, Valid Loss: 2.0081, Accuracy: 0.6990
Epoch 10/15 - Train Loss: 0.3640, Valid Loss: 2.3589, Accuracy: 0.7424
Epoch 11/15 - Train Loss: 0.3089, Valid Loss: 2.4489, Accuracy: 0.7367
Epoch 12/15 - Train Loss: 0.3052, Valid Loss: 2.3807, Accuracy: 0.7336
Epoch 13/15 - Train Loss: 0.2520, Valid Loss: 2.2906, Accuracy: 0.7096
Epoch 14/15 - Train Loss: 0.2530, Valid Loss: 2.4756, Accuracy: 0.7558
Epoch 15/15 - T

The accuracy is higher but the model needs more regularization.

# Implementing multi-layer LSTM

In [26]:
class LSTMCell(nn.Module) :
    def __init__(self, ni, nh) :
        super().__init__()
        
        # Combining 4 matrix multiplications into 2
        self.ih = nn.Linear(ni, 4*nh)
        self.hh = nn.Linear(nh, 4*nh)

    def forward(self, x, state) :
        h, c = state
        # One big multiplication for all the gates
        gates = (self.ih(x) + self.hh(h)).chunk(4, 1)
        ingate, forgetgate, outgate = map(torch.sigmoid, gates[:3])
        cellgate = torch.tanh(gates[3])

        c = (forgetgate*c) + (ingate*cellgate)
        h = outgate * c.tanh()
        return h, (h, c)

In [27]:
class LMModel6(nn.Module) :
    def __init__(self, vocab_sz, n_hidden, n_lay) :
        super().__init__()
        self.ih = nn.Embedding(vocab_sz, n_hidden)
        self.cells = nn.ModuleList([
            LSTMCell(n_hidden, n_hidden) for _ in range(n_lay)
        ])
        self.out = nn.Linear(n_hidden, vocab_sz)
        self.state = [
            (torch.zeros(bs, n_hidden),
             torch.zeros(bs, n_hidden))
            for _ in range(n_lay)
        ]
        
    def forward(self, x) :
        embedded = self.ih(x)  # [bs, sl, n_hidden]
        outputs = []

        for t in range(sl) :
            inp = embedded[:, t]

            for i, cell in enumerate(self.cells) :
                h, c = self.state[i]
                h, (h, c) = cell(inp, (h, c))
                self.state[i] = (h, c)
                inp = h

            outputs.append(self.out(inp))

        self.state = [(h.detach(), c.detach()) for h, c in self.state]

        return torch.stack(outputs, dim=1)  # [bs, sl, vocab_sz]

    def reset(self) :
        self.state = [
            (h.detach().zero_(), c.detach().zero_())
            for (h, c) in self.state
        ]

In [28]:
model = LMModel6(len(vocab), 64, 2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-2)

def loss_func(inp, targ) :
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

n_epochs = 15

for epoch in range(n_epochs) :
    model.train()
    model.reset()  # Reset at start of training epoch
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.numel()
        train_count += xb.numel()

    avg_train_loss = total_train_loss / train_count

    model.eval()
    model.reset()  # Reset at start of validation
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.numel()
            predicted = preds.argmax(dim=-1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.numel()

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/15 - Train Loss: 1.8802, Valid Loss: 1.7954, Accuracy: 0.3247
Epoch 2/15 - Train Loss: 1.3469, Valid Loss: 1.8920, Accuracy: 0.3837
Epoch 3/15 - Train Loss: 1.1755, Valid Loss: 2.0014, Accuracy: 0.4937
Epoch 4/15 - Train Loss: 1.0609, Valid Loss: 1.9211, Accuracy: 0.5290
Epoch 5/15 - Train Loss: 0.9369, Valid Loss: 1.9311, Accuracy: 0.5703
Epoch 6/15 - Train Loss: 0.7917, Valid Loss: 1.8632, Accuracy: 0.6178
Epoch 7/15 - Train Loss: 0.6542, Valid Loss: 1.7285, Accuracy: 0.6795
Epoch 8/15 - Train Loss: 0.4961, Valid Loss: 1.6021, Accuracy: 0.6965
Epoch 9/15 - Train Loss: 0.3598, Valid Loss: 1.7341, Accuracy: 0.7048
Epoch 10/15 - Train Loss: 0.2724, Valid Loss: 1.5876, Accuracy: 0.7143
Epoch 11/15 - Train Loss: 0.2034, Valid Loss: 1.5857, Accuracy: 0.7818
Epoch 12/15 - Train Loss: 0.1524, Valid Loss: 1.4648, Accuracy: 0.7882
Epoch 13/15 - Train Loss: 0.1002, Valid Loss: 1.4923, Accuracy: 0.7961
Epoch 14/15 - Train Loss: 0.0760, Valid Loss: 1.5252, Accuracy: 0.8009
Epoch 15/15 - T

Now the accuracy is over 80%, but we still need to regularize the model.

# Adding regularization techniques:
* Dropout
* Activation Regularization (AR)
* Temporal Activation Regularization (TAR)

In [29]:
class MyDropout(nn.Module) :
    def __init__(self, p) :
        super().__init__()
        self.p = p
        
    def forward(self, x) :
        if not self.training: return x
        mask = x.new(*x.shape).bernoulli_(1-self.p)
        return x * mask.div_(1-self.p)

In [30]:
class MyRNNRegularizer :
    def __init__(self, alpha=2., beta=1.) :
        self.alpha, self.beta = alpha, beta

    def __call__(self, pred) :
        # Skip regularization if pred is not a 3-tuple
        if not (isinstance(pred, (tuple, list)) and len(pred) == 3) :
            return 0.0

        logits, raw, out = pred
        ar_loss = self.alpha * out.pow(2).mean()
        tar_loss = self.beta * (raw[:, 1:] - raw[:, :-1]).pow(2).mean()
        
        return ar_loss + tar_loss

In [31]:
class LMModel7(nn.Module) :
    def __init__(self, vocab_sz, n_hidden, n_lay, p) :
        super().__init__()
        self.ih = nn.Embedding(vocab_sz, n_hidden)
        self.cells = nn.ModuleList([
            LSTMCell(n_hidden, n_hidden) for _ in range(n_lay)
        ])
        self.drop = MyDropout(p)
        self.out = nn.Linear(n_hidden, vocab_sz)
        self.out.weight = self.ih.weight
        self.state = [
            (torch.zeros(bs, n_hidden),
             torch.zeros(bs, n_hidden))
            for _ in range(n_lay)
        ]
        
    def forward(self, x) :
        embedded = self.ih(x)  # [bs, sl, n_hidden]
        raw = []

        for t in range(sl) :
            inp = embedded[:, t]

            for i, cell in enumerate(self.cells) :
                h, c = self.state[i]
                h, (h, c) = cell(inp, (h, c))
                self.state[i] = (h, c)
                inp = h

            raw.append(inp)

        raw = torch.stack(raw, dim=1)
        out = self.drop(raw)
        logits = self.out(out)
        
        self.state = [(h.detach(), c.detach()) for h, c in self.state] 

        if self.training :
            return logits, raw, out
        else :
            return logits

    def reset(self) :
        self.state = [
            (h.detach().zero_(), c.detach().zero_())
            for (h, c) in self.state
        ]

In [32]:
# learn = Learner(dls, LMModel7(len(vocab), 64, 3, 0.5),
#                 loss_func=loss_func2, metrics=accuracy,
#                 cbs=[ModelResetter, MyRNNRegularizer()])
# learn.fit_one_cycle(15, 1e-2, wd=0.1)

model = LMModel7(len(vocab), 64, 3, 0.5).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
regularizer = MyRNNRegularizer(alpha=2., beta=1.)

def loss_func(inp, targ) :
    if isinstance(inp, tuple) :
        logits = inp[0]
    else :
        logits = inp
    return F.cross_entropy(logits.view(-1, len(vocab)), targ.view(-1))

n_epochs = 15

for epoch in range(n_epochs) :
    model.train()
    model.reset()  # Reset at start of training epoch
    total_train_loss, train_count = 0.0, 0
    for xb, yb in train_dl :
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_func(preds, yb) + regularizer(preds)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * xb.numel()
        train_count += xb.numel()

    avg_train_loss = total_train_loss / train_count

    model.eval()
    model.reset()  # Reset at start of validation
    total_valid_loss, valid_count, correct = 0.0, 0, 0
    with torch.no_grad() :
        for xb, yb in valid_dl :
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = loss_func(preds, yb)
            total_valid_loss += loss.item() * xb.numel()
            predicted = preds.argmax(dim=-1)
            correct += (predicted == yb).sum().item()
            valid_count += yb.numel()

    avg_valid_loss = total_valid_loss / valid_count
    acc = correct / valid_count
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/15 - Train Loss: 1.9306, Valid Loss: 1.6843, Accuracy: 0.5283
Epoch 2/15 - Train Loss: 1.0089, Valid Loss: 1.0166, Accuracy: 0.6985
Epoch 3/15 - Train Loss: 0.5120, Valid Loss: 0.6242, Accuracy: 0.8263
Epoch 4/15 - Train Loss: 0.2701, Valid Loss: 0.5663, Accuracy: 0.8388
Epoch 5/15 - Train Loss: 0.1885, Valid Loss: 0.5122, Accuracy: 0.8545
Epoch 6/15 - Train Loss: 0.1476, Valid Loss: 0.5333, Accuracy: 0.8486
Epoch 7/15 - Train Loss: 0.1254, Valid Loss: 0.4981, Accuracy: 0.8538
Epoch 8/15 - Train Loss: 0.1122, Valid Loss: 0.5444, Accuracy: 0.8421
Epoch 9/15 - Train Loss: 0.1024, Valid Loss: 0.5230, Accuracy: 0.8494
Epoch 10/15 - Train Loss: 0.0929, Valid Loss: 0.5928, Accuracy: 0.8445
Epoch 11/15 - Train Loss: 0.0869, Valid Loss: 0.5411, Accuracy: 0.8425
Epoch 12/15 - Train Loss: 0.0842, Valid Loss: 0.5585, Accuracy: 0.8490
Epoch 13/15 - Train Loss: 0.0803, Valid Loss: 0.5226, Accuracy: 0.8464
Epoch 14/15 - Train Loss: 0.0796, Valid Loss: 0.6197, Accuracy: 0.8416
Epoch 15/15 - T

The accuracy is around 86% and the model is much more regularized.