# A language model using a WaveNet

In this notebook I try to follow the WaveNet-style approach to character-level language modelling that [Andrej Karpathy](https://karpathy.ai/zero-to-hero.html) describes in his [makemore](https://github.com/karpathy/makemore) repository.
That approach is based on the paper ["WaveNet: A Generative Model for Raw Audio"](https://arxiv.org/abs/1609.03499) by van den Oord et al. (2016).

In [None]:
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
import struct
import nn
%matplotlib inline

In [None]:
#
# Hyperparameters shared by the dataset builder, the model definition and the
# sampler further down in the notebook.
#
block_size = 8          # context length: number of preceding characters fed to the model per prediction
feature_size = 5        # dimensionality of each character embedding vector
hidden_size = 100       # number of units in the hidden (tanh) layer

# Seed torch's global RNG so runs are repeatable; the trailing ';' hides the
# returned Generator repr from the cell output.
torch.manual_seed(42);

In [None]:
# Read the training names (one per line) and build the character <-> integer
# lookup tables used throughout the notebook.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
with open("../../../TrainingData/names.txt", encoding="utf-8") as names_file:
    words = names_file.read().splitlines()

# Every distinct character in the corpus, in sorted order, gets index 1..N.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}

# Index 0 is reserved for '.', which marks both the padding before a name and
# its end. If the data already contained '.', the mapping would end up with a
# gap and `vocabulary_size` would be too small, so fail loudly here.
assert '.' not in stoi, "'.' is reserved as the start/end token but occurs in the data"
stoi['.'] = 0

# Inverse mapping (index -> character) and the total number of symbols.
itos = {i: s for s, i in stoi.items()}
vocabulary_size = len(itos)

In [None]:
def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.' and scanned left to right. For every
    character, the `block_size` character indices preceding it (padded with
    index 0 at the start of the word) form one input row, and the character's
    own index is the matching target.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        A pair `(X, Y)` of tensors: `X` has one row of `block_size` indices
        per example, `Y` holds the corresponding target indices.

    Raises:
        KeyError: if a word contains a character missing from `stoi`.
    """
    inputs, targets = [], []
    for word in words:
        # Start every word with an all-padding window.
        window = [0] * block_size
        for index in (stoi[symbol] for symbol in word + '.'):
            inputs.append(window)
            targets.append(index)
            # Slide the window: drop the oldest index, append the newest.
            # A new list is created, so rows already stored are unaffected.
            window = window[1:] + [index]
    return torch.tensor(inputs), torch.tensor(targets)

# Shuffle the words reproducibly (in place), then cut them into
# 80% training / 10% validation / 10% test portions.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])      # training split
Xdev, Ydev = build_dataset(words[n1:n2])  # validation split
Xte, Yte = build_dataset(words[n2:])      # test split

# All input rows of the three splits stacked along the first dimension.
X = torch.cat([Xtr, Xdev, Xte])

In [None]:
# Build the model from the project-local `nn` module (not torch.nn):
# embedding -> flatten -> linear -> batch norm -> tanh -> linear.
# The first linear layer takes feature_size * block_size inputs, i.e. the
# embeddings of all context characters side by side, and the last layer
# produces one output per vocabulary entry.
# NOTE(review): bias=False on the first linear layer is presumably because the
# following BatchNorm1d makes a bias redundant — confirm against the nn module.
network = nn.Sequential([
    nn.Embedding(vocabulary_size, feature_size),
    nn.Flatten(),
    nn.Linear(feature_size * block_size, hidden_size, bias=False),
    nn.BatchNorm1d(hidden_size),
    nn.Tanh(),
    nn.Linear(hidden_size, vocabulary_size)]
)

# Parameter initialisation tweaks.
with torch.no_grad():
    network.layers[-1].weight *= 0.1 # shrink the output layer's weights so the initial predictions are less confident
# Enable gradient tracking on every parameter so loss.backward() can populate gradients.
for p in network.parameters():
    p.requires_grad = True

# Cell output: total number of scalar parameters in the model.
sum(p.nelement() for p in network.parameters())

In [None]:
# Train the network with minibatch gradient descent.
batch_size = 64
lossi = []  # per-step training loss, stored as plain Python floats
network.training = True
for i in range(10000):
    # Draw a random minibatch of row indices (sampled with replacement).
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # Forward pass: logits for the batch and cross-entropy against the targets.
    logits = network(Xtr[ix])
    loss = F.cross_entropy(logits, Ytr[ix])

    # Record a detached float rather than the tensor itself, so the history
    # does not keep an autograd handle alive for every step.
    lossi.append(loss.item())

    # Backward pass: clear old gradients, then back-propagate.
    network.zero_grad()
    loss.backward()

    # Parameter update. The rate drops by 10x every 100000 steps.
    # NOTE(review): with only 10000 iterations, i // 100000 is always 0, so
    # the rate stays at 0.1 for the whole run — confirm this is intended.
    learning_rate = 0.1 / (10 ** (i // 100000))
    network.update(learning_rate)

network.training = False
# Cell output: loss of the final minibatch.
loss

In [None]:
# Smooth the noisy per-step loss by averaging consecutive groups of 100 steps.
# float() accepts both Python numbers and 0-d tensors, so this works however
# the entries of `lossi` were recorded.
loss_history = torch.tensor([float(step_loss) for step_loss in lossi])
plt.plot(loss_history.view(-1, 100).mean(1))
plt.xlabel("training step (x100)")
plt.ylabel("mean cross-entropy loss")
plt.title("Training loss, averaged over 100-step windows");  # ';' hides the repr


In [None]:
@torch.no_grad()
def print_loss(split):
    """Print the cross-entropy loss of `network` over one whole data split.

    Args:
        split: one of 'train', 'val' or 'test'.

    Raises:
        KeyError: if `split` is not one of the three known names.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val' : (Xdev, Ydev),
        'test' : (Xte, Yte),
    }
    inputs, targets = datasets[split]
    # The entire split goes through the network in a single forward pass.
    split_loss = F.cross_entropy(network(inputs), targets)
    print(split, split_loss.item())

# Report the loss on each split with the network's training flag cleared.
network.training = False
for split_name in ('train', 'val', 'test'):
    print_loss(split_name)


In [None]:
@torch.no_grad()
def print_samples(count):
    """Sample `count` names from `network` and print each on its own line.

    Every sample starts from an all-zero context window. The network's
    output is turned into a probability distribution, one index is drawn
    from it, and the window is shifted by that index. Sampling stops once
    index 0 is drawn; the printed string includes the character for that
    final index.

    Args:
        count: number of samples to generate.
    """
    for _ in range(count):
        sampled = []
        window = [0] * block_size
        ix = None
        # Keep drawing characters until the terminator (index 0) comes up.
        while ix != 0:
            logits = network(torch.tensor([window]))
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1).item()
            window = window[1:] + [ix]
            sampled.append(ix)

        print(''.join(itos[index] for index in sampled))

# Clear the training flag before sampling.
# NOTE(review): presumably this makes the BatchNorm1d layer use its running
# statistics, which matters for the batch-of-one inputs used when sampling —
# confirm against the project-local nn module.
network.training = False
# Generate and print 20 sampled names.
print_samples(20)