# A language model using a WaveNet

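In this notebook I try to follow the WaveNet approach to language modelling that [Andrej Karpathy](https://karpathy.ai/zero-to-hero.html) describes in his [makemore](https://github.com/karpathy/makemore) repository.
That approach is based on the paper ["WaveNet: A Generative Model for Raw Audio"](https://arxiv.org/abs/1609.03499) by van den Oord et al. (2016).
Note that the model in the cells below is a character-level network with a single batch-normalized hidden layer over a flattened context of `block_size` characters; it does not contain a hierarchical, WaveNet-style layer stack.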

In [43]:
import torch
import torch.nn
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
import struct
%matplotlib inline

In [3]:
# Dimensionality of the feature vector stored for each character in the
# embedding matrix C. This is typically far smaller than the vocabulary
# size.
feature_size = 15

# Context length: block_size consecutive elements form one input sample,
# and the element that follows them is the prediction target.
block_size = 16

# Width of the hidden layer. Change this when experimenting.
hidden_size = 350

# Seeded generator so that results are reproducible while debugging.
generator = torch.Generator()
generator.manual_seed(2147483647)

In [4]:
# Read the names from file (one per line) and create the encoding and
# decoding dictionaries that map characters to integers and back.
# The context manager closes the file handle as soon as the text is read
# instead of leaving it open until garbage collection.
with open("../../../TrainingData/names.txt") as names_file:
    words = names_file.read().splitlines()

# Each distinct character gets an id starting at 1; id 0 is reserved for '.'.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocabulary_size = len(itos)

# Length of the longest name (displayed as the cell output).
max(len(s) for s in words)

15

In [5]:
def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.' and preceded by ``block_size`` padding
    ids (0). Every position of the terminated word yields one sample: the
    ``block_size`` ids in front of it as the input and its own id as the
    target.

    Reads the notebook globals ``block_size`` and ``stoi``.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        A tuple ``(X, Y)`` of tensors built with ``torch.tensor`` from the
        collected contexts and the collected target ids, respectively.
    """
    contexts, targets = [], []
    for word in words:
        # Padding followed by the encoded word and its terminator.
        encoded = [0] * block_size + [stoi[ch] for ch in word + '.']
        # Slide a block_size-wide window over the encoding; the id just
        # past the window is the character to predict.
        for start in range(len(word) + 1):
            contexts.append(encoded[start:start + block_size])
            targets.append(encoded[start + block_size])
    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    return X, Y

# Shuffle the names with a fixed seed, then split them 80% / 10% / 10%
# into training, dev and test portions.
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]

Xtr, Ytr = build_dataset(train_words)
Xdev, Ydev = build_dataset(dev_words)
Xte, Yte = build_dataset(test_words)

# All inputs stacked along dim 0 in train, dev, test order.
X = torch.cat([Xtr, Xdev, Xte], dim=0)

In [6]:
# Embedding matrix: assigns a "feature_size"-dimensional feature vector
# to each of the vocabulary_size characters.
C = torch.randn((vocabulary_size, feature_size), generator=generator)

# Hidden layer weights, scaled by (5/3) / sqrt(fan_in).
# No bias is needed here because batch normalization follows.
fan_in = feature_size * block_size
W1 = torch.randn((fan_in, hidden_size), generator=generator) * (5/3) / fan_in**0.5

# Output layer weights (scaled by 0.01) and biases (all zero).
W2 = torch.randn((hidden_size, vocabulary_size), generator=generator) * 0.01
b2 = torch.zeros(vocabulary_size)

# Batch normalization: gain and bias are trained; mean and std start at
# 0 and 1 and are not part of the trained parameter list.
bngain = torch.ones((1, hidden_size))
bnbias = torch.zeros((1, hidden_size))
bnmean = torch.zeros((1, hidden_size))
bnstd = torch.ones((1, hidden_size))

# Collect the trainable tensors so they can be updated in one loop.
parameters = [C, W1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

# Total number of trainable scalars (displayed as the cell output).
sum(p.nelement() for p in parameters)

63051

In [7]:
batch_size = 128
max_steps = 500000

for i in range(max_steps):
    # Create the minibatch: batch_size random row indices into the training
    # set. The seeded generator is passed explicitly so that a fresh run
    # draws the same batches every time.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=generator)

    # Embed the inputs by indexing into the C matrix. This turns the
    # (batch_size, block_size) tensor of ids into a
    # (batch_size, block_size, feature_size) tensor of feature vectors.
    embedded = C[Xtr[ix]]

    # The hidden layer expects one row per sample, so the block_size
    # feature vectors of each sample are concatenated into a single row of
    # block_size*feature_size values. The name avoids shadowing the
    # builtin input().
    flat = embedded.view(-1, block_size*feature_size)

    # Forward pass; batch normalization uses the statistics of this batch.
    hidden_preact = flat @ W1
    bnmeani = hidden_preact.mean(0, keepdim=True)
    bnstdi = hidden_preact.std(0, keepdim=True)
    hidden_preact = bngain * (hidden_preact - bnmeani) / bnstdi + bnbias
    hidden_output = torch.tanh(hidden_preact)
    logits = hidden_output @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # Track running statistics (exponential moving average) for use at
    # evaluation and sampling time; they must not be part of the graph.
    with torch.no_grad():
        bnmean = 0.999 * bnmean + 0.001 * bnmeani
        bnstd = 0.999 * bnstd + 0.001 * bnstdi

    # Backward pass: clear the old gradients, then backpropagate.
    for p in parameters:
        p.grad = None
    loss.backward()

    # Step decay: 0.1 for the first 100000 steps, divided by 10 after
    # every further 100000 steps.
    learning_rate = 0.1 / (10 ** (i // 100000))

    # Gradient descent update, done in place under no_grad instead of
    # through the .data attribute.
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

In [16]:
@torch.no_grad()
def print_loss(split):
    """Print the cross-entropy loss of the model on one data split.

    The forward pass normalizes the hidden pre-activations with the
    running statistics ``bnmean`` and ``bnstd``. Reads the dataset
    tensors and the model tensors from the notebook globals.

    Args:
        split: 'train', 'val' or 'test'. Any other value raises KeyError
            from the lookup below.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val' : (Xdev, Ydev),
        'test' : (Xte, Yte)
    }
    inputs, targets = datasets[split]
    # One row of concatenated feature vectors per sample.
    flat = C[inputs].view(-1, block_size*feature_size)
    normalized = bngain * (flat @ W1 - bnmean) / bnstd + bnbias
    logits = torch.tanh(normalized) @ W2 + b2
    print(split, F.cross_entropy(logits, targets).item())

for split_name in ('train', 'val', 'test'):
    print_loss(split_name)


train 1.768717646598816
val 2.0449447631835938
test 2.0438926219940186


In [None]:
@torch.no_grad()
def print_samples(count):
    """Sample ``count`` sequences from the model and print each one.

    Every sequence starts from a context of ``block_size`` zeros. The
    next id is drawn from the softmax of the logits using the shared
    ``generator``, appended to the output and shifted into the context.
    Sampling of a sequence stops once id 0 has been drawn; that id is
    part of the printed output. Reads the model tensors and ``itos``
    from the notebook globals.

    Args:
        count: number of sequences to print.
    """
    for _ in range(count):
        sampled = []
        window = [0] * block_size
        next_ix = None
        # Runs at least once and ends right after a 0 has been recorded.
        while next_ix != 0:
            flat = C[torch.tensor([window])].view(1, -1)
            normalized = bngain * (flat @ W1 - bnmean) / bnstd + bnbias
            logits = torch.tanh(normalized) @ W2 + b2
            probs = F.softmax(logits, dim=1)
            next_ix = torch.multinomial(probs, num_samples=1, generator=generator).item()
            window = window[1:] + [next_ix]
            sampled.append(next_ix)

        print(''.join(itos[i] for i in sampled))

print_samples(20)

In [22]:
# Serialize the model to 'names.dat' as a flat binary file:
#   three 4-byte ints  : feature_size, block_size, hidden_size
#   4-byte floats      : C, W1, bngain, bnbias, bnmean, bnstd, W2, b2
# Each tensor is flattened to one dimension before it is written.
# The '<' prefix fixes the layout to little-endian with standard sizes,
# so the file does not depend on the byte order or C type sizes of the
# machine running the notebook (native-mode 'i'/'f' would).

def _pack_int(value):
    """Encode one integer as a 4-byte little-endian signed int."""
    return struct.pack('<i', value)


def _pack_floats(tensor):
    """Encode every element of a tensor as a 4-byte little-endian float."""
    values = tensor.detach().reshape(-1).tolist()
    return struct.pack(f'<{len(values)}f', *values)


# Build the complete payload before touching the file, so a packing error
# cannot leave a truncated 'names.dat' behind.
payload = b''.join(
    [_pack_int(size) for size in (feature_size, block_size, hidden_size)]
    + [_pack_floats(t) for t in (C, W1, bngain, bnbias, bnmean, bnstd, W2, b2)]
)

with open('names.dat', 'wb') as f:
    f.write(payload)

In [42]:
# Display the current values of the output-layer bias vector b2.
b2

tensor([ 0.6691,  1.4858, -0.4469, -0.2040, -0.0691,  1.2503, -1.0167, -0.4872,
         0.3681,  0.7203, -0.6931,  0.0090,  0.4676,  0.0786,  0.9308,  0.6023,
        -0.9886, -1.4780,  0.8872,  0.6303,  0.3457, -0.4251, -0.2993, -0.7164,
        -1.3817,  0.2789, -0.5161], requires_grad=True)