# A language model using a WaveNet

In this notebook I try to follow the WaveNet approach to language modelling that [Andrej Karpathy](https://karpathy.ai/zero-to-hero.html) describes in his [makemore](https://github.com/karpathy/makemore) repository.
That approach is based on the paper ["WaveNet: A Generative Model for Raw Audio"](https://arxiv.org/abs/1609.03499) by van den Oord et al. (2016).

In [1]:
import torch
import torch.nn
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
import struct
%matplotlib inline

In [20]:
# ---- hyperparameters ----
block_size = 10      # context length: how many preceding characters the model sees
feature_size = 10    # number of components in each character's embedding vector
hidden_size = 250    # number of units in the hidden tanh layer

# seed torch's global random number generator so runs are repeatable;
# the trailing semicolon keeps the return value out of the cell output
torch.manual_seed(42);

In [4]:
# Read the names (one per line) from file and create the encoding and
# decoding dictionaries that map characters to integers and back.
# The file is opened in a `with` block so the handle is closed as soon as
# the contents have been read instead of being left open.
with open("../../../TrainingData/names.txt") as names_file:
    words = names_file.read().splitlines()
chars = sorted(set(''.join(words)))           # every distinct character, in sorted order
stoi = {s:i+1 for i,s in enumerate(chars)}    # characters are numbered from 1 upwards
stoi['.'] = 0                                 # index 0 is reserved for the '.' token
itos = {i:s for s,i in stoi.items()}          # inverse mapping: integer -> character
vocabulary_size = len(itos)

In [21]:
def build_dataset(words):
    """Turn a list of words into (context, next character) training examples.

    Each word is terminated with '.', and one example is produced per
    character of the terminated word: the input is the ``block_size``
    character ids that precede it (left-padded with 0) and the target is the
    id of the character itself.

    Args:
        words: iterable of strings; every character must be a key of ``stoi``.

    Returns:
        (X, Y): ``X`` is an int64 tensor of shape (num_examples, block_size),
        ``Y`` an int64 tensor of shape (num_examples,).

    Raises:
        KeyError: if a word contains a character that is not in ``stoi``.
    """
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # slide the window; this builds a new list, so the row just
            # stored in X is not modified
            context = context[1:] + [ix]
    # Explicit dtype and shape: without them an empty word list would give a
    # float32 tensor of shape (0,), which cannot be used to index the
    # embedding matrix and does not have the (rows, block_size) layout.
    X = torch.tensor(X, dtype=torch.long).view(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# Shuffle the words with a fixed seed, then cut the list at 80% and 90%
# to obtain the training, dev and test portions.
random.seed(42)
random.shuffle(words)   # reorders the list in place
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])        # first 80%: training
Xdev, Ydev = build_dataset(words[n1:n2])    # next 10%: dev / validation
Xte, Yte = build_dataset(words[n2:])        # last 10%: test
X = torch.cat([Xtr, Xdev, Xte], dim=0)      # all inputs stacked row-wise in split order

In [8]:
#
# Layer classes
#
class Linear:
    """Affine layer: ``x @ weight``, plus ``bias`` when one is configured.

    The weight is drawn from a standard normal and divided by sqrt(fan_in);
    the bias (if any) starts at zero. The most recent output is kept in
    ``self.out``.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out)) / scale
        self.bias = None
        if bias:
            self.bias = torch.zeros(fan_out)

    def __call__(self, x):
        result = x @ self.weight
        if self.bias is not None:
            result += self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the tensors of this layer: the weight, then the bias if present."""
        tensors = [self.weight]
        if self.bias is not None:
            tensors.append(self.bias)
        return tensors
    
class BatchNorm1d:
    """Batch normalisation along dimension 0 with a learnable gain and shift.

    While ``training`` is True the statistics of the current batch are used
    and folded into the running ``mean`` / ``var`` with weight ``momentum``;
    otherwise the stored running statistics are used. The most recent output
    is kept in ``self.out``.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps, self.momentum = eps, momentum
        self.training = True
        # gain and shift applied after normalisation
        self.gamma, self.beta = torch.ones(dim), torch.zeros(dim)
        # running statistics, used when not training
        self.mean, self.var = torch.zeros(dim), torch.ones(dim)

    def __call__(self, x):
        if not self.training:
            batch_mean, batch_var = self.mean, self.var
        else:
            batch_mean = x.mean(0, keepdim=True)
            batch_var = x.var(0, keepdim=True)
            # blend the batch statistics into the running estimates
            # without recording the update for autograd
            with torch.no_grad():
                keep = 1 - self.momentum
                self.mean = keep * self.mean + self.momentum * batch_mean
                self.var = keep * self.var + self.momentum * batch_var

        normalised = (x - batch_mean)/torch.sqrt(batch_var + self.eps)
        self.out = self.gamma * normalised + self.beta
        return self.out

    def parameters(self):
        """Return the gain and shift tensors, in that order."""
        return [self.gamma, self.beta]
    
class Tanh:
    """Element-wise tanh activation; keeps its latest output in ``self.out``."""

    def __call__(self, x):
        activated = x.tanh()
        self.out = activated
        return activated

    def parameters(self):
        """This layer has no tensors of its own, so the list is empty."""
        return list()


In [22]:
# Random number generator for the embedding initialisation. The name
# `generator` was used below without being defined anywhere in the notebook,
# so a fresh kernel stopped here with a NameError; it is now created
# explicitly with the same seed value the rest of the notebook uses.
generator = torch.Generator().manual_seed(42)

# create the embedding matrix that assigns a
# "feature_size"-dimensional feature vector
# to each character
C = torch.randn((vocabulary_size,feature_size), generator=generator)

# create the layers: the flattened context embeddings go through
# Linear -> BatchNorm1d -> Tanh -> Linear, ending in one value per character
layers = [Linear(feature_size * block_size, hidden_size, bias=False),
          BatchNorm1d(hidden_size),
          Tanh(),
          Linear(hidden_size, vocabulary_size)
          ]

# initialize parameters
with torch.no_grad():
    layers[-1].weight *= 0.1 # reduce confidence in initial output
parameters = [C] + [p for layer in layers for p in layer.parameters()]
for p in parameters:
    p.requires_grad = True

# total number of trainable values, shown as the cell output
sum(p.nelement() for p in parameters)

32547

In [23]:
batch_size = 64
layers[1].training = True    # batch norm: use batch statistics and update the running estimates

for i in range(100000):
    # draw the row indices of a random minibatch from the training set
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # embed the inputs by indexing into the C matrix: every character id in
    # the (batch_size, block_size) minibatch is replaced by its row of C,
    # giving a (batch_size, block_size, feature_size) tensor
    embedded = C[Xtr[ix]]

    # the net expects one flat row per example, so the block_size embedding
    # vectors of each example are laid out side by side:
    # (batch_size, block_size*feature_size)
    out = embedded.view(-1, block_size*feature_size)

    # forward pass
    for layer in layers:
        out = layer(out)
    loss = F.cross_entropy(out, Ytr[ix])

    # backward pass: discard the previous gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # step size: divided by 10 for every 100000 completed iterations; with
    # the 100000 iterations of this loop the exponent is always 0, so it
    # stays at 0.1
    learning_rate = 0.1 / (10 ** (i // 100000))

    # gradient descent update, kept out of the autograd graph
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

layers[1].training = False
loss

In [14]:
@torch.no_grad()
def print_loss(split):
    """Print the cross-entropy loss of the current model on one data split.

    Args:
        split: 'train', 'val' or 'test'; any other value raises KeyError.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val' : (Xdev, Ydev),
        'test' : (Xte, Yte)
    }
    inputs, targets = datasets[split]
    # embed the whole split and flatten each context into a single row
    activations = C[inputs].view(-1, block_size*feature_size)
    for layer in layers:
        activations = layer(activations)
    print(split, F.cross_entropy(activations, targets).item())

# evaluate with the batch-norm layer switched to its running statistics
layers[1].training = False
for split_name in ('train', 'val', 'test'):
    print_loss(split_name)


train 2.291414737701416
val 2.29761004447937
test 2.2945852279663086


In [19]:
@torch.no_grad()
def print_samples(count):
    """Sample ``count`` words from the model and print one per line.

    Each word starts from a context of ``block_size`` zeros; characters are
    drawn from the predicted distribution until index 0 is sampled, which is
    included in the printed word.
    """
    for _ in range(count):
        sampled = []
        window = [0] * block_size
        next_ix = None
        while next_ix != 0:
            # embed the current context as a batch of one flattened row
            logits = C[torch.tensor([window])].view(1, -1)
            for layer in layers:
                logits = layer(logits)
            probs = F.softmax(logits, dim=1)
            next_ix = torch.multinomial(probs, num_samples=1).item()
            # shift the context by one and remember the drawn character
            window = window[1:] + [next_ix]
            sampled.append(next_ix)

        print(''.join(itos[i] for i in sampled))

# sample with the batch-norm layer using its running statistics
layers[1].training = False
print_samples(20)

ric.
koten.
cin.
aiga.
taeveina.
zestin.
eliya.
merricia.
candastofan.
kutailafa.
mayeghole.
naizah.
shoe.
lle.
marlatobusem.
maildlayley.
anora.
sorianmikusin.
amate.
azela.


In [22]:
# Export the trained model to 'names.dat'.
#
# File layout, in this order:
#   three ints   : feature_size, block_size, hidden_size   (struct format 'i')
#   float arrays : C, W1, batch-norm gamma, batch-norm beta,
#                  batch-norm running mean, batch-norm running VARIANCE,
#                  W2, b2                                   (struct format 'f')
# Every tensor is flattened in row-major order before it is packed.

def _pack_floats(tensor):
    """Return the values of ``tensor``, flattened, packed as struct 'f' floats."""
    values = tensor.detach().reshape(-1).tolist()
    return struct.pack(f'{len(values)}f', *values)

b_feature_size = struct.pack('i', feature_size)
b_block_size = struct.pack('i', block_size)
b_hidden_size = struct.pack('i', hidden_size)

b_C = _pack_floats(C)

b_W1 = _pack_floats(layers[0].weight)

b_bngain = _pack_floats(layers[1].gamma)
b_bnbias = _pack_floats(layers[1].beta)
b_bnmean = _pack_floats(layers[1].mean)
# NOTE: despite the name this holds the running VARIANCE (std squared),
# not the standard deviation; a reader of the file must take the square root.
b_bnstd = _pack_floats(layers[1].var)

# The output layer is layers[3]. The original code indexed `layer`, the
# leftover loop variable holding a single Linear object, which cannot be
# subscripted.
b_W2 = _pack_floats(layers[3].weight)
b_b2 = _pack_floats(layers[3].bias)

with open('names.dat', 'wb') as f:
    for chunk in (b_feature_size, b_block_size, b_hidden_size,
                  b_C, b_W1,
                  b_bngain, b_bnbias, b_bnmean, b_bnstd,
                  b_W2, b_b2):
        f.write(chunk)