E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

E02: I was not careful with the initialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we actually achieve? (2) Can you tune the initialization to get a starting loss that is much closer to (1)?

E03: Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?

In [34]:
import torch
import random

In [39]:
# Load the corpus: one name per line. The context manager closes the file
# handle deterministically, and the explicit encoding makes the read
# independent of the platform's default locale.
with open("names.txt", "r", encoding="utf-8") as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [70]:
# Build the character vocabulary

# Every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> index. Corpus characters are numbered from 1 so that index 0
# stays free for the '.' marker, which is added after them.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Index -> character: the inverse lookup of stoi.
itos = {idx: ch for ch, idx in stoi.items()}


def build_dataset(words: list, chunk_size: int = 3, vocab=None):
    """Turn words into (context, next-character) training examples.

    Each word is terminated with '.', and every character of the terminated
    word yields one example: the indices of the ``chunk_size`` characters
    preceding it (left-padded with index 0) as the input, and the
    character's own index as the target.

    Args:
        words: Strings to encode. Every character in them, plus '.', must be
            a key of the character mapping.
        chunk_size: Number of context characters per example (at least 1).
        vocab: Mapping from character to integer index. When omitted, the
            module-level ``stoi`` mapping is used.

    Returns:
        A tuple ``(X, Y)`` of int64 tensors: ``X`` has shape
        ``(num_examples, chunk_size)`` and ``Y`` has shape ``(num_examples,)``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        KeyError: If a character is missing from the mapping.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    char_to_ix = stoi if vocab is None else vocab

    contexts, targets = [], []
    for word in words:
        context = [0] * chunk_size

        for ch in word + '.':
            ix = char_to_ix[ch]
            contexts.append(context)
            targets.append(ix)
            # Slide the window. Concatenation builds a new list, so the rows
            # already stored in `contexts` are never mutated.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed for empty input,
    # where torch.tensor([]) would otherwise give a 1-D float tensor.
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), chunk_size)
    Y = torch.tensor(targets, dtype=torch.long)

    return X, Y

# Split into training (80%), dev/validation (10%) and test (10%) sets.
# The split is made on whole words, after a seeded in-place shuffle.

random.seed(42)
random.shuffle(words)

# Cut points: words[:n1] -> train, words[n1:n2] -> dev, words[n2:] -> test.
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]

Xtr, Ytr = build_dataset(train_words)
Xdev, Ydev = build_dataset(dev_words)
Xte, Yte = build_dataset(test_words)

In [87]:
# Initialize network

# Layer sizes are named once so the tensor shapes and the comments cannot
# drift apart.
vocab_size = 27     # rows of the embedding table; NOTE(review): should equal len(stoi)
emb_dim = 10        # dimensions per character embedding
context_len = 3     # characters of context fed to the network per example
hidden_size = 300   # neurons in the tanh hidden layer

# Character embedding table: (27, 10)
C = torch.randn((vocab_size, emb_dim))
# Hidden layer: the concatenated context embeddings (3 * 10 = 30) -> 300 neurons
W1 = torch.randn((context_len * emb_dim, hidden_size))
b1 = torch.randn(hidden_size)  # hidden-layer biases
# Output layer: 300 hidden activations -> 27 logits, one per character index
W2 = torch.randn((hidden_size, vocab_size))
b2 = torch.randn(vocab_size)  # output-layer biases

parameters = [C, W1, b1, W2, b2]

# Track gradients on every trainable tensor so loss.backward() fills p.grad.
for p in parameters:
    p.requires_grad = True

In [93]:
# Train with mini-batch stochastic gradient descent.
num_steps = 50000
batch_size = 200
step_size = 0.01  # constant learning rate; set once instead of on every iteration

for step in range(num_steps):
    # Forward pass on a random mini-batch. torch.randint draws the indices
    # independently, so a batch may contain the same example more than once.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    embeddings = C[Xtr[ix]]

    # Flatten each example's context embeddings into one row for the hidden layer.
    h = torch.tanh(embeddings.view(embeddings.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2

    loss = torch.nn.functional.cross_entropy(logits, Ytr[ix])

    # Backward pass: drop stale gradients, then backpropagate.
    for p in parameters:
        p.grad = None

    loss.backward()

    # Update parameters. The in-place step runs under no_grad so autograd
    # does not record it, without reaching for the discouraged `.data`.
    with torch.no_grad():
        for p in parameters:
            p -= step_size * p.grad

# This is the loss of the final mini-batch only, not of the whole training set.
print(loss.detach())

tensor(2.1397)


In [94]:
# Loss on dev set
# Same forward pass as in training, applied to the whole dev split at once.
emb = C[Xdev]
h = (emb.view(-1, 30).matmul(W1) + b1).tanh()
logits = h.matmul(W2) + b2
loss = torch.nn.functional.cross_entropy(input=logits, target=Ydev)
loss

tensor(2.1599, grad_fn=<NllLossBackward0>)

In [95]:
# Sample model
chunk_size = 3
n_names = 20

# Sampling needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    for _sample in range(n_names):

        out = []
        # Start from an all-zero context, i.e. chunk_size '.' markers.
        context = [0] * chunk_size
        while True:
            emb = C[torch.tensor([context])]
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = torch.nn.functional.softmax(logits, dim=1)
            # Draw the next character index from the predicted distribution.
            ix = torch.multinomial(probs, num_samples=1).item()
            context = context[1:] + [ix]
            out.append(ix)
            # Index 0 is the '.' marker: the name is complete.
            if ix == 0:
                break

        # `token` keeps the generator variable distinct from the loop variables.
        print(''.join(itos[token] for token in out))

ashani.
nehialle.
zuyamarkesssev.
javeevriichara.
amorziah.
brixson.
ashoni.
laya.
esrii.
alylynn.
aurusson.
car.
mity.
ler.
caudrielie.
graytoneleiah.
imayson.
khashleelizsonamillarmyla.
nai.
bastyanne.
