- E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2


- E02: I was not careful with the initialization of the network in this video.

(1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? 

(2) Can you tune the initialization to get a starting loss that is much more similar to (1)?


- E03: Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?

In [1]:
import torch

In [3]:
# Load the names, one per line. The context manager closes the file handle
# as soon as its contents have been read instead of leaving it open.
with open("../names.txt", "r") as f:
    names = f.read().splitlines()
names[:3]

['emma', 'olivia', 'ava']

In [7]:
# Build the character vocabulary and the index lookup tables.

# every distinct character that occurs in any name
letters = {c for name in names for c in name}

# add "." (used as the padding / end-of-name marker) and sort, so that the
# indices assigned below are deterministic
letter_list = sorted([*letters, "."])

# character -> index, and the inverse index -> character
stoi = {ch: idx for idx, ch in enumerate(letter_list)}
itos = {idx: ch for ch, idx in stoi.items()}

In [81]:
# Create the dataset: each example pairs a context of `block_size`
# characters with the character that follows it.
# Every name gets `block_size` leading "." characters, so the first context
# is pure padding, and one trailing "." that marks the end of the name.

# this will be the inputs (one list of `block_size` character indices each)
x = []

# this will be the outputs (index of the character following the context)
y = []

# context length, which is also the amount of padding at the start
block_size = 3

for name in names[:50]:
    # add the necessary padding to the name
    modified_string = block_size * "." + name + "."
    # Slide a window of `block_size` characters over the padded name. The
    # window is driven by `block_size` itself, so changing the context
    # length only requires changing that one variable.
    for start in range(len(modified_string) - block_size):
        context = modified_string[start:start + block_size]
        x.append([stoi[ch] for ch in context])
        y.append(stoi[modified_string[start + block_size]])

In [82]:
# Convert the inputs into a tensor.
# The dtype is pinned to int64 because X is used to index the embedding
# table; left to inference, an empty list would produce a float32 tensor.
X = torch.tensor(x, dtype=torch.long)
X.shape

torch.Size([337, 3])

In [83]:
# Convert the targets into a tensor.
# The dtype is pinned to int64 because the targets are class indices for
# the cross-entropy loss; left to inference, an empty list would produce a
# float32 tensor.
Y = torch.tensor(y, dtype=torch.long)
Y.shape

torch.Size([337])

In [84]:
# Seeded generator, passed to every torch.randn call that initialises a
# parameter, so the initial values are reproducible.
g = torch.Generator().manual_seed(2147483647)

# Embedding table for the model: one row per character in the vocabulary,
# each character represented by a 2-dimensional vector.
# The row count comes from the vocabulary instead of a literal 27 so the
# table always matches the indices produced by `stoi`.
enc = torch.randn((len(stoi), 2), generator=g)

In [85]:
# Note to self from the video: a tensor can be indexed with another integer
# tensor. Each index in X[:3] (3 examples x 3 context characters) is replaced
# by the matching 2-dimensional row of `enc`, so the shape printed below
# gains a trailing dimension of size 2.
enc[X[:3]].shape

torch.Size([3, 3, 2])

In [86]:
# Create the parameters of the two-layer network.
# Every weight matrix has shape
# (number of inputs that should be taken, number of outputs that should be produced).

# Layer sizes are derived from what is already defined instead of being
# repeated as literals, so they stay in sync with the data and the embedding.
n_inputs = block_size * enc.shape[1]  # context length * embedding width
n_hidden = 100                        # number of hidden units
n_outputs = len(stoi)                 # one score per character

# this is the first layer
w1 = torch.randn((n_inputs, n_hidden), generator=g)
b1 = torch.randn(n_hidden, generator=g)

# this is the output layer
w2 = torch.randn((n_hidden, n_outputs), generator=g)
b2 = torch.randn(n_outputs, generator=g)

# store the params in a list to make life easier later
params = [enc, w1, b1, w2, b2]

In [87]:
# Report the total number of scalar values across all parameter tensors.
total_params = sum(p.nelement() for p in params)
print(total_params)

# Switch on gradient tracking for every parameter so that a later
# backward() call fills in its .grad attribute.
for tensor in params:
    tensor.requires_grad = True

3481


In [88]:
import torch.nn.functional as F
# Forward pass over the whole dataset.

# Look up the embedding of every context character, then flatten the
# embeddings of each example into one row so they can be fed to the first
# layer with a single matrix multiply.
n_examples = X.shape[0]
input = enc[X].view(n_examples, -1)

# Hidden layer: an affine map followed by tanh. The tanh is the layer's
# non-linearity; without it the two linear layers would collapse into a
# single linear map.
pre_activation = input @ w1 + b1
h1 = pre_activation.tanh()

# Output layer: one raw score (logit) per character for every example.
output = h1 @ w2 + b2

# Cross-entropy between the logits and the target character indices.
loss = F.cross_entropy(output, Y)
loss

tensor(19.4104, grad_fn=<NllLossBackward0>)

In [91]:
# Test that backpropagation works: take one manual gradient-descent step,
# then redo the forward pass to see whether the loss went down.

# clear any stale gradients so backward() does not accumulate onto them
for p in params:
    p.grad = None

loss.backward()

# Update the parameters in place. The update runs under no_grad() instead of
# going through `.data`, so autograd still sees the in-place change while
# the update itself is not recorded in the graph.
learning_rate = 0.1
with torch.no_grad():
    for p in params:
        p += -learning_rate * p.grad

# Forward pass with the updated parameters. The hidden layer applies tanh as
# its non-linearity.
input = enc[X].view(X.shape[0], -1)
h1 = torch.tanh(input @ w1 + b1)

# pass it to the second layer
output = h1 @ w2 + b2

# calculate the loss of the current NN
loss = F.cross_entropy(output, Y)

loss

tensor(14.5042, grad_fn=<NllLossBackward0>)