In [66]:
import torch

In [67]:
# Load the dataset: one name per line, with the line endings stripped.
# The context manager closes the file handle as soon as the contents are read.
with open("names.txt", "r") as names_file:
    names = names_file.read().splitlines()

In [68]:
# gather every distinct character that occurs in any name
name_set = {c for name in names for c in name}

# add "." (used as the start/end marker around each word) and sort the vocabulary
char_list = [*name_set, "."]
sorted_letter = sorted(char_list)

# stoi: character -> index in the sorted vocabulary
stoi = dict(zip(sorted_letter, range(len(sorted_letter))))
# itos: index -> character (the inverse lookup of stoi)
itos = {index: char for char, index in stoi.items()}

In [181]:
# Build the trigram training set: every example pairs a two-character context
# with the character that follows it.
# (a count-based model would need a 3-D table, which is hard to visualise)
# counts = torch.zeros((27, 27, 27))

# inputs: (first, second) character-index pairs
x = []

# targets: index of the third character
y = []

for word in names:
    # wrap the word in "." so the start and end of the name are part of the data
    formatted_word = "." + word + "."
    for start in range(len(formatted_word) - 2):
        c1, c2, c3 = formatted_word[start:start + 3]
        x.append((stoi[c1], stoi[c2]))
        y.append(stoi[c3])

x = torch.tensor(x)
y = torch.tensor(y)

# number of training examples
len(x)

196113

In [182]:
# One-hot encode each (first, second) context as a single 729-wide row.
# The pair (i, j) maps to flat column i * 27 + j, which is exactly the layout obtained
# by flattening a 27x27 grid that holds a 1 at [i, j]: row i of the grid is the first
# character, column j is the second.
flat_index = x[:, 0] * 27 + x[:, 1]

# Fill the whole matrix with one indexed assignment instead of building and stacking
# a separate 27x27 tensor per example in a Python loop (same values, no second copy
# of the data kept alive in a list).
xenc = torch.zeros((x.shape[0], 27 * 27))
xenc[torch.arange(x.shape[0]), flat_index] = 1.0

# (number of examples, 729)
xenc.shape

torch.Size([196113, 729])

In [183]:
# With the data prepared, set up the network: a single weight matrix, no bias.

# A dedicated, seeded generator makes the initial weights reproducible.
g = torch.Generator()
g.manual_seed(2147483647)

# 729 rows (one per flattened two-character context) by 27 output logits.
W = torch.randn(729, 27, generator=g, requires_grad=True)
W.shape

torch.Size([729, 27])

In [None]:
# Full-batch gradient descent on the average negative log-likelihood.
num_steps = 5000
learning_rate = 5
log_every = 500  # report progress periodically instead of printing every step

for i in range(num_steps):
    # forward pass: softmax over the 27 next-character logits of every example
    logits = xenc @ W
    counts = logits.exp()
    # named `row_totals` rather than `sum` so the builtin sum() is not overwritten
    # for the rest of the notebook
    row_totals = counts.sum(dim=1, keepdim=True)
    probs = counts / row_totals

    # mean negative log-probability assigned to the correct next character
    loss = -probs[torch.arange(xenc.shape[0]), y].log().mean()
    if i % log_every == 0 or i == num_steps - 1:
        print(f"{i}: {loss}")

    # backward pass: clear the old gradient, then recompute it
    W.grad = None
    loss.backward()

    # update the weights in place without recording the step in the autograd graph
    with torch.no_grad():
        W -= learning_rate * W.grad


In [185]:
# show the most recently computed loss as a plain Python float
loss.item()

2.1570632457733154

In [186]:
# now we can try to sample from the trigram model

import random

# Seed both sources of randomness so the sampled names are reproducible. A private
# random.Random instance is used so the global `random` state is left untouched.
rng = random.Random(2147483647)
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    # start from index 0 as the first context character and pick the second
    # uniformly at random from indices 1..26
    first_index = 0
    second_index = rng.randint(1, 26)
    output = [itos[second_index]]

    while True:
        # Multiplying a one-hot context vector by W just selects one row of W, so
        # index that row directly. This also avoids rebinding `xenc`, which still
        # has to hold the full training matrix for the cells that follow.
        with torch.no_grad():
            logits = W[first_index * 27 + second_index]
            P = torch.softmax(logits, dim=0)
        output_index = torch.multinomial(P, num_samples=1, replacement=True, generator=g).item()

        output.append(itos[output_index])

        # slide the two-character context window forward by one
        first_index = second_index
        second_index = output_index

        # index 0 ends the name
        if output_index == 0:
            break

    print("".join(output))
    

ocexzdfzjglkuriana.
ha.
yah.
her.
olistona.


E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. 
Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [91]:
# W has shape torch.Size([729, 27]): one row per flattened (first, second) context

test = x[0]  # first training context, e.g. (0, 5)

# one-hot route: a single 1 in a 27x27 grid, flattened into a (1, 729) row vector
base = torch.zeros((27, 27))
base[test[0], test[1]] = 1.0
xenc_t = base.view(1, -1)

# indexing route: the same grid cell expressed as a flat row number of W
index = 27 * test[0] + test[1]

# both prints should show the same 27 values
print(W[index, :])
print(xenc_t @ W)

tensor([ 0.2761,  1.0152,  0.1471,  0.7295,  1.4850,  0.2727, -0.7242,  0.3931,
        -0.6920,  0.3070,  0.9341,  0.0767, -0.5380,  5.9423, -0.5895,  0.3441,
         0.2779, -1.4199,  1.1005,  0.0231,  0.6784,  0.3041,  0.7013, -1.9261,
        -0.5120, -2.1419,  0.6578], grad_fn=<SliceBackward0>)
tensor([[ 0.2761,  1.0152,  0.1471,  0.7295,  1.4850,  0.2727, -0.7242,  0.3931,
         -0.6920,  0.3070,  0.9341,  0.0767, -0.5380,  5.9423, -0.5895,  0.3441,
          0.2779, -1.4199,  1.1005,  0.0231,  0.6784,  0.3041,  0.7013, -1.9261,
         -0.5120, -2.1419,  0.6578]], grad_fn=<MmBackward0>)


E05: look up and use F.cross_entropy instead. You should achieve the same result. 

Can you think of why we'd prefer to use F.cross_entropy instead?

In [133]:
import torch.nn.functional as F

# F.cross_entropy is given (a) the unnormalised logits for each class and (b) the targets.
# In this cell the targets are prepared as float one-hot rows.
# NOTE(review): the PyTorch docs also allow plain class indices as targets, so `y`
# could likely be passed directly without building `yenc` -- confirm.

# restart from the same seed so the fresh weights are drawn exactly like the initial W
g = torch.Generator()
g.manual_seed(2147483647)
Wn = torch.randn(729, 27, generator=g, requires_grad=True).float()

# forward pass: unnormalised logits for every training example
logits = xenc @ Wn

# one-hot encode the targets as floats
yenc = F.one_hot(y, num_classes=27).to(torch.float32)

In [138]:
# sanity check: Wn is drawn with the same seed and shape as W, so the two are equal
# only while W still holds its initial (untrained) values
torch.equal(W, Wn)

True

In [107]:
# F.cross_entropy accepts integer class indices as targets, so pass `y` directly
# instead of a float one-hot matrix. The loss is the same mean negative
# log-likelihood of the correct class, without materialising a
# (num_examples, 27) target tensor.
loss = F.cross_entropy(logits, y)
loss

tensor(3.7231, grad_fn=<DivBackward1>)

In [137]:
# Gradient descent on Wn, with the loss computed by F.cross_entropy.
for i in range(1):
    # forward pass; the targets are passed as class indices
    logits = xenc @ Wn
    loss = F.cross_entropy(logits, y)
    print(f"{i}: {loss}")

    # backward pass: clear the old gradient, then recompute it
    Wn.grad = None
    loss.backward()

    # Step along Wn's own gradient. The original read W.grad here, which belongs to
    # a different tensor, so Wn was never updated from its own loss.
    with torch.no_grad():
        Wn -= 1 * Wn.grad

0: 3.723123073577881
