In [1]:
import torch

In [2]:
# tasks
# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. 
# Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model? 
# -> i will try to make this implementation using a neural net

# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
# Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

# E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model 
# - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. 
# How good of a loss do you achieve?

# E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. 
# Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

# E05: look up and use F.cross_entropy instead. You should achieve the same result. 
# Can you think of why we'd prefer to use F.cross_entropy instead?

In [3]:
names = open("names.txt", "r").read().splitlines()
# names[:10]

In [40]:
# get a set of characters to integers
all_letters = set()
for name in names:
    for char in name:
        all_letters.add(char)

all_letters = list(all_letters)
all_letters.append(".")
sorted_letters = sorted(all_letters)

# this is going ot be used for the output layer
stoi = {l:count for count, l in enumerate(sorted_letters)} #string to integer
stoi["."] = 0

# this is going to be used for the input layer
# now this starts at 0 as well
btoi = {}
for outc, ch1 in (enumerate(sorted_letters)):
    for inc, ch2 in enumerate(sorted_letters):
        index = outc * 27 + inc
        btoi[ch1+ch2] = index



In [41]:
# this will be the inputs
x = []

# this will be the outputs
y = []

# # chars
# chars = []

# create trigrams from the names, add a "." to the front and the back. Trigrams use the first 2 letters to predict the third
for name in names[:20]:
    name = "." + name + "."
    # zip() is good here because it stops creating sets once there are no more sets of 3 to make
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        # chars.append(((ch1, ch2), ch3))
        
        input_chars = ch1 + ch2
        x.append(btoi[input_chars])
        
        y.append(stoi[ch3])

In [97]:
# create one-hot encodings for all the inputs
import torch.nn.functional as F

# F.one_hot(torch.tensor([1]), num_classes=729)
for inputs in x:
    xenc = F.one_hot(torch.tensor(x), num_classes=729).float()

xenc.shape

torch.Size([114, 729])

In [98]:
# time to contruct the neural network

# fix the generator
g = torch.Generator().manual_seed(2147483647)

# create the weights with randoms -> it should have 729, 27 so that it can give you a probability distribution for each of the 27 outputs. 
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [99]:
# here is the output if it is a 729, 1 random int matrix -> output is no_of_examples * 1, which does not really help you to estimate any probabilities
W_wrong = torch.rand((729, 1), generator=g)
wrong_output = xenc @ W_wrong

In [100]:
# get the predictions - because these output values have -ves in them, they can be considered to be log-counts
logits = xenc @ W
#  P.shape (13, 27)

# remove the negatives
P = logits.exp()

# normalize the weights
sum = P.sum(dim=1, keepdim=True)
# sum.shape (13, 1)

probs = P / sum

probs.shape

# verify that each row has a probability total of 1
# probs[0, :].sum()


torch.Size([114, 27])

In [101]:
# make a reverse example that maps the index to the string
itos = {i:s for s, i in stoi.items()}

# make a reverse index that maps the index to the bigram
itob = {i:b for b, i in btoi.items()}

In [102]:
# look at the probabilities for the desired outputs [14, 14, 2, 0, 13, 10, 23, 10, 2, 0, 23, 2, 0]
# the 'loss' can be calculated using -log then take the mean
number_of_elements = xenc.shape[0]
loss = -probs[torch.arange(number_of_elements), y].log().mean()
loss

tensor(3.8278, grad_fn=<NegBackward0>)

In [103]:
# try to start the process of gradient descent

# set all the gradients to 0
W.grad = None
loss.backward()

# W.grad

W = -0.1 * W.grad

In [104]:
# get the predictions - because these output values have -ves in them, they can be considered to be log-counts
logits = xenc @ W
#  P.shape (13, 27)

# remove the negatives
P = logits.exp()

# normalize the weights
sum = P.sum(dim=1, keepdim=True)
# sum.shape (13, 1)

probs = P / sum

probs.shape

number_of_elements = xenc.shape[0]
loss = -probs[torch.arange(number_of_elements), y].log().mean()
loss

tensor(3.2945)

In [166]:
# create one-hot encodings for all the inputs
import torch.nn.functional as F

# F.one_hot(torch.tensor([1]), num_classes=729)
for inputs in x:
    xenc = F.one_hot(torch.tensor(x), num_classes=729).float()

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

for i in range(20):
    logits = xenc @ W
    p = logits.exp()
    probs = p / p.sum(1, keepdims=True)

    # collect the probabilities of the desired elements
    loss = -probs[torch.arange(xenc.shape[0]), y].log().mean()
    # loss.requires_grad = True
    print(f"{i + 1}: {loss}")

    # backpropagation
    W.grad = None
    loss.backward()

    # -ve since you wanna decrease loss 
    # should be += rather than = , since you are altering the gradient, rather than setting it
    W.data += -10 * W.grad

1: 3.8278255462646484
2: 3.6824936866760254
3: 3.541057586669922
4: 3.4043397903442383
5: 3.2732536792755127
6: 3.1485588550567627
7: 3.030611276626587
8: 2.919307231903076
9: 2.814248561859131
10: 2.714961051940918
11: 2.62100887298584
12: 2.532010555267334
13: 2.4476168155670166
14: 2.367496967315674
15: 2.2913360595703125
16: 2.2188422679901123
17: 2.1497528553009033
18: 2.083834648132324
19: 2.0208845138549805
20: 1.9607229232788086


In [168]:
import random

random.seed(2147483647)

# try sampling from the trigram model lol
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
  # random start, this is not right alr welps
  ix = random.randint(0, 26)
  prev_two = itob[ix]
  out = [prev_two[1]]

  while True:
    xenc = F.one_hot(torch.tensor([ix]), num_classes=729).float()
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(1, keepdims=True) # probabilities for next character
    

    output_index = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[output_index])

    ix = btoi[out[-2] + out[-1]]
    if ix % 27 == 0:
      break
  print(''.join(out))

jgmxzdfzjglkurxycczkwyhhmvlzimjtnagnrlkfdkzka.
zug.
chamzcpbbpwkhrggitmj.
fibzmmqmkxujgfmtmdofekjeyktgscdgu.
inkgvnrnfrqtbspmhwcjdewvtahlvsuqysfxxblgjxlhgfiwuidwnnjgpfdnipkezktsdesu.
