In [3]:
import torch

In [2]:
# tasks
# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. 
# Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model? 
# -> i will try to make this implementation using a neural net

# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
# Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

# E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model 
# - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. 
# How good of a loss do you achieve?

# E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. 
# Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

# E05: look up and use F.cross_entropy instead. You should achieve the same result. 
# Can you think of why we'd prefer to use F.cross_entropy instead?

In [5]:
names = open("names.txt", "r").read().splitlines()
# names[:10]

In [20]:
# get a set of characters to integers
all_letters = set()
for name in names:
    for char in name:
        all_letters.add(char)

all_letters = list(all_letters)
all_letters.append(".")
sorted_letters = sorted(all_letters)

# this is going ot be used for the output layer
stoi = {l:count+1 for count, l in enumerate(sorted_letters)} #string to integer
stoi["."] = 0

# this is going to be used for the input layer
# now this starts at 0 as well
btoi = {}
for outc, ch1 in (enumerate(sorted_letters)):
    for inc, ch2 in enumerate(sorted_letters):
        index = outc * 27 + inc
        btoi[ch1+ch2] = index

btoi


{'..': 0,
 '.a': 1,
 '.b': 2,
 '.c': 3,
 '.d': 4,
 '.e': 5,
 '.f': 6,
 '.g': 7,
 '.h': 8,
 '.i': 9,
 '.j': 10,
 '.k': 11,
 '.l': 12,
 '.m': 13,
 '.n': 14,
 '.o': 15,
 '.p': 16,
 '.q': 17,
 '.r': 18,
 '.s': 19,
 '.t': 20,
 '.u': 21,
 '.v': 22,
 '.w': 23,
 '.x': 24,
 '.y': 25,
 '.z': 26,
 'a.': 27,
 'aa': 28,
 'ab': 29,
 'ac': 30,
 'ad': 31,
 'ae': 32,
 'af': 33,
 'ag': 34,
 'ah': 35,
 'ai': 36,
 'aj': 37,
 'ak': 38,
 'al': 39,
 'am': 40,
 'an': 41,
 'ao': 42,
 'ap': 43,
 'aq': 44,
 'ar': 45,
 'as': 46,
 'at': 47,
 'au': 48,
 'av': 49,
 'aw': 50,
 'ax': 51,
 'ay': 52,
 'az': 53,
 'b.': 54,
 'ba': 55,
 'bb': 56,
 'bc': 57,
 'bd': 58,
 'be': 59,
 'bf': 60,
 'bg': 61,
 'bh': 62,
 'bi': 63,
 'bj': 64,
 'bk': 65,
 'bl': 66,
 'bm': 67,
 'bn': 68,
 'bo': 69,
 'bp': 70,
 'bq': 71,
 'br': 72,
 'bs': 73,
 'bt': 74,
 'bu': 75,
 'bv': 76,
 'bw': 77,
 'bx': 78,
 'by': 79,
 'bz': 80,
 'c.': 81,
 'ca': 82,
 'cb': 83,
 'cc': 84,
 'cd': 85,
 'ce': 86,
 'cf': 87,
 'cg': 88,
 'ch': 89,
 'ci': 90,
 'cj': 91

In [21]:
# this will be the inputs
x = []

# this will be the outputs
y = []

# chars
chars = []

# create trigrams from the names, add a "." to the front and the back. Trigrams use the first 2 letters to predict the third
for name in names[:3]:
    name = "." + name + "."
    # zip() is good here because it stops creating sets once there are no more sets of 3 to make
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        chars.append(((ch1, ch2), ch3))
        
        input_chars = ch1 + ch2
        x.append(btoi[input_chars])
        
        y.append(stoi[ch3])

print(chars)
print(x)
print(y)

[(('.', 'e'), 'm'), (('e', 'm'), 'm'), (('m', 'm'), 'a'), (('m', 'a'), '.'), (('.', 'o'), 'l'), (('o', 'l'), 'i'), (('l', 'i'), 'v'), (('i', 'v'), 'i'), (('v', 'i'), 'a'), (('i', 'a'), '.'), (('.', 'a'), 'v'), (('a', 'v'), 'a'), (('v', 'a'), '.')]
[5, 148, 364, 352, 15, 417, 333, 265, 603, 244, 1, 49, 595]
[14, 14, 2, 0, 13, 10, 23, 10, 2, 0, 23, 2, 0]


In [25]:
# create one-hot encodings for all the inputs
import torch.nn.functional as F

# F.one_hot(torch.tensor([1]), num_classes=729)
for inputs in x:
    xenc = F.one_hot(torch.tensor(x), num_classes=729).float()

xenc.shape

torch.Size([13, 729])

In [43]:
# time to contruct the neural network

# fix the generator
g = torch.Generator().manual_seed(2147483647)

# create the weights with randoms -> it should have 729, 27 so that it can give you a probability distribution for each of the 27 outputs. 
W = torch.randn((729, 27), generator=g)

In [36]:
# here is the output if it is a 729, 1 random int matrix -> output is no_of_examples * 1, which does not really help you to estimate any probabilities
W_wrong = torch.rand((729, 1), generator=g)
wrong_output = xenc @ W_wrong

In [62]:
# get the predictions - because these output values have -ves in them, they can be considered to be log-counts
logits = xenc @ W
#  P.shape (13, 27)

# remove the negatives
P = logits.exp()

# normalize the weights
sum = P.sum(dim=1, keepdim=True)
# sum.shape (13, 1)

probs = P / sum

# verify that each row has a probability total of 1
# probs[0, :].sum()


In [65]:
probs[0].shape

torch.Size([27])

In [69]:
# make a reverse example that maps the index to the string
itos = {i:s for s, i in stoi.items()}

In [75]:
# look at the probabilities for the desired outputs [14, 14, 2, 0, 13, 10, 23, 10, 2, 0, 23, 2, 0]
# the 'loss' can be calculated using -log
loss = -probs[torch.arange(13), y].log().mean()
loss

tensor(3.6330)