E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [1]:
words = open('../names.txt', 'r').read().splitlines()

In [2]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

### Tiagram Data Models

In [3]:
t = {}
for w in words:
    chars = ["."] + list(w) + ["."]
    for ch1,ch2,ch3 in zip(chars,chars[1:],chars[2:]):
        tiagram = (ch1,ch2,ch3)
        t[tiagram] = t.get(tiagram,0)+1

In [4]:
t

{('.', 'e', 'm'): 288,
 ('e', 'm', 'm'): 100,
 ('m', 'm', 'a'): 72,
 ('m', 'a', '.'): 174,
 ('.', 'o', 'l'): 104,
 ('o', 'l', 'i'): 69,
 ('l', 'i', 'v'): 54,
 ('i', 'v', 'i'): 78,
 ('v', 'i', 'a'): 147,
 ('i', 'a', '.'): 903,
 ('.', 'a', 'v'): 243,
 ('a', 'v', 'a'): 161,
 ('v', 'a', '.'): 93,
 ('.', 'i', 's'): 124,
 ('i', 's', 'a'): 142,
 ('s', 'a', 'b'): 76,
 ('a', 'b', 'e'): 173,
 ('b', 'e', 'l'): 201,
 ('e', 'l', 'l'): 822,
 ('l', 'l', 'a'): 337,
 ('l', 'a', '.'): 684,
 ('.', 's', 'o'): 152,
 ('s', 'o', 'p'): 21,
 ('o', 'p', 'h'): 37,
 ('p', 'h', 'i'): 61,
 ('h', 'i', 'a'): 81,
 ('.', 'c', 'h'): 352,
 ('c', 'h', 'a'): 236,
 ('h', 'a', 'r'): 329,
 ('a', 'r', 'l'): 287,
 ('r', 'l', 'o'): 44,
 ('l', 'o', 't'): 14,
 ('o', 't', 't'): 34,
 ('t', 't', 'e'): 121,
 ('t', 'e', '.'): 175,
 ('.', 'm', 'i'): 393,
 ('m', 'i', 'a'): 95,
 ('.', 'a', 'm'): 384,
 ('a', 'm', 'e'): 226,
 ('m', 'e', 'l'): 188,
 ('e', 'l', 'i'): 537,
 ('l', 'i', 'a'): 518,
 ('.', 'h', 'a'): 505,
 ('a', 'r', 'p'): 8,
 ('r

In [5]:
chars = sorted(list((set(''.join(words)))))

In [6]:
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0;
itos = {i:s for s,i in stoi.items()}

### Couting approach

In [7]:
import torch

In [28]:
N = torch.zeros((27,27,27),dtype = torch.int32)
N[0][0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], dtype=torch.int32)

In [9]:
N.shape

torch.Size([27, 27, 27])

In [10]:
for w in words:
    chars = ["."] + list(w) + ["."]
    for ch1,ch2,ch3 in zip(chars,chars[1:],chars[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        N[ix1,ix2,ix3] += 1
        

In [11]:
P = (N+2).float()
P /= P.sum(2,keepdim=True)

In [12]:
P.shape

torch.Size([27, 27, 27])

In [13]:
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
    out = []
    ix1 = 0;
    ix2 = 0;
    while True:
        p = P[ix1][ix2]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break;
        else:
            ix1 = ix2
            ix2 = ix
    print(''.join(out))

junide.
ilyasid.
prelay.
ocin.
fai.
ritoper.
sathen.
dannaaryanileniassibduinrwin.
lessiyanayla.
te.


In [14]:
log_likelihood = 0.0
n = 0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2,ch3 in zip(chs,chs[1:],chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        prob = P[ix1,ix2,ix3]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        # print(f'{ch1}{ch2} : {prob:.4f} {logprob: .4f}')

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n=}')

log_likelihood=tensor(-414938.1562)
nll=tensor(414938.1562)
nll/n=tensor(2.1158)


### Neural Network Approach

In [53]:
xs , ys = [],[]
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1,ch2,ch3 in zip(chs,chs[1:],chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        print(ch1,ch2,ch3)
        xs.append((ix1,ix2))
        ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)

. e m
e m m
m m a
m a .
. o l
o l i
l i v
i v i
v i a
i a .
. a v
a v a
v a .
. i s
i s a
s a b
a b e
b e l
e l l
l l a
l a .
. s o
s o p
o p h
p h i
h i a
i a .
. c h
c h a
h a r
a r l
r l o
l o t
o t t
t t e
t e .
. m i
m i a
i a .
. a m
a m e
m e l
e l i
l i a
i a .
. h a
h a r
a r p
r p e
p e r
e r .
. e v
e v e
v e l
e l y
l y n
y n .
. a b
a b i
b i g
i g a
g a i
a i l
i l .
. e m
e m i
m i l
i l y
l y .
. e l
e l i
l i z
i z a
z a b
a b e
b e t
e t h
t h .
. m i
m i l
i l a
l a .
. e l
e l l
l l a
l a .
. a v
a v e
v e r
e r y
r y .
. s o
s o f
o f i
f i a
i a .
. c a
c a m
a m i
m i l
i l a
l a .
. a r
a r i
r i a
i a .
. s c
s c a
c a r
a r l
r l e
l e t
e t t
t t .
. v i
v i c
i c t
c t o
t o r
o r i
r i a
i a .
. m a
m a d
a d i
d i s
i s o
s o n
o n .
. l u
l u n
u n a
n a .
. g r
g r a
r a c
a c e
c e .
. c h
c h l
h l o
l o e
o e .
. p e
p e n
e n e
n e l
e l o
l o p
o p e
p e .
. l a
l a y
a y l
y l a
l a .
. r i
r i l
i l e
l e y
e y .
. z o
z o e
o e y
e y .
. n o
n o 

In [16]:
xs

tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1]])

In [17]:
ys

tensor([13, 13,  1,  0])

xs is a tensor of shape (num_samples, 2), where each row represents two character indices.

F.one_hot(xs, num_classes=27) converts each index into a one-hot vector of size 27.

Since xs has shape (num_samples, 2), after one-hot encoding, xenc will have shape:

In [30]:
xenc = F.one_hot(xs,num_classes=27).float()
xenc.shape # (num_samples,2,27)

torch.Size([4, 2, 27])

In [31]:
xenc

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0.,

In [34]:
xenc = xenc.view(-1,27*2) # -1 means pytorch infers batch size
xenc.shape

torch.Size([4, 54])

In [35]:
xenc[0]

tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [36]:
xenc.shape

torch.Size([4, 54])

In [37]:
W = torch.randn((27*2,1))
xenc @ W

tensor([[-0.2911],
        [ 1.2716],
        [-1.9629],
        [-1.7850]])

In [38]:
W = torch.randn((27*2,27))
xenc @ W 

tensor([[ 8.6736e-01,  4.8181e-01, -1.6818e+00,  1.4593e+00, -4.0832e-01,
         -5.8142e-01, -2.8904e-01,  1.7743e+00, -5.9394e-01,  1.2503e-01,
          1.2172e+00, -3.0081e+00, -3.7857e-01, -1.1238e+00, -1.5452e+00,
          1.3505e+00,  6.1076e-01, -1.5931e+00,  2.2506e+00, -3.3826e-01,
         -1.0986e+00,  2.5286e-01,  2.6464e+00,  1.0359e+00,  2.2548e+00,
         -9.5200e-01, -1.8651e+00],
        [-8.8775e-01,  1.3851e+00,  2.5170e-01,  2.2256e-01, -2.1590e+00,
          2.0391e+00, -1.5836e+00,  1.1412e+00, -1.3463e+00, -7.4664e-01,
          6.0982e-01, -2.1660e+00, -1.2973e+00, -1.9074e+00,  3.1263e+00,
         -1.6866e+00, -8.5414e-01, -5.6953e-01, -1.0289e+00,  1.5195e+00,
         -8.4211e-01, -1.5799e-01,  5.7707e-01, -7.2359e-01,  1.2790e+00,
         -1.2436e+00, -1.0549e+00],
        [-1.7160e+00,  1.5045e+00,  7.3605e-01,  1.2456e-01, -2.5884e+00,
          9.4218e-02, -2.8581e-02,  3.9962e-01, -1.1144e+00, -5.2540e-02,
          2.1538e+00, -2.6991e+00, -1.20

In [39]:
(xenc @ W)[0,2] # firing rate of second neuron looking at zero input

tensor(-1.6818)

In [47]:
xenc[0].shape

torch.Size([54])

In [50]:
W[:,2].shape

torch.Size([54])

In [51]:
(xenc[0] * W[:,2]).sum()

tensor(-1.6818)

In [52]:
W = torch.randn((27*2,27),requires_grad=True)

In [55]:
for k in range(200):
    # performing forward pass
    xenc = F.one_hot(xs,num_classes=27).float()
    xenc = xenc.view(-1,2*27)

    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    loss = -probs[torch.arange(len(xs)),ys].log().mean()
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.4191
10: 2.6028
20: 2.4688
30: 2.4208
40: 2.3977
50: 2.3850
60: 2.3775
70: 2.3729
80: 2.3699
90: 2.3679
100: 2.3666
110: 2.3658
120: 2.3652
130: 2.3647
140: 2.3644
150: 2.3642
160: 2.3641
170: 2.3640
180: 2.3639
190: 2.3639


In [59]:
probs[0,13],probs[1,13]

(tensor(0.0571, grad_fn=<SelectBackward0>),
 tensor(0.0379, grad_fn=<SelectBackward0>))

In [62]:
ys[0]

tensor(13)

In [65]:
names = []
for i in range(10):
    out = []
    ix1, ix2 = 0, 0
    while True:
        xenc = F.one_hot(torch.tensor([ix1, ix2]), num_classes = 27).float()
        xenc = xenc.view(-1, 27*2)
        
        logits = xenc @ W
        counts = torch.exp(logits)
        p = counts / counts.sum(dim = 1, keepdim = True)

        ix1 = ix2
        ix2 = torch.multinomial(p, num_samples = 1 , replacement = True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    names.append("".join(out))
    
for name in names:
    print(name)

zedie.
ylbbrygayfdicori.
hylemzaon.
osi.
sunohyahcridyynea.
aroune.
anou.
usamindh.
odpqnch.
ja.


In [70]:
xenc = F.one_hot(torch.tensor([ix1, ix2]), num_classes = 27).float()
xenc.shape

torch.Size([2, 27])

In [71]:
xenc = xenc.view(-1, 27*2)
xenc.shape

torch.Size([1, 54])

### E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [72]:
xs.shape,ys.shape

(torch.Size([196113, 2]), torch.Size([196113]))

In [73]:
from sklearn.model_selection import train_test_split

X_train, X_test_val, y_train, y_test_val = train_test_split(xs, ys, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)

In [74]:
X_train.shape,X_test.shape,X_val.shape

(torch.Size([156890, 2]), torch.Size([19611, 2]), torch.Size([19612, 2]))

In [78]:
W = torch.randn((27*2,27),requires_grad=True)
for k in range(200):
    # performing forward pass
    xenc = F.one_hot(X_train,num_classes=27).float()
    xenc = xenc.view(-1,2*27)

    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    loss = -probs[torch.arange(len(X_train)),y_train].log().mean()
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.4329
10: 2.5915
20: 2.4687
30: 2.4223
40: 2.3995
50: 2.3868
60: 2.3792
70: 2.3744
80: 2.3713
90: 2.3692
100: 2.3678
110: 2.3668
120: 2.3662
130: 2.3657
140: 2.3654
150: 2.3651
160: 2.3650
170: 2.3648
180: 2.3648
190: 2.3647


### ### E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [81]:
def MLP_loss(x, y, W):
    xenc = F.one_hot(x, num_classes = 27).float()
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x)), y].log().mean()
    
    return loss.item()

In [82]:
print(f"Train Loss: {MLP_loss(X_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(X_val, y_val, W):.4f}")
print(f"Test Loss: {MLP_loss(X_test, y_test, W):.4f}")

Train Loss: 2.2875
Dev Loss: 2.2881
Test Loss: 2.2829
