# Exercise 1

**Task:** Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [116]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from tqdm import notebook

RANDOM_SEED = 42

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

%matplotlib inline
sns.set_theme()

In [117]:
words = open("names.txt", "r").read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [118]:
print(f"Number of names: {len(words):,}")

Number of names: 32,033


In [119]:
print(f"First ten words: {words[:10]}")

First ten words: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [120]:
print(f"Length of words: {len(words)}")

Length of words: 32033


In [121]:
print(f"Length of minimum and maximum words: {min(len(w) for (w) in words)} and max {max(len(w) for (w) in words)}")

Length of minimum and maximum words: 2 and max 15


## Use MLP

In [123]:
# Make a list of characters:

chars = sorted(list(set("".join(words))))
chars = ["."] + chars

# # make a dictionary of character to index
stoi = {ch: i for (i, ch) in enumerate(chars)}

# # make a dictionary of index to character
itos = {i: ch for (ch, i) in stoi.items()}

In [124]:
# prepare dataset
xs, ys = [], []

for w in words:
        # add full stop at start and end
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs, dtype=torch.int64)
ys = torch.tensor(ys, dtype=torch.int64)

In [135]:
W = torch.randn((27*2,27), requires_grad =True, device = device)


In [139]:
for k in range(200):
    # forward pass
    xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)

    # do softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss(normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys]
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    # if k % 10 == 0:
    #     print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

RuntimeError: grad can be implicitly created only for scalar outputs

In [143]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # do forward pass
    xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # do softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss(normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.5282
10: 2.5694
20: 2.4515
30: 2.4122
40: 2.3933
50: 2.3827
60: 2.3762
70: 2.3721
80: 2.3695
90: 2.3677
100: 2.3665
110: 2.3657
120: 2.3651
130: 2.3647
140: 2.3644
150: 2.3642
160: 2.3641
170: 2.3640
180: 2.3639
190: 2.3639


In [None]:
def count_loss(input_list, verbose = False):
    log_likelihood = 0.0
    n = 0
    for w in input_list:
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]

            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1
            
            if verbose:
                print(f"{ch1}{ch2} -> {prob:.4f} {logprob:.4f}")

    # higher the log likelihood (closer to 0) is better
    print(f"log Likelihood: {log_likelihood}")

    # but in loss function lower is better, so we negate it
    nll = -log_likelihood
    print(f"Negative log likelihood: {nll}")

    # normalize it
    print(f"Normalized Negative log Likelihood: {(nll / n)}") # we need to minimize this

## Make loss function

In [153]:
N = torch.ones(27, 27, 27, dtype = torch.int32, device = device)
N[0, 0, 0] = 0
# getting the Bigrams
for w in words:
    # add start and end tokens
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        N[ix1, ix2, ix3] += 1

P = N / N.sum(dim = 2, keepdim = True)

In [149]:
def count_loss(input_list, verbose = False):
    log_likelihood = 0.0
    n = 0
    for w in input_list:
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]

            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1
            
            if verbose:
                print(f"{ch1}{ch2} -> {prob:.4f} {logprob:.4f}")

    # higher the log likelihood (closer to 0) is better
    print(f"log Likelihood: {log_likelihood}")

    # but in loss function lower is better, so we negate it
    nll = -log_likelihood
    print(f"Negative log likelihood: {nll}")

    # normalize it
    print(f"Normalized Negative log Likelihood: {(nll / n)}") # we need to minimize this

### Sample from model

In [155]:
names = []
for i in range(10):
    out = []
    ix1, ix2 = 0, 0
    while True:
        # previosly we used P[ix]
        # p = P[ix]

        # now we use the softmax of the logits
        xenc = F.one_hot(torch.tensor([ix1, ix2]).to(device), num_classes = 27).float().to(device)
        xenc = xenc.view(-1, 27*2)
        
        logits = xenc @ W
        counts = torch.exp(logits)
        p = counts / counts.sum(dim = 1, keepdim = True)

        ix1 = ix2
        ix2 = torch.multinomial(p.to(device), num_samples = 1 , replacement = True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    names.append("".join(out))
    
for name in names:
    print(name)
count_loss(names)

elxinzlayshrisekranya.
pn.
zoan.
uttny.
ad.
os.
eria.
ai.
raw.
myranseonn.
log Likelihood: -183.25955200195312
Negative log likelihood: 183.25955200195312
Normalized Negative log Likelihood: 2.8193776607513428


##  Ex-2 split up the dataset randomly into 80% train set, 10% dev set, 10% test set.

In [159]:
# prepare the dataset
from sklearn.model_selection import train_test_split

words_train, words_test = train_test_split(words, test_size=0.2, random_state=1234)
words_dev, words_test = train_test_split(words_test, test_size=0.5, random_state=1234)

x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
    xs , ys = [], []
    for w in wgroup:
        # add start and end tokens
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
        
            xs.append([ix1, ix2])
            ys.append(ix3)

    xs = torch.tensor(xs, dtype=torch.int64)
    ys = torch.tensor(ys, dtype=torch.int64)

    if wgroup == words_train:
        x_train, y_train = xs, ys
    elif wgroup == words_dev:
        x_dev, y_dev = xs, ys
    else:
        x_test, y_test = xs, ys


In [161]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.3494
10: 2.4803
20: 2.3664
30: 2.3256
40: 2.3041
50: 2.2907
60: 2.2814
70: 2.2747
80: 2.2695
90: 2.2654
100: 2.2622
110: 2.2595
120: 2.2573
130: 2.2554
140: 2.2538
150: 2.2524
160: 2.2512
170: 2.2502
180: 2.2493
190: 2.2484


In [163]:
def MLP_loss(x, y, W):
    xenc = F.one_hot(x, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x)), y].log().mean()
    
    return loss.item()

In [165]:
print(f"Train Loss: {MLP_loss(x_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(x_dev, y_dev, W):.4f}")
print(f"Test Loss: {MLP_loss(x_test, y_test, W):.4f}")

Train Loss: 2.2477
Dev Loss: 2.2525
Test Loss: 2.2510


## Ex-3: Use Dev set

In [169]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.05 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(x_dev, y_dev, W):.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: Train Loss: 4.2443 | Dev Loss 4.2576
10: Train Loss: 2.5107 | Dev Loss 2.5157
20: Train Loss: 2.3791 | Dev Loss 2.3816
30: Train Loss: 2.3313 | Dev Loss 2.3335
40: Train Loss: 2.3069 | Dev Loss 2.3093
50: Train Loss: 2.2923 | Dev Loss 2.2950
60: Train Loss: 2.2826 | Dev Loss 2.2856
70: Train Loss: 2.2756 | Dev Loss 2.2789
80: Train Loss: 2.2704 | Dev Loss 2.2739
90: Train Loss: 2.2664 | Dev Loss 2.2701
100: Train Loss: 2.2631 | Dev Loss 2.2670
110: Train Loss: 2.2604 | Dev Loss 2.2645
120: Train Loss: 2.2582 | Dev Loss 2.2625
130: Train Loss: 2.2563 | Dev Loss 2.2607
140: Train Loss: 2.2547 | Dev Loss 2.2592
150: Train Loss: 2.2533 | Dev Loss 2.2579
160: Train Loss: 2.2520 | Dev Loss 2.2568
170: Train Loss: 2.2509 | Dev Loss 2.2558
180: Train Loss: 2.2500 | Dev Loss 2.2549
190: Train Loss: 2.2491 | Dev Loss 2.2542


No regularization is better

## Ex-4: Rewrite the MLP model without creating one hot vectors


In [177]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    # ====================
    # Previously: using onehot and multiplying by W 
    # xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    # xenc = xenc.view(-1, 27*2)
    # logits = xenc @ W
    # ====================

    # ====================
    # ✅ now: acess by xs indices directly
    logits = W[xs[:,0]] + W[xs[:,1] + 27]
    # ====================
    
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.1857
10: 2.5815
20: 2.4542
30: 2.4075
40: 2.3847
50: 2.3720
60: 2.3643
70: 2.3595
80: 2.3564
90: 2.3544
100: 2.3530
110: 2.3520
120: 2.3514
130: 2.3509
140: 2.3506
150: 2.3504
160: 2.3503
170: 2.3501
180: 2.3501
190: 2.3500


# Ex-5: look up and use F.cross_entropy instead
nn.functonal.cross_entropy() takes the logits and the target class as input and returns the cross entropy loss directly

In [180]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    logits = xenc @ W
    
    loss = torch.nn.functional.cross_entropy(logits, ys.to(device))
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.3923
10: 2.6107
20: 2.4677
30: 2.4140
40: 2.3879
50: 2.3736
60: 2.3651
70: 2.3600
80: 2.3567
90: 2.3545
100: 2.3531
110: 2.3521
120: 2.3514
130: 2.3510
140: 2.3507
150: 2.3504
160: 2.3503
170: 2.3501
180: 2.3501
190: 2.3500


# Ex-6: meta-exercise! Think of a fun/interesting exercise and complete it
more % in dev and more iterations

In [208]:
# prepare the dataset
from sklearn.model_selection import train_test_split

words_train, words_test = train_test_split(words, test_size=0.2, random_state=1234)
words_dev, words_test = train_test_split(words_test, test_size=0.3, random_state=1234)

x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
    xs , ys = [], []
    for w in wgroup:
        # add start and end tokens
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
        
            xs.append([ix1, ix2])
            ys.append(ix3)

    xs = torch.tensor(xs, dtype=torch.int64)
    ys = torch.tensor(ys, dtype=torch.int64)

    if wgroup == words_train:
        x_train, y_train = xs, ys
    elif wgroup == words_dev:
        x_dev, y_dev = xs, ys
    else:
        x_test, y_test = xs, ys


In [210]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(250):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.3736
10: 2.4763
20: 2.3649
30: 2.3238
40: 2.3022
50: 2.2889
60: 2.2798
70: 2.2733
80: 2.2684
90: 2.2645
100: 2.2615
110: 2.2590
120: 2.2569
130: 2.2551
140: 2.2536
150: 2.2523
160: 2.2512
170: 2.2501
180: 2.2492
190: 2.2484
200: 2.2477
210: 2.2471
220: 2.2465
230: 2.2459
240: 2.2455


for learning rates we used [0.1, 0.05, 0.05, 0.001] 30k steps each