In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt# reading the data
words = open("/content/names.txt", "r").read().splitlines()

# Exploring
print(f"first 10 words{words[:10]}")
print(f"length of words: {len(words)}")
print(f"min word length {min(len(w) for (w) in words)} and max word length {max(len(w) for (w) in words)}")

first 10 words['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
length of words: 32033
min word length 2 and max word length 15


Exercise 1: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [4]:
chars = sorted(list(set("".join(words))))
chars = ["."] + chars

stoi = {ch: i for (i, ch) in enumerate(chars)}

itos = {i: ch for (ch, i) in stoi.items()}

N = torch.ones(27, 27, 27, dtype = torch.int32)
N[0, 0, 0] = 0
# getting the Bigrams
for w in words:
    # add start and end tokens
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        N[ix1, ix2, ix3] += 1

P = N / N.sum(dim = 2, keepdim = True)

def count_loss(input_list, verbose = False):
    log_likelihood = 0.0
    n = 0
    for w in input_list:
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]

            prob = P[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1

            if verbose:
                print(f"{ch1}{ch2} -> {prob:.4f} {logprob:.4f}")

    # higher the log likelihood (closer to 0) is better
    print(f"log Likelihood: {log_likelihood}")

    # but in loss function lower is better, so we negate it
    nll = -log_likelihood
    print(f"Negative log likelihood: {nll}")

    print(f"Normalized Negative log Likelihood: {(nll / n)}") # we need to minimize this

In [5]:
print("Training Loss")
count_loss(words)

Training Loss
log Likelihood: -410414.96875
Negative log likelihood: 410414.96875
Normalized Negative log Likelihood: 2.092747449874878


In [6]:
names = []
for i in range(10):
    out = []
    ix1, ix2 = 0, 0
    while True:
        p = P[ix1, ix2]
        ix1 = ix2
        ix2 = torch.multinomial(p, 1, replacement=True).item()
        if ix2 == 0:
            break
        out.append(itos[ix2])

    names.append("".join(out))

print(names)
print("Sampled words Loss")
count_loss(names)

['di', 'ezrechaira', 'ell', 'apriya', 'bromimaita', 'chawaleignyxon', 'tni', 'khyana', 'uria', 'xais']
Sampled words Loss
log Likelihood: -131.75120544433594
Negative log likelihood: 131.75120544433594
Normalized Negative log Likelihood: 2.1250195503234863


E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [7]:
# prepare the dataset
from sklearn.model_selection import train_test_split

words_train, words_test = train_test_split(words, test_size=0.2, random_state=1234)
words_dev, words_test = train_test_split(words_test, test_size=0.5, random_state=1234)

x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
    xs , ys = [], []
    for w in wgroup:
        # add start and end tokens
        chs = ["."] + list(w) + ["."]
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]

            xs.append([ix1, ix2])
            ys.append(ix3)

    xs = torch.tensor(xs, dtype=torch.int64)
    ys = torch.tensor(ys, dtype=torch.int64)

    if wgroup == words_train:
        x_train, y_train = xs, ys
    elif wgroup == words_dev:
        x_dev, y_dev = xs, ys
    else:
        x_test, y_test = xs, ys

In [14]:
W = torch.randn((27*2,27), requires_grad = True)
for k in range(200):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float()
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.3064
10: 2.4965
20: 2.3742
30: 2.3285
40: 2.3051
50: 2.2911
60: 2.2816
70: 2.2747
80: 2.2695
90: 2.2654
100: 2.2621
110: 2.2595
120: 2.2572
130: 2.2554
140: 2.2538
150: 2.2524
160: 2.2512
170: 2.2502
180: 2.2493
190: 2.2485


In [15]:
def MLP_loss(x, y, W):
    xenc = F.one_hot(x, num_classes = 27).float()
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x)), y].log().mean()

    return loss.item()
print(f"Train Loss: {MLP_loss(x_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(x_dev, y_dev, W):.4f}")
print(f"Test Loss: {MLP_loss(x_test, y_test, W):.4f}")

Train Loss: 2.2477
Dev Loss: 2.2523
Test Loss: 2.2512


E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?


In [17]:
W = torch.randn((27*2,27), requires_grad = True)
for k in range(200):
    # forward pass
    xenc = F.one_hot(x_train, num_classes = 27).float()
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x_train)), y_train].log().mean()
    # add regularization
    # loss += 0.05 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(x_dev, y_dev, W):.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: Train Loss: 4.0929 | Dev Loss 4.1041
10: Train Loss: 2.4989 | Dev Loss 2.5058
20: Train Loss: 2.3767 | Dev Loss 2.3816
30: Train Loss: 2.3298 | Dev Loss 2.3344
40: Train Loss: 2.3053 | Dev Loss 2.3098
50: Train Loss: 2.2905 | Dev Loss 2.2950
60: Train Loss: 2.2807 | Dev Loss 2.2852
70: Train Loss: 2.2737 | Dev Loss 2.2782
80: Train Loss: 2.2684 | Dev Loss 2.2729
90: Train Loss: 2.2643 | Dev Loss 2.2689
100: Train Loss: 2.2611 | Dev Loss 2.2656
110: Train Loss: 2.2584 | Dev Loss 2.2630
120: Train Loss: 2.2563 | Dev Loss 2.2608
130: Train Loss: 2.2545 | Dev Loss 2.2590
140: Train Loss: 2.2529 | Dev Loss 2.2575
150: Train Loss: 2.2516 | Dev Loss 2.2563
160: Train Loss: 2.2505 | Dev Loss 2.2551
170: Train Loss: 2.2495 | Dev Loss 2.2542
180: Train Loss: 2.2486 | Dev Loss 2.2533
190: Train Loss: 2.2478 | Dev Loss 2.2526


E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?


In [19]:
W = torch.randn((27*2,27), requires_grad = True)
for k in range(200):
    # forward pass
    # ====================
    # Previously: using onehot and multiplying by W
    # xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    # xenc = xenc.view(-1, 27*2)
    # logits = xenc @ W
    # ====================

    # ====================
    # ✅ now: acess by xs indices directly
    logits = W[xs[:,0]] + W[xs[:,1] + 27]
    # ====================

    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.5407
10: 2.5866
20: 2.4540
30: 2.4065
40: 2.3839
50: 2.3715
60: 2.3640
70: 2.3594
80: 2.3563
90: 2.3543
100: 2.3530
110: 2.3521
120: 2.3514
130: 2.3510
140: 2.3506
150: 2.3504
160: 2.3503
170: 2.3502
180: 2.3501
190: 2.3500
