In [1]:
import math
import torch
import torch.nn.functional as F

Trying a 3-gram model

In [22]:
import random


words=open('english_words.txt','r').read().splitlines()
random_seed=42
random.seed(random_seed)
random.shuffle(words)

In [23]:
chars=sorted(set(''.join(words)))
chars.append('.')
chars.append('_')
vocab_size=len(chars)

In [24]:
stoi={s:i for i,s in enumerate(chars)}
itos={i:s for s,i in stoi.items()}

In [25]:
stoi['.']

26

In [26]:
def build_masked_dataset(words):
    X,Y=[],[]

    for w in words:
        w='.' + w + '.'
        for i in range(1,len(w)-1):
            left=w[i-1]
            mid=w[i]
            right=w[i+1]

            if mid=='.':
                continue

            X.append([stoi[left],stoi[right]])
            Y.append(stoi[mid])

    return torch.tensor(X), torch.tensor(Y)

In [27]:
X,Y=build_masked_dataset(words)
X.shape, Y.shape

(torch.Size([65259, 2]), torch.Size([65259]))

In [28]:
i = 2
print("Left:", itos[X[i][0].item()])
print("Right:", itos[X[i][1].item()])
print("Target:", itos[Y[i].item()])


Left: l
Right: s
Target: e


In [29]:
test_word = "_oogle"
w = "." + test_word.replace("_", "") + "."
print(w)


.oogle.


In [30]:
W=torch.randn((vocab_size,vocab_size,vocab_size),requires_grad=True)  # W[left, right, mid]

In [31]:
logits=W[X[:,0],X[:,1]]

In [32]:
logits.shape

torch.Size([65259, 28])

In [33]:
loss=F.cross_entropy(logits,Y)

In [34]:
loss.item()

3.812284231185913

In [35]:
lr=50

for step in range(300):
    logits=W[X[:,0],X[:,1]]
    loss=F.cross_entropy(logits,Y)

    W.grad=None
    loss.backward()
    W.data-=lr*W.grad

    if step % 25 == 0:
        print(f'Step {step}: {loss.item()}')

Step 0: 3.812284231185913
Step 25: 2.9119510650634766
Step 50: 2.6048758029937744
Step 75: 2.436429262161255
Step 100: 2.331171989440918
Step 125: 2.259284257888794
Step 150: 2.2069544792175293
Step 175: 2.167023181915283
Step 200: 2.13547682762146
Step 225: 2.1098973751068115
Step 250: 2.0887343883514404
Step 275: 2.07094144821167


In [36]:
def fill_one_blank(word):
    w = '.' + word + '.'
    i = w.index('_')

    left = w[i - 1]
    right = w[i + 1]

    logits = W[stoi[left], stoi[right]]
    probs = logits.softmax(dim=0)
    ix = torch.argmax(probs).item()

    return word.replace('_', itos[ix])


In [44]:
tests = ["goo_le", "regula_ions", "wait_r", "raz_r", "b_tt_r", "compu_er", "develo_ment", "pyth_n"]

for t in tests:
    print(t, "→", fill_one_blank(t))


goo_le → goolle
regula_ions → regulations
wait_r → waiter
raz_r → razkr
b_tt_r → buttur
compu_er → compurer
develo_ment → develomment
pyth_n → pythan


In [38]:
def top_k_predictions(word, k=5):
    w = '.' + word + '.'
    i = w.index('_')

    left = w[i - 1]
    right = w[i + 1]

    logits = W[stoi[left], stoi[right]]
    probs = logits.softmax(dim=0)
    vals, idxs = torch.topk(probs, k)

    return [(itos[i.item()], v.item()) for i, v in zip(idxs, vals)]


In [45]:
top_k_predictions("goo_le")


[('l', 0.4272497594356537),
 ('o', 0.1186051294207573),
 ('u', 0.07536370307207108),
 ('w', 0.03468449413776398),
 ('b', 0.02674558386206627)]

Trying a 5-gram Model

In [46]:
def build_5gram_masked_dataset(words):
    X, Y = [], []

    for w in words:
        w = '..' + w + '..'
        for i in range(2, len(w) - 2):
            mid = w[i]
            if mid == '.':
                continue

            X.append([
                stoi[w[i-2]],  # left2
                stoi[w[i-1]],  # left1
                stoi[w[i+1]],  # right1
                stoi[w[i+2]]   # right2
            ])
            Y.append(stoi[mid])

    return torch.tensor(X), torch.tensor(Y)


In [47]:
X, Y = build_5gram_masked_dataset(words)

X.shape, Y.shape


(torch.Size([65259, 4]), torch.Size([65259]))

In [48]:
i = 0
print(
    itos[X[i][0].item()],
    itos[X[i][1].item()],
    "_",
    itos[X[i][2].item()],
    itos[X[i][3].item()],
    "→",
    itos[Y[i].item()]
)


. . _ l e → b


In [51]:
W = torch.randn((vocab_size, vocab_size, vocab_size, vocab_size, vocab_size), requires_grad=True)


In [52]:
logits = W[
    X[:,0],  # left2
    X[:,1],  # left1
    X[:,2],  # right1
    X[:,3]   # right2
]


In [57]:
import torch.nn.functional as F

lr = 100

for step in range(300):
    logits = W[X[:,0], X[:,1], X[:,2], X[:,3]]
    loss = F.cross_entropy(logits, Y)

    W.grad = None
    loss.backward()
    W.data -= lr * W.grad

    if step % 50 == 0:
        print(step, loss.item())


0 2.1773359775543213
50 2.074345827102661
100 1.9848856925964355
150 1.9063074588775635
200 1.836618423461914
250 1.7742846012115479


In [55]:
def fill_one_blank_5gram(word):
    w = '..' + word + '..'
    i = w.index('_')

    l2 = w[i-2]
    l1 = w[i-1]
    r1 = w[i+1]
    r2 = w[i+2]

    logits = W[
        stoi[l2],
        stoi[l1],
        stoi[r1],
        stoi[r2]
    ]
    probs = logits.softmax(dim=0)
    ix = torch.argmax(probs).item()

    return word.replace('_', itos[ix])


In [60]:
tests = [
    "goo_le",
    "regula_ions",
    "wait_r",
    "b_tt_r",
    "compu_er",
    "develo_ment",
    "pyth_n"
]

for t in tests:
    print(t, "→", fill_one_blank_5gram(t))


goo_le → goojle
regula_ions → regulations
wait_r → waitor
b_tt_r → buttur
compu_er → computer
develo_ment → development
pyth_n → python


We see phonetic errors, but overall the model predicts good things

In [None]:
def top_k_predictions(word, k=5):
    w = '.' + word + '.'
    i = w.index('_')

    left = w[i - 1]
    right = w[i + 1]

    logits = W[stoi[left], stoi[right]]
    probs = logits.softmax(dim=0)
    vals, idxs = torch.topk(probs, k)

    return [(itos[i.item()], v.item()) for i, v in zip(idxs, vals)]

In [63]:
w = '..' + 'goo_le' + '..'
i = w.index('_')

l2 = w[i-2]
l1 = w[i-1]
r1 = w[i+1]
r2 = w[i+2]

logits = W[
        stoi[l2],
        stoi[l1],
        stoi[r1],
        stoi[r2]]
probs = logits.softmax(dim=0)
vals, idxs = torch.topk(probs, 5)
print([(itos[i.item()], v.item()) for i, v in zip(idxs, vals)])

[('j', 0.292244553565979), ('u', 0.12169970571994781), ('e', 0.08292999118566513), ('o', 0.061494868248701096), ('g', 0.055679213255643845)]
