In [1]:
import torch
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

import pickle

## Data prep

In [2]:
words = open("./names.txt", "r").read().splitlines()
train_words, temp_words = train_test_split(words, train_size=0.8, random_state=42)
dev_words, test_words = train_test_split(temp_words, test_size=0.5, random_state=42)

In [3]:
len(train_words), len(dev_words), len(test_words)

(25626, 3203, 3204)

## Trigram

In [4]:
train_chars = sorted(list(set(''.join(train_words))))
two_chars = set()
for c1 in train_chars+["."]:
  for c2 in train_chars+["."]:
    two_chars.add(c1+c2)

two_chars = sorted(list(two_chars))

stoi = {s:i+1 for i,s in enumerate(train_chars)}
stoi["."] = 0
stoi2 = {s:i for i,s in enumerate(two_chars)}
itos2 = {i:s for i,s in enumerate(two_chars)}

In [5]:
xs_t, ys_t = [], []
for w in train_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    xs_t.append(ix1)
    ys_t.append(ix2)

xs_t = torch.tensor(xs_t)
ys_t = torch.tensor(ys_t)

W = torch.empty(0)

In [6]:
def train(reg_factor, epochs=150):
    global W
    g = torch.Generator().manual_seed(2147483647)
    W = torch.randn((729, 27), generator=g, requires_grad=True)
    for i in range(150):
        # forward pass
        logits = W[xs_t]
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdim=True)
        loss = -probs[torch.arange(xs_t.nelement()), ys_t].log().mean() + reg_factor*(W**2).mean()
    
        print(f"Epoch: {i}; Loss: {loss.item()}")
    
      # backward pass
        W.grad = None
        loss.backward()
        with torch.no_grad():
            W.data += -75 * W.grad

In [7]:
def get_loss(word_set):
    xs_t, ys_t = [], []
    for w in word_set:
        chs = ["."] + list(w) + ["."]
        for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi2[ch1+ch2]
            ix2 = stoi[ch3]
            xs_t.append(ix1)
            ys_t.append(ix2)
    
    xs_t = torch.tensor(xs_t)
    ys_t = torch.tensor(ys_t)

    with torch.no_grad():
        logits = W[xs_t]
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdim=True)
    
        nll = -probs[torch.arange(xs_t.nelement()), ys_t].log().mean()

    return nll

In [8]:
train(reg_factor=0.01)

Epoch: 0; Loss: 3.7336504459381104
Epoch: 1; Loss: 3.6212451457977295
Epoch: 2; Loss: 3.519554376602173
Epoch: 3; Loss: 3.4285521507263184
Epoch: 4; Loss: 3.3480427265167236
Epoch: 5; Loss: 3.277346134185791
Epoch: 6; Loss: 3.215322494506836
Epoch: 7; Loss: 3.160595417022705
Epoch: 8; Loss: 3.11183762550354
Epoch: 9; Loss: 3.0679633617401123
Epoch: 10; Loss: 3.028154134750366
Epoch: 11; Loss: 2.991804599761963
Epoch: 12; Loss: 2.9584553241729736
Epoch: 13; Loss: 2.927744150161743
Epoch: 14; Loss: 2.8993759155273438
Epoch: 15; Loss: 2.873103380203247
Epoch: 16; Loss: 2.848714590072632
Epoch: 17; Loss: 2.8260245323181152
Epoch: 18; Loss: 2.804870367050171
Epoch: 19; Loss: 2.7851061820983887
Epoch: 20; Loss: 2.766599655151367
Epoch: 21; Loss: 2.749232530593872
Epoch: 22; Loss: 2.732896327972412
Epoch: 23; Loss: 2.7174947261810303
Epoch: 24; Loss: 2.7029411792755127
Epoch: 25; Loss: 2.6891579627990723
Epoch: 26; Loss: 2.676077127456665
Epoch: 27; Loss: 2.663638114929199
Epoch: 28; Loss: 2.

In [9]:
get_loss(test_words)

tensor(2.2715)

In [10]:
reg_factors = torch.arange(0.001, 0.15, 0.009)

In [11]:
reg_factors.shape

torch.Size([17])

In [12]:
reg_factors

tensor([0.0010, 0.0100, 0.0190, 0.0280, 0.0370, 0.0460, 0.0550, 0.0640, 0.0730,
        0.0820, 0.0910, 0.1000, 0.1090, 0.1180, 0.1270, 0.1360, 0.1450])

In [13]:
dev_losses = {}
for reg_factor in reg_factors:
    train(reg_factor=reg_factor)
    dev_losses[reg_factor] = get_loss(dev_words)

Epoch: 0; Loss: 3.724609136581421
Epoch: 1; Loss: 3.6123173236846924
Epoch: 2; Loss: 3.51071834564209
Epoch: 3; Loss: 3.4197914600372314
Epoch: 4; Loss: 3.3393442630767822
Epoch: 5; Loss: 3.2686994075775146
Epoch: 6; Loss: 3.2067201137542725
Epoch: 7; Loss: 3.152031183242798
Epoch: 8; Loss: 3.1033072471618652
Epoch: 9; Loss: 3.0594615936279297
Epoch: 10; Loss: 3.0196774005889893
Epoch: 11; Loss: 2.983349084854126
Epoch: 12; Loss: 2.9500181674957275
Epoch: 13; Loss: 2.9193227291107178
Epoch: 14; Loss: 2.890968084335327
Epoch: 15; Loss: 2.8647069931030273
Epoch: 16; Loss: 2.840327501296997
Epoch: 17; Loss: 2.817645788192749
Epoch: 18; Loss: 2.7964982986450195
Epoch: 19; Loss: 2.7767395973205566
Epoch: 20; Loss: 2.7582380771636963
Epoch: 21; Loss: 2.7408742904663086
Epoch: 22; Loss: 2.724541187286377
Epoch: 23; Loss: 2.709141254425049
Epoch: 24; Loss: 2.6945886611938477
Epoch: 25; Loss: 2.6808066368103027
Epoch: 26; Loss: 2.6677262783050537
Epoch: 27; Loss: 2.655287027359009
Epoch: 28; Lo

In [14]:
dev_losses

{tensor(0.0010): tensor(2.2487),
 tensor(0.0100): tensor(2.2487),
 tensor(0.0190): tensor(2.2487),
 tensor(0.0280): tensor(2.2487),
 tensor(0.0370): tensor(2.2487),
 tensor(0.0460): tensor(2.2487),
 tensor(0.0550): tensor(2.2488),
 tensor(0.0640): tensor(2.2489),
 tensor(0.0730): tensor(2.2490),
 tensor(0.0820): tensor(2.2491),
 tensor(0.0910): tensor(2.2492),
 tensor(0.1000): tensor(2.2493),
 tensor(0.1090): tensor(2.2495),
 tensor(0.1180): tensor(2.2497),
 tensor(0.1270): tensor(2.2499),
 tensor(0.1360): tensor(2.2501),
 tensor(0.1450): tensor(2.2503)}

In [15]:
with open("./dev_losses.pkl", 'wb') as f:
    pickle.dump(dev_losses, f)

In [16]:
test_losses = {}
for reg_factor in reg_factors:
    train(reg_factor=reg_factor)
    test_losses[reg_factor] = get_loss(test_words)

Epoch: 0; Loss: 3.724609136581421
Epoch: 1; Loss: 3.6123173236846924
Epoch: 2; Loss: 3.51071834564209
Epoch: 3; Loss: 3.4197914600372314
Epoch: 4; Loss: 3.3393442630767822
Epoch: 5; Loss: 3.2686991691589355
Epoch: 6; Loss: 3.2067203521728516
Epoch: 7; Loss: 3.152031183242798
Epoch: 8; Loss: 3.1033072471618652
Epoch: 9; Loss: 3.0594615936279297
Epoch: 10; Loss: 3.0196774005889893
Epoch: 11; Loss: 2.983349084854126
Epoch: 12; Loss: 2.9500181674957275
Epoch: 13; Loss: 2.9193227291107178
Epoch: 14; Loss: 2.890968084335327
Epoch: 15; Loss: 2.8647069931030273
Epoch: 16; Loss: 2.840327501296997
Epoch: 17; Loss: 2.817645788192749
Epoch: 18; Loss: 2.7964982986450195
Epoch: 19; Loss: 2.7767395973205566
Epoch: 20; Loss: 2.7582380771636963
Epoch: 21; Loss: 2.7408742904663086
Epoch: 22; Loss: 2.724540948867798
Epoch: 23; Loss: 2.709141254425049
Epoch: 24; Loss: 2.6945888996124268
Epoch: 25; Loss: 2.6808066368103027
Epoch: 26; Loss: 2.6677262783050537
Epoch: 27; Loss: 2.655287027359009
Epoch: 28; Lo

In [17]:
test_losses

{tensor(0.0010): tensor(2.2716),
 tensor(0.0100): tensor(2.2715),
 tensor(0.0190): tensor(2.2714),
 tensor(0.0280): tensor(2.2713),
 tensor(0.0370): tensor(2.2713),
 tensor(0.0460): tensor(2.2713),
 tensor(0.0550): tensor(2.2713),
 tensor(0.0640): tensor(2.2713),
 tensor(0.0730): tensor(2.2713),
 tensor(0.0820): tensor(2.2714),
 tensor(0.0910): tensor(2.2714),
 tensor(0.1000): tensor(2.2715),
 tensor(0.1090): tensor(2.2716),
 tensor(0.1180): tensor(2.2717),
 tensor(0.1270): tensor(2.2719),
 tensor(0.1360): tensor(2.2720),
 tensor(0.1450): tensor(2.2722)}

In [18]:
with open("./test_losses.pkl", 'wb') as f:
    pickle.dump(test_losses, f)

In [27]:
W.shape

torch.Size([729, 27])

In [22]:
W[xs_t].shape

torch.Size([156871, 27])

In [23]:
W[ys_t].shape

torch.Size([156871, 27])

In [26]:
xs_t.shape

torch.Size([156871])

In [29]:
W[0]

tensor([ 1.3278, -0.2010, -0.0232, -0.9326,  0.2422, -0.0251, -1.3106,  0.5125,
         0.0670,  0.7664, -0.3992,  0.6666, -0.2782, -0.3668,  1.1631,  2.4851,
         1.3231, -1.3776,  0.5737, -0.7120,  0.8344, -0.1257, -1.2534,  0.3798,
        -0.0599,  2.1152,  2.0712], grad_fn=<SelectBackward0>)

In [41]:
torch.tensor(F.one_hot(ys_t, num_classes=27), dtype=torch.float32)

  torch.tensor(F.one_hot(ys_t, num_classes=27), dtype=torch.float32).dtype


torch.float32

In [35]:
F.one_hot?

[0;31mDocstring:[0m
one_hot(tensor, num_classes=-1) -> LongTensor

Takes LongTensor with index values of shape ``(*)`` and returns a tensor
of shape ``(*, num_classes)`` that have zeros everywhere except where the
index of last dimension matches the corresponding value of the input tensor,
in which case it will be 1.

See also `One-hot on Wikipedia`_ .

.. _One-hot on Wikipedia:
    https://en.wikipedia.org/wiki/One-hot

Arguments:
    tensor (LongTensor): class values of any shape.
    num_classes (int):  Total number of classes. If set to -1, the number
        of classes will be inferred as one greater than the largest class
        value in the input tensor.

Returns:
    LongTensor that has one more dimension with 1 values at the
    index of last dimension indicated by the input, and 0 everywhere
    else.

Examples:
    >>> F.one_hot(torch.arange(0, 5) % 3)
    tensor([[1, 0, 0],
            [0, 1, 0],
            [0, 0, 1],
            [1, 0, 0],
            [0, 1, 0]])
    >