### Plan of attack
1. Parse in the data
2. Split into training, dev/val, and test
3. Sketch out model architecture
4. Make a forward pass
5. Make a backward pass
6. Profit

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd

## Parse the data

In [547]:
with open('names.txt') as file: 
    names = file.read().splitlines()

In [548]:
special_char = '.'

lookup = sorted(set("".join(names) + special_char))

def stoi(c):
    return lookup.index(c)

def itos(i): 
    return lookup[i]

In [549]:
X = []
Y = []

window = 5

for name in names:
    # note: don't want multiple special characters at the end -- just one!
    context = special_char * window + name + special_char

    for i in range(len(context) - window):
        x = [stoi(c) for c in context[i:i + window]]
        X.append(x)
        Y.append(stoi(context[i + window]))
        # print(f"{context[i:i + window]} -> {context[i + window]}")

In [550]:
X = torch.tensor(X)
Y = torch.tensor(Y)

X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([228146, 5]), torch.Size([228146]), torch.int64, torch.int64)

In [551]:
### Split datasets

In [552]:
n1 = int(0.8 * X.shape[0])
n2 = int(0.9 * X.shape[0])
Xtr, Xdev, Xts = X.tensor_split((n1, n2), dim=0)
Ytr, Ydev, Yts = Y.tensor_split((n1, n2), dim=0)

Xtr.shape, Xdev.shape, Xts.shape, Ytr.shape, Ydev.shape, Yts.shape

(torch.Size([182516, 5]),
 torch.Size([22815, 5]),
 torch.Size([22815, 5]),
 torch.Size([182516]),
 torch.Size([22815]),
 torch.Size([22815]))

## Create embeddings

In [553]:
n_chars = len(lookup)
n_dims = 10 

C = torch.randn((n_chars, n_dims))

### sanity check

In [554]:
ch = X[3, 0]
ch, C[ch] #this is the embedding for the fifth character

(tensor(0),
 tensor([-0.1576,  0.2843,  0.0119, -0.5549, -1.1535,  0.5487, -0.9585, -0.6887,
         -0.5648, -0.9221]))

In [555]:
# sanity check
F.one_hot(X[3, 0], num_classes=27).float() @ C

tensor([-0.1576,  0.2843,  0.0119, -0.5549, -1.1535,  0.5487, -0.9585, -0.6887,
        -0.5648, -0.9221])

## Defining model architecture

In [556]:
n_input = window * n_dims # the number of chars used * dimensions per each
n_hidden = 200

W1 = torch.randn((n_input, n_hidden))
b1 = torch.randn(n_hidden)
W2 = torch.randn((n_hidden, n_chars))
b2 = torch.randn(n_chars)

W1.shape, b1.shape, W2.shape, b2.shape

(torch.Size([50, 200]),
 torch.Size([200]),
 torch.Size([200, 27]),
 torch.Size([27]))

In [557]:
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True

sum(p.nelement() for p in parameters)

15897

In [558]:
# manually implement the loss

# counts = logits.exp()
# probs = counts / counts.sum(1, keepdim=True) # have to sum across the specific axis

# probs[0, Y[0]] -> correct for first row 
# probs[Y] -> incorrect because takes the entire probability row out for each letter index, not what i want

# likelihoods = probs[torch.arange(probs.shape[0]), Y]
# assert probs[0, Y[0]] == likelihoods[0]

# loss = -likelihoods.log().mean()
# loss

## Training

In [559]:
def forward(X): 
    # forward pass
    emb = C[X]
    input = emb.view(-1, n_dims * window)

    L1 = input @ W1 + b1
    h = torch.tanh(L1) # NB! missed the activation function!
    logits = h @ W2 + b2 # 

    return logits

### Find the optimal lr

In [560]:
# n_iterations = 1000

# lre = torch.linspace(-3, 0, n_iterations)
# lrs = 10 ** lre
# lri = []
# lossi = []

# for i in range(n_iterations): 
#     # batching 
#     ix = torch.randint(0, Xtr.shape[0], (32,)) # 32 is the batch size

#     logits = step(Xtr[ix])
#     loss = F.cross_entropy(logits, Ytr[ix]) #NB! this has to be the raw logits! 

#     for p in parameters: 
#         p.grad = None

#     loss.backward()

#     lr = lrs[i] 
#     for p in parameters: 
#         p.data -= lr * p.grad

#     lri.append(lr)
#     lossi.append(loss.item())

# # why is this actually reliable? 
# # wouldn't the loss naturally decrease with time? 
# px.scatter(
#     x=lri,
#     y=lossi,
#     labels={'x': 'LR', 'y': 'Loss'},
#     trendline='lowess'
# )

### loop

In [561]:
lossi = []
lossdevi = []
stepi = []

total = 0

In [571]:
n_epochs = 50
batch_size = 2048

for i in range(n_epochs): 
    permutation = torch.randperm(Xtr.shape[0])
    # batching 
    for j in range(0, Xtr.shape[0], batch_size):
        ix = permutation[j:j + batch_size]  

        logits = forward(Xtr[ix])
        loss = F.cross_entropy(logits, Ytr[ix]) #NB! this has to be the raw logits! 

        for p in parameters: 
            p.grad = None

        loss.backward()

        lr = 0.1 if i < n_epochs / 2 else 0.01
        # lr = 10 ** (-(total * 2 / 100000 + 1))

        for p in parameters: 
            p.data -= lr * p.grad

        lossi.append(loss.log10().item())
        stepi.append(total)
        total += 1

    logits = forward(Xdev)
    loss = F.cross_entropy(logits, Ydev)
    lossdevi.append(loss.item())

# messy because of mini-batching ! 
px.scatter(
    x=stepi,
    y=lossi,
    labels={'x': 'Step', 'y': 'Loss'},
    title='Loss over time',
)


In [572]:
px.scatter(
    x=range(len(lossdevi)),
    y=lossdevi,
    trendline='lowess'
)


## Evaluate performance

In [573]:
logits = forward(Xdev)
loss = F.cross_entropy(logits, Ydev)

loss

tensor(2.5493, grad_fn=<NllLossBackward0>)

In [574]:
logits = forward(Xtr)
loss = F.cross_entropy(logits, Ytr)

loss

tensor(2.2921, grad_fn=<NllLossBackward0>)

If both losses are roughly equal, the network is likely underfitting. Once the training loss starts to sink below the dev loss, we are probably overfitting.

## Generate by forward pass

In [575]:
def gen(seed):
    x = [stoi(ch) for ch in seed[-window:]]

    x = torch.tensor(x)

    logits = forward(x)
    probs = F.softmax(logits, dim=1) 
    next = torch.multinomial(probs, num_samples=1).item()

    ch = itos(next)
    seed += ch

    return seed


In [626]:
seed = special_char * window 

while True:
    seed = gen(seed)
    if seed[-1] == special_char:
        break

print(seed)

.....zylen.


### Question: does increasing the window size prevent super-long names that seemingly occured previously? 

Hypothesis: yes, because the model takes into account that if the past five letters are all chars, a dot is more likely. With three letters, there are a lot more examples where three chars =/= dot is more likely since the name is not ending yet.

Test plan: generate a bunch of random samples of names after the model has been trained and compare the distribution of name lengths. 

## Visualize vector embeddings

### 2D

In [627]:
every = [stoi(ch) for ch in lookup]
every = torch.tensor(every)

df = C[every].data
px.scatter(df, x=0, y=1, text=lookup, title='Character embeddings', size_max=15, size=[1] * len(df))


## Ideas to optimize loss
0. Default                                          | loss = 2.4434
1. Mini-batching while iterating over everything    | loss = 2.4384 (after running the loop twice, i.e. didn't seem to have much effect probably cuz of law of large numbers)
2. Larger hidden layer                              | loss_200 = 2.4335, loss_300 = 2.4540 
2.5 Keeping track of the min_loss on the dev set every epoch | didn't help much either, basically the same as in the final step of training (~2.45)
* fixed setting lr to smaller based on epoch_i, that (seemed to have) helped get ~2.36

NB! Ignore all of the above. Few bugs were fixes:
- initializing W2 layer with rand instead of randn
- not resetting the C embedding matrix after training loops for different models

seem to be getting roughly 2.5 now no matter what -> why not equal to / lower than Antony's? 

3. More vector embedding dimensions
    -> meh, still 2.4469

2.4243 is lowest so far

2.3806 with slightly nudging up the batch size, and training for longer, as well as a 4-char context window!!

4. Use a larger context window
5. Regularization!!