In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read instead of leaking it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Number of names in the dataset
len(words)

32033

In [4]:
# Vocabulary: every distinct character occurring in the names, in sorted order.
chars = sorted(set(''.join(words)))

In [11]:
# Character -> index lookup. Letters are numbered from 1 so that index 0 stays
# free for '.', the marker used for both the start and the end of a name.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [12]:
# Show the index -> character lookup table
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [19]:
block_size = 3  # context length: how many previous characters are used to predict the next one
X, Y = [], []   # X collects context windows (lists of indices), Y the index of the character that follows

for w in words[:5]:
    print(w)
    # Every name starts from a context made only of index 0, i.e. '...'
    context = [0 for _ in range(block_size)]
    for ch in w + '.':
        ix = stoi[ch]  # index of the character to predict
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        # Slide the window: drop the oldest index and append the one just seen
        context = [*context[1:], ix]

# Convert the Python lists into integer tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [20]:
# Inspect the shape and dtype of the input (X) and target (Y) tensors
X.shape,X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [21]:
# Embedding table: one randomly initialised 2-dimensional vector for each of the 27 token indices.
C = torch.randn(27, 2)

In [22]:
# Display the randomly initialised embedding table (one 2-d row per token index)
C

tensor([[-1.0451,  0.2271],
        [-0.4246,  0.5392],
        [-0.5737, -0.8957],
        [ 1.4986, -0.5988],
        [-1.3220,  0.0404],
        [-0.2435, -0.5964],
        [ 0.6180, -0.9611],
        [ 0.6871, -1.3060],
        [ 1.3025, -0.3111],
        [-0.4725,  0.9513],
        [ 1.2088,  0.0408],
        [-0.1670,  1.7796],
        [ 0.1698,  1.0846],
        [-2.4542,  1.6683],
        [-0.1402, -0.4956],
        [-0.3065,  0.7363],
        [ 0.5563, -0.1821],
        [-0.1230,  0.1824],
        [-0.1040,  0.9632],
        [ 0.5984,  0.0641],
        [ 1.0266,  0.6260],
        [ 0.6885,  0.8778],
        [-0.7261,  1.4174],
        [ 2.7825, -2.3553],
        [ 0.0887,  0.1935],
        [-1.1268,  0.3802],
        [ 0.6110,  1.1021]])

In [23]:
# Embedding of token index 5: a plain row lookup into C
C[5]

tensor([-0.2435, -0.5964])

In [27]:
# Same result as C[5]: a one-hot row vector multiplied by C selects row 5 of C.
# .float() converts the integer one-hot tensor so it can be multiplied with the float matrix C.
F.one_hot(torch.tensor(5),num_classes=27).float() @ C

tensor([-0.2435, -0.5964])

In [29]:
# Indexing with a tensor of indices fetches several rows at once (rows 3, 4 and 5)
C[torch.tensor([3,4,5])]

tensor([[ 1.4986, -0.5988],
        [-1.3220,  0.0404],
        [-0.2435, -0.5964]])

In [31]:
# Indexing with the whole index tensor X looks up a 2-d embedding for every entry of X
C[X].shape

torch.Size([32, 3, 2])

In [32]:
# Token index stored at example 13, context position 2
X[13,2]

tensor(1)

In [33]:
# Embedding found at example 13, context position 2 of the batched lookup C[X]
C[X][13,2]

tensor([-0.4246,  0.5392])

In [34]:
# Row 1 of the embedding table, for comparison with C[X][13,2]
C[1]

tensor([-0.4246,  0.5392])

In [35]:
# Embed every context in X in one indexing operation.
emb = C[X]
# Resulting shape is X's shape with the embedding dimension appended as a trailing axis
emb.shape

torch.Size([32, 3, 2])

In [None]:
# Hidden layer
# W1 takes 6 inputs per example: emb has shape (x, y, z), and the y * z values of one
# example are flattened into a single row. The second size (100) is the number of
# neurons, which is a hyperparameter we choose.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))  # one bias per hidden neuron

In [39]:
# Deliberate failure: emb has shape (32, 3, 2) while W1 is (6, 100), so the
# trailing dimension of emb (2) does not match the 6 rows of W1.
#   32, 3, 2
#        6, 100
# The error is caught and displayed so this demonstration cell does not abort
# a full "Restart Kernel -> Run All".
try:
    emb @ W1 + b1
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [41]:
# Flatten each example's embeddings into one row of 6 values (-1 lets PyTorch infer the
# number of rows) so the matmul with the (6, 100) weight matrix is valid, then add the bias.
logits = emb.view(-1,6) @ W1  + b1

Multiple ways to change the shape of the embedding

In [46]:
# For every example (first axis), take the embedding at context position 0, 1 and 2
# separately, then join the three pieces side by side along dimension 1.
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]],1) # slices are [all examples, position k, all embedding dims]

tensor([[-1.0451,  0.2271, -1.0451,  0.2271, -1.0451,  0.2271],
        [-1.0451,  0.2271, -1.0451,  0.2271, -0.2435, -0.5964],
        [-1.0451,  0.2271, -0.2435, -0.5964, -2.4542,  1.6683],
        [-0.2435, -0.5964, -2.4542,  1.6683, -2.4542,  1.6683],
        [-2.4542,  1.6683, -2.4542,  1.6683, -0.4246,  0.5392],
        [-1.0451,  0.2271, -1.0451,  0.2271, -1.0451,  0.2271],
        [-1.0451,  0.2271, -1.0451,  0.2271, -0.3065,  0.7363],
        [-1.0451,  0.2271, -0.3065,  0.7363,  0.1698,  1.0846],
        [-0.3065,  0.7363,  0.1698,  1.0846, -0.4725,  0.9513],
        [ 0.1698,  1.0846, -0.4725,  0.9513, -0.7261,  1.4174],
        [-0.4725,  0.9513, -0.7261,  1.4174, -0.4725,  0.9513],
        [-0.7261,  1.4174, -0.4725,  0.9513, -0.4246,  0.5392],
        [-1.0451,  0.2271, -1.0451,  0.2271, -1.0451,  0.2271],
        [-1.0451,  0.2271, -1.0451,  0.2271, -0.4246,  0.5392],
        [-1.0451,  0.2271, -0.4246,  0.5392, -0.7261,  1.4174],
        [-0.4246,  0.5392, -0.7261,  1.4

In [47]:
# Two ways of flattening each example's embeddings into one row of 6 values,
# kept in separate variables so they can be compared.
view_opt = emb.view(-1, 6)
cat_opt = torch.cat((emb[:, 0], emb[:, 1], emb[:, 2]), dim=1)

In [49]:
# Check that both approaches give tensors of the same shape with identical values
torch.equal(view_opt,cat_opt)

True

Concatenating hand-written slices is not good, because if we want to change the block size we will have to manually change the code.

Instead we should use torch.unbind https://pytorch.org/docs/stable/generated/torch.unbind.html
which removes a tensor dimension:

"Returns a tuple of all slices along a given dimension, already without it."

In [None]:
# Split emb along dimension 1 into a tuple with one slice per context position;
# this yields the same pieces as the hand-written list passed to torch.cat.
unbind_opt = emb.unbind(dim=1)

In [56]:
# Display the tuple of slices produced by unbinding emb along dimension 1
unbind_opt

(tensor([[-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.2435, -0.5964],
         [-2.4542,  1.6683],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.3065,  0.7363],
         [ 0.1698,  1.0846],
         [-0.4725,  0.9513],
         [-0.7261,  1.4174],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.4246,  0.5392],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.4725,  0.9513],
         [ 0.5984,  0.0641],
         [-0.4246,  0.5392],
         [-0.5737, -0.8957],
         [-0.2435, -0.5964],
         [ 0.1698,  1.0846],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [ 0.5984,  0.0641],
         [-0.3065,  0.7363],
         [ 0.5563, -0.1821],
         [ 1.3025, -0.3111]]),
 tensor([[-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0

In [60]:
# The hand-written list of per-position slices, for comparison with unbind_opt
([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]])

[tensor([[-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.2435, -0.5964],
         [-2.4542,  1.6683],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.3065,  0.7363],
         [ 0.1698,  1.0846],
         [-0.4725,  0.9513],
         [-0.7261,  1.4174],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.4246,  0.5392],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0.4725,  0.9513],
         [ 0.5984,  0.0641],
         [-0.4246,  0.5392],
         [-0.5737, -0.8957],
         [-0.2435, -0.5964],
         [ 0.1698,  1.0846],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [ 0.5984,  0.0641],
         [-0.3065,  0.7363],
         [ 0.5563, -0.1821],
         [ 1.3025, -0.3111]]),
 tensor([[-1.0451,  0.2271],
         [-1.0451,  0.2271],
         [-0

In [61]:
# Concatenate the unbound slices along dimension 1; no context position is spelled out by hand
torch.cat(unbind_opt,1)

tensor([[-1.0451,  0.2271, -1.0451,  0.2271, -1.0451,  0.2271],
        [-1.0451,  0.2271, -1.0451,  0.2271, -0.2435, -0.5964],
        [-1.0451,  0.2271, -0.2435, -0.5964, -2.4542,  1.6683],
        [-0.2435, -0.5964, -2.4542,  1.6683, -2.4542,  1.6683],
        [-2.4542,  1.6683, -2.4542,  1.6683, -0.4246,  0.5392],
        [-1.0451,  0.2271, -1.0451,  0.2271, -1.0451,  0.2271],
        [-1.0451,  0.2271, -1.0451,  0.2271, -0.3065,  0.7363],
        [-1.0451,  0.2271, -0.3065,  0.7363,  0.1698,  1.0846],
        [-0.3065,  0.7363,  0.1698,  1.0846, -0.4725,  0.9513],
        [ 0.1698,  1.0846, -0.4725,  0.9513, -0.7261,  1.4174],
        [-0.4725,  0.9513, -0.7261,  1.4174, -0.4725,  0.9513],
        [-0.7261,  1.4174, -0.4725,  0.9513, -0.4246,  0.5392],
        [-1.0451,  0.2271, -1.0451,  0.2271, -1.0451,  0.2271],
        [-1.0451,  0.2271, -1.0451,  0.2271, -0.4246,  0.5392],
        [-1.0451,  0.2271, -0.4246,  0.5392, -0.7261,  1.4174],
        [-0.4246,  0.5392, -0.7261,  1.4

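An even better way: `emb.view(-1, 6)`. The cells below use a small example tensor to show that a view keeps the same underlying storage and only changes the shape.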

# Next time

In [63]:
# Small example tensor holding the integers 0..17, used to illustrate how view works
a = torch.arange(18)

In [65]:

# Reinterpret the same 18 values as a 3 x 3 x 2 tensor
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [66]:
# Show the flat one-dimensional buffer that backs `a`.
# NOTE(review): this returns a TypedStorage, which recent PyTorch releases mark as
# deprecated in favour of Tensor.untyped_storage() - confirm before upgrading.
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [67]:
# The reshaped view is backed by the same flat sequence of 18 values as `a`;
# only the way it is indexed (its shape) differs.
ras = a.view(3,3,2)
ras.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [69]:
# Hidden layer activations: flatten each example's embeddings into 6 values,
# apply the affine map (W1, b1), then squash with tanh.
h = torch.tanh(emb.view(-1,6) @ W1+b1)

In [70]:
# One row of 100 hidden activations per example
h.shape

torch.Size([32, 100])

In [71]:
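# Broadcasting of the bias in `emb.view(-1,6) @ W1 + b1`:
#   32, 100   <- result of the matrix multiplication
#    1, 100   <- b1, treated as a single row
# The 1 x 100 bias vector is copied vertically for each of the 32 rows, then added element-wise.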

Final layer

In [73]:
# Output layer: maps the 100 hidden activations to 27 scores, one per possible next character.
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))  # one bias per output class

In [74]:
# Unnormalised scores (logits) for each possible next character, one row per example
logits = h @ W2 + b2


In [75]:
# One row per example, one column per character in the vocabulary
logits.shape

torch.Size([32, 27])

In [76]:
# Softmax by hand: exponentiate the logits into positive "counts", then
# normalise every row so that it sums to 1.
counts = torch.exp(logits)
prob = counts / counts.sum(dim=1, keepdim=True)

In [77]:
# Display the per-example probability distribution over the next character
prob

tensor([[5.2366e-05, 9.3566e-10, 5.0658e-05, 1.2492e-12, 3.8561e-04, 3.0587e-06,
         9.8584e-01, 9.6187e-03, 1.2000e-04, 2.8846e-07, 3.3360e-06, 1.9968e-12,
         3.4284e-08, 1.2834e-06, 4.4725e-04, 6.5964e-04, 1.1321e-07, 9.9710e-13,
         3.4790e-11, 8.0023e-13, 1.3880e-06, 8.9050e-10, 2.7877e-03, 3.1887e-05,
         4.5079e-09, 5.0675e-07, 2.9620e-08],
        [3.2929e-04, 2.6412e-09, 2.0807e-04, 8.5551e-12, 1.0617e-04, 1.6208e-06,
         1.4771e-02, 8.3661e-01, 2.9591e-08, 4.2464e-06, 1.9077e-06, 6.1773e-09,
         1.9069e-08, 3.5321e-07, 2.9850e-02, 5.4410e-02, 1.1778e-08, 3.7361e-10,
         2.9665e-07, 1.9469e-09, 6.9641e-06, 5.5433e-06, 6.2688e-02, 8.0127e-07,
         3.5140e-09, 5.7847e-10, 1.0026e-03],
        [7.1652e-10, 1.2589e-11, 4.7370e-10, 3.8557e-12, 2.9714e-05, 2.3954e-03,
         2.3456e-02, 7.2527e-06, 9.6175e-01, 3.0604e-10, 1.0164e-06, 1.9441e-13,
         1.7471e-05, 1.2303e-02, 1.3284e-07, 1.1836e-05, 6.4235e-06, 5.0250e-12,
         5.1195e-

In [78]:
# Sanity check: the probabilities of a single example sum to 1
prob[0].sum()

tensor(1.0000)

In [81]:
# Mean negative log-likelihood: for each example pick the probability assigned to
# the correct next character (column Y[i] of row i), take the log, average and negate.
# The row indices are derived from Y instead of a hard-coded 32, so the expression
# keeps working when the number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(18.0905)

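Now let's make this more respectable: rebuild all the parameters from a seeded generator and collect them in one list.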

In [82]:
# Reminder of the dataset dimensions before rebuilding the network
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [84]:
# Rebuild every parameter from a single seeded generator so the initial values
# are the same on every run.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)      # embedding table
W1 = torch.randn(6, 100, generator=g)    # hidden layer weights
b1 = torch.randn((100,), generator=g)    # hidden layer biases
W2 = torch.randn(100, 27, generator=g)   # output layer weights
b2 = torch.randn((27,), generator=g)     # output layer biases
# Everything that gets updated during training, gathered in one list
params = [C, W1, b1, W2, b2]

In [85]:
# Total number of scalar parameters in the model
sum(p.nelement() for p in params)

3481

In [99]:
import torch.nn.functional as F

In [101]:
# Ask autograd to track gradients for every trainable tensor.
for param in params:
    param.requires_grad_(True)

In [105]:
# Overfitting a single batch of the data
for _ in range(1000):
    # Forwards pass
    emb = C[X]
    h = torch.tanh(emb.view(-1,6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2
    #counts = logits.exp()
    #probs = counts / counts.sum(1, keepdim=True)
    #loss = -prob[torch.arange(32),Y].log().mean()
    # -----------> Same as
    loss = F.cross_entropy(logits,Y)
    print(loss.item())
    # Backward pass
    for p in params:
        p.grad = None
    loss.backward()
    for p in params:
        p.data += -0.1 * p.grad

0.255642831325531
0.2556484341621399
0.25563403964042664
0.25563961267471313
0.25562533736228943
0.25563082098960876
0.25561660528182983
0.25562208890914917
0.2556079626083374
0.25561341643333435
0.25559940934181213
0.2556048333644867
0.2555908262729645
0.25559622049331665
0.255582332611084
0.255587637424469
0.2555737793445587
0.2555791437625885
0.255565345287323
0.255570650100708
0.25555694103240967
0.2555622458457947
0.25554853677749634
0.2555537521839142
0.255540132522583
0.2555452883243561
0.25553178787231445
0.25553691387176514
0.2555234730243683
0.2555285692214966
0.2555152177810669
0.2555202841758728
0.2555069923400879
0.2555120289325714
0.2554987967014313
0.2555038034915924
0.25549057126045227
0.2554956078529358
0.2554824650287628
0.2554874122142792
0.25547435879707336
0.2554793059825897
0.2554663121700287
0.25547119975090027
0.2554582953453064
0.2554631531238556
0.2554502487182617
0.2554550766944885
0.2554422616958618
0.25544705986976624
0.2554343342781067
0.2554391622543335
0

In [108]:
# Largest logit of every example (values) and the class it belongs to (indices),
# i.e. the model's prediction, to compare against the targets Y
logits.max(1)

torch.return_types.max(
values=tensor([13.9250, 18.9960, 21.1395, 21.4166, 17.5814, 13.9250, 16.8153, 14.8652,
        16.6063, 19.3501, 16.8656, 21.8019, 13.9250, 18.1480, 18.0803, 21.0486,
        13.9250, 17.3920, 16.2242, 18.0663, 19.3317, 16.9361, 11.7411, 11.4472,
        16.1109, 13.9250, 16.9722, 17.7624, 13.3431, 16.8724, 20.0886, 17.2104],
       grad_fn=<MaxBackward0>),
indices=tensor([ 9, 13, 13,  1,  0,  9, 12,  9, 22,  9,  1,  0,  9, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0,  9, 15, 16,  8,  9,  1,  0]))

In [110]:
# Target indices, for comparison with the predicted indices from logits.max(1)
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

We are unable to overfit completely and drive the loss all the way to 0 because
of how the examples are structured: every name starts from the same context,

... ---> [a,b,c,d,e,f,...,y,z]

The exact same input is paired with several different outcomes, which are all supposed to be possible, so no single prediction can be right for all of them.
But we get very close in the cases where a unique input maps to a unique output.

In [96]:
# It's better to use F.cross_entropy than the hand-written softmax + log-likelihood.
#
# Forward pass: PyTorch will not create the intermediate tensors, which is fairly
# inefficient. Instead it clusters the operations and uses fused kernels that
# evaluate these clustered mathematical expressions very efficiently.
#
# Backward pass: it can be made more efficient too. We don't have to apply the
# chain rule individually through each tensor; we can go directly, because the
# expression simplifies mathematically.
#
# Numerics: under the hood it can also be better behaved, as shown below.

# Float literals are used on purpose: exp() is a floating-point operation, so the
# demo does not rely on implicit integer -> float promotion.
logits = torch.tensor([-2.0, -3.0, 0.0, 5.0])
counts = logits.exp()
probs = counts / counts.sum()
print(probs)

# At very positive logits you start to run into trouble

logits = torch.tensor([-100.0, -3.0, 0.0, 100.0])
counts = logits.exp()
probs = counts / counts.sum()
print(probs)

# This is because of the way exp works:
# if you pass a very negative number you get a number very near 0,
# but if you pass a very big number we run out of range in the float that
# represents our values (the result becomes inf, and inf / inf is nan).

tensor([9.0466e-04, 3.3281e-04, 6.6846e-03, 9.9208e-01])
tensor([0., 0., 0., nan])


In [98]:
# Adding the same offset to every logit leaves the softmax probabilities unchanged.
# Subtracting the largest logit (100 here) makes every exp() argument <= 0, so
# nothing can overflow. The offset is computed rather than hard-coded, so the
# trick works for any input.
logits = torch.tensor([-100.0, -3.0, 0.0, 100.0])
logits = logits - logits.max()
counts = logits.exp()
probs = counts / counts.sum()
print(probs)
# PyTorch applies the same kind of offset internally (here it would subtract 100)

tensor([0.0000e+00, 1.4013e-45, 3.7835e-44, 1.0000e+00])


# Using the full dataset to optimize the neural net