In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
words = open('../names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [20]:
chars = sorted(list(set("".join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [33]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words[:5]:
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    # print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix] # crop and append
  
X = torch.tensor(X)
Y = torch.tensor(Y)

In [34]:
X.shape, X.dtype, Y.shape, Y.dtype # each input to neural network is 3

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

### Implementing embedding look up table

In [24]:
# We try to embbded the 27 characters to a two dimensional space

In [35]:
C = torch.randn((27,2))
C.shape

torch.Size([27, 2])

In [36]:
C[5]

tensor([-1.5300, -0.4925])

In [37]:
F.one_hot(torch.tensor(5),num_classes=27).float() @ C # it just masks the other field except one

tensor([-1.5300, -0.4925])

In [40]:
C[torch.tensor([5,6,7,7,7,7])] # indexing with one-dimensional tensor of integer

tensor([[-1.5300, -0.4925],
        [-1.2292,  0.5038],
        [ 1.1388, -0.3566],
        [ 1.1388, -0.3566],
        [ 1.1388, -0.3566],
        [ 1.1388, -0.3566]])

In [42]:
C[X].shape # for every X, we have retrived the embedding.

torch.Size([32, 3, 2])

In [44]:
X[13,2]

tensor(1)

In [46]:
# C[X][13,2] == C[1]

tensor([-0.6042, -0.0488])

In [48]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

### Implementing the hidden layer

In [50]:
w1 = torch.randn((6,100)) # no of inputs = 3*2 as we have 2-d embedding and we have 3 of them(100 nuerons initliaze)
b1 = torch.randn(100)

In [52]:
w1.shape

torch.Size([6, 100])

In [53]:
#emb @ w1 + b1 # (32,3,2) @ (6,100)

- emb[:,0,:] : It selects along all the samples, 0 selects only the first embeddings and : selects across all the first embedding

In [67]:
torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],1).shape # not generalized

torch.Size([32, 6])

In [70]:
torch.cat(torch.unbind(emb,1),1).shape

torch.Size([32, 6])

In [71]:
a = torch.arange(18)
a.shape

torch.Size([18])

In [73]:
a.view(9,2) # as long as total number of elements multiplies to be same,it works

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [76]:
a.storage # tensor is always represented as 1d in memory

<bound method Tensor.storage of tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])>

In [77]:
emb.shape

torch.Size([32, 3, 2])

In [78]:
emb.view(32,6)

tensor([[ 2.0137,  0.4006,  2.0137,  0.4006,  2.0137,  0.4006],
        [ 2.0137,  0.4006,  2.0137,  0.4006, -1.5300, -0.4925],
        [ 2.0137,  0.4006, -1.5300, -0.4925,  1.3456,  2.5344],
        [-1.5300, -0.4925,  1.3456,  2.5344,  1.3456,  2.5344],
        [ 1.3456,  2.5344,  1.3456,  2.5344, -0.6042, -0.0488],
        [ 2.0137,  0.4006,  2.0137,  0.4006,  2.0137,  0.4006],
        [ 2.0137,  0.4006,  2.0137,  0.4006,  0.5484, -0.6178],
        [ 2.0137,  0.4006,  0.5484, -0.6178, -2.3037,  0.4911],
        [ 0.5484, -0.6178, -2.3037,  0.4911,  0.7966,  1.6419],
        [-2.3037,  0.4911,  0.7966,  1.6419,  0.6448,  0.7174],
        [ 0.7966,  1.6419,  0.6448,  0.7174,  0.7966,  1.6419],
        [ 0.6448,  0.7174,  0.7966,  1.6419, -0.6042, -0.0488],
        [ 2.0137,  0.4006,  2.0137,  0.4006,  2.0137,  0.4006],
        [ 2.0137,  0.4006,  2.0137,  0.4006, -0.6042, -0.0488],
        [ 2.0137,  0.4006, -0.6042, -0.0488,  0.6448,  0.7174],
        [-0.6042, -0.0488,  0.6448,  0.7

In [85]:
h = torch.tanh(emb.view(-1,6) @ w1 + b1)

In [86]:
h.shape

torch.Size([32, 100])

In [87]:
h

tensor([[-0.7359,  0.8942, -0.9589,  ..., -1.0000,  0.9999,  0.9941],
        [ 0.9979,  0.9522, -0.6011,  ..., -0.9961,  1.0000,  0.8102],
        [-0.9988, -0.9836,  0.5567,  ..., -0.9992,  0.9727,  0.9850],
        ...,
        [-1.0000, -0.2794, -0.9850,  ..., -0.9980, -0.7468, -0.8928],
        [-0.9963,  1.0000, -1.0000,  ...,  0.7268, -0.9226, -0.9751],
        [ 0.9647,  0.1413, -0.8425,  ..., -0.9971,  1.0000,  0.9834]])

### Implementing the output layer

In [89]:
w2 = torch.randn((100,27))
b2 = torch.randn(27)

### Implementing negative log likelihood

In [93]:
logits = h @ w2
logits.shape

torch.Size([32, 27])

In [96]:
counts = logits.exp()
counts

tensor([[2.1387e-07, 2.9891e-06, 5.1301e-02, 2.6671e-06, 6.2228e-02, 4.6337e+01,
         1.2804e+00, 2.0264e-04, 7.5968e+04, 4.1393e-07, 4.4984e-06, 4.2773e+00,
         4.2082e+00, 1.1780e+07, 2.0331e+03, 3.4912e+02, 4.2228e+02, 3.7973e-10,
         1.2671e+00, 1.3128e+01, 7.0674e+04, 1.0002e+02, 6.7173e-06, 2.0433e-03,
         1.3051e-04, 1.5830e+00, 6.4485e-10],
        [3.0033e-01, 1.6264e-01, 1.6017e+01, 2.3802e-03, 3.4270e-03, 2.7046e-05,
         5.5357e+02, 9.5114e-02, 2.4107e+06, 1.4675e-04, 1.8604e-04, 1.4829e+04,
         1.4559e-03, 2.0066e+05, 5.1080e+02, 5.4738e-02, 2.2683e+03, 3.3991e-06,
         4.2263e-09, 4.8864e+05, 7.1900e+01, 1.0602e+04, 1.4767e-08, 1.4490e+02,
         1.4978e-02, 5.7936e+00, 2.5570e+04],
        [3.7206e-02, 8.8144e-09, 7.0131e-01, 7.4201e-09, 1.0269e-10, 1.6888e+03,
         2.7081e+01, 9.0253e-03, 6.6574e+01, 6.6991e-01, 3.9530e-06, 1.5852e+04,
         8.7460e+03, 7.6566e-04, 2.5354e+04, 1.8825e+01, 1.0847e+01, 1.9868e+00,
         1.8812e-

In [97]:
prob = counts/counts.sum(1,keepdim=True)

In [98]:
prob.shape

torch.Size([32, 27])

In [103]:
prob[0].sum()

tensor(1.0000)

In [107]:
loss = - prob[torch.arange(32),Y].log().mean()
loss

tensor(20.1827)

In [105]:
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [104]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])