In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
# Read the name list, one name per line. The context manager closes the
# file handle as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# Build the character vocabulary and the character <-> index mappings

In [6]:
# Sorted list of every distinct character that occurs in any word.
chars = sorted({ch for word in words for ch in word})


In [7]:
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [8]:
# Character -> integer index. Numbering starts at 1, so index 0 is not
# assigned by this mapping.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}

In [9]:
stoi


{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [10]:
# Give '.' index 0. The dataset-building code appends '.' to every word and
# fills the initial context with 0s.
stoi['.'] = 0


In [11]:
stoi


{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [12]:
# Inverse mapping: integer index -> character.
itos = dict(zip(stoi.values(), stoi.keys()))

In [20]:
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [21]:
# Build (context, next-character) pairs from every word.

block_size = 3  # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words:
    # Each word starts from an all-zero context (index 0 is '.').
    context = [0] * block_size
    # '.' is appended so the end of the word is itself a prediction target.
    for ix in (stoi[ch] for ch in w + '.'):
        X.append(context)
        Y.append(ix)
        # Slide the window: drop the oldest index and append the newest.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

In [14]:
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Convert a list of words into context/target index tensors.

    Every word is terminated with '.', and each character (terminator
    included) yields one example: the target is that character's index and
    the context is the `block_size` indices that precede it, left-padded
    with 0. Uses the module-level `stoi` and `block_size`.

    Prints the shapes of both tensors before returning.

    Returns:
        (X, Y): X has one row of `block_size` context indices per example;
        Y has the matching target index per example.
    """
    contexts, targets = [], []
    padding = [0] * block_size
    for word in words:
        ids = [stoi[ch] for ch in word + '.']
        # With the padding in front, the window starting at `pos` is exactly
        # the context that precedes ids[pos].
        window = padding + ids
        for pos, target in enumerate(ids):
            contexts.append(window[pos:pos + block_size])
            targets.append(target)

    X, Y = torch.tensor(contexts), torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

import random

# Shuffle the words with a fixed seed, then split them 80% / 10% / 10% into
# train / dev / test sets. Note: the shuffle reorders `words` in place.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = [
    build_dataset(part) for part in (words[:n1], words[n1:n2], words[n2:])
]

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [15]:
# Embedding table: one 2-dimensional row for each of the 27 character indices.
C = torch.randn(27, 2)

In [16]:
C


tensor([[ 0.0361,  0.4885],
        [ 0.0533, -0.2031],
        [ 0.6363, -0.4911],
        [-0.6803,  1.1564],
        [ 0.7179, -0.2056],
        [-0.3365, -0.5927],
        [ 0.3686,  1.3327],
        [ 0.6379,  0.8276],
        [ 0.5147, -0.5502],
        [-0.2603,  1.2304],
        [ 0.2489, -0.8023],
        [ 0.8135,  0.9552],
        [-0.4613, -0.1111],
        [ 0.2042, -0.4401],
        [ 0.9298, -1.5218],
        [ 0.9145,  0.9651],
        [-1.8234,  0.4625],
        [ 1.4934,  1.0275],
        [-0.9016,  1.5394],
        [ 0.0285,  1.5345],
        [ 0.6796,  0.5618],
        [ 1.1976, -0.5799],
        [-0.9362, -0.3487],
        [ 1.0310, -0.4908],
        [-1.2604, -0.4092],
        [-0.3772,  0.3916],
        [ 0.7594, -0.4835]])

In [17]:
C[5] 


tensor([-0.3365, -0.5927])

In [18]:
F.one_hot(torch.tensor(5),num_classes=27).float()@C

tensor([-0.3365, -0.5927])

In [22]:
X.shape

torch.Size([228146, 3])

In [23]:
C[X]

tensor([[[ 0.0361,  0.4885],
         [ 0.0361,  0.4885],
         [ 0.0361,  0.4885]],

        [[ 0.0361,  0.4885],
         [ 0.0361,  0.4885],
         [-0.3772,  0.3916]],

        [[ 0.0361,  0.4885],
         [-0.3772,  0.3916],
         [ 1.1976, -0.5799]],

        ...,

        [[ 0.9145,  0.9651],
         [-0.9362, -0.3487],
         [ 0.0533, -0.2031]],

        [[-0.9362, -0.3487],
         [ 0.0533, -0.2031],
         [ 0.9298, -1.5218]],

        [[ 0.0533, -0.2031],
         [ 0.9298, -1.5218],
         [-0.2603,  1.2304]]])

In [24]:
C[X].shape

torch.Size([228146, 3, 2])

In [25]:
X[13,2]

tensor(18)

In [26]:
C[X][13,2] 


tensor([-0.9016,  1.5394])

In [27]:
emb = C[X] 
emb.shape

torch.Size([228146, 3, 2])

In [37]:
# Hidden layer: 6 inputs (3 context positions x 2 embedding dims) -> 100 units.
W1, b1 = torch.randn(6, 100), torch.randn(100)

In [28]:
# We need emb as (228146, 6) rather than (228146, 3, 2): flatten each example's three 2-dimensional embeddings into one row of 6.

In [38]:
# Flatten each example's (3, 2) block of embeddings into a row of 6, then
# apply the hidden layer followed by tanh.
flat_emb = emb.view(-1, 6)
h = torch.tanh(flat_emb @ W1 + b1)

In [39]:
h

tensor([[ 0.4032,  0.0865, -0.5076,  ...,  0.6410, -0.1398, -0.9597],
        [ 0.3678, -0.3318, -0.6415,  ...,  0.6211,  0.3788, -0.8937],
        [-0.5583,  0.9531, -0.5883,  ...,  0.4874, -0.9793, -0.9020],
        ...,
        [ 0.9780, -0.9890, -0.9990,  ...,  0.9089, -0.9877, -0.7066],
        [-0.9839,  0.9929,  0.2585,  ..., -0.2403, -0.7081, -0.3759],
        [ 0.9280, -0.9993, -0.9366,  ...,  0.9372,  0.4778, -0.9982]])

In [40]:
h.shape

torch.Size([228146, 100])

In [41]:
# Output layer: 100 hidden units -> 27 logits (one per character index).
W2, b2 = torch.randn(100, 27), torch.randn(27)

In [42]:
# Output layer scores: one row per example, one column per character index.
logits = h @ W2 + b2 

In [43]:
logits.shape

torch.Size([228146, 27])

In [44]:
logits

tensor([[ -8.6695,  -1.4701, -10.6531,  ...,  -6.7846,  -5.0195,   3.4941],
        [ -4.7855,  -5.1132, -10.9286,  ...,  -7.8906,  -2.6363,   2.4424],
        [-12.0126,  22.6463,  -3.3152,  ...,  -5.9904,   0.4106,  -8.2298],
        ...,
        [ -7.8274,   4.2940,  -0.0633,  ...,  -5.9034,  -2.1718,  -9.6683],
        [ -9.8818,  17.6640,  10.5999,  ...,   3.5152,   1.7104,   0.2975],
        [ -8.7941, -19.3735,   1.6455,  ...,   0.6196,   0.5459,  -4.5699]])

In [45]:
# Exponentiate the logits to get positive, unnormalised scores; the next
# cell divides each row by its sum to turn them into probabilities.
counts = logits.exp()

In [46]:
counts

tensor([[1.7174e-04, 2.2990e-01, 2.3627e-05,  ..., 1.1310e-03, 6.6080e-03,
         3.2920e+01],
        [8.3499e-03, 6.0168e-03, 1.7937e-05,  ..., 3.7424e-04, 7.1624e-02,
         1.1501e+01],
        [6.0671e-06, 6.8416e+09, 3.6326e-02,  ..., 2.5026e-03, 1.5078e+00,
         2.6660e-04],
        ...,
        [3.9865e-04, 7.3256e+01, 9.3866e-01,  ..., 2.7301e-03, 1.1397e-01,
         6.3256e-05],
        [5.1095e-05, 4.6923e+07, 4.0132e+04,  ..., 3.3622e+01, 5.5312e+00,
         1.3465e+00],
        [1.5162e-04, 3.8566e-09, 5.1838e+00,  ..., 1.8582e+00, 1.7261e+00,
         1.0359e-02]])

In [47]:
# Normalise every row so that it sums to 1.
row_totals = counts.sum(dim=1, keepdim=True)
prob = counts / row_totals

In [48]:
prob

tensor([[1.1077e-07, 1.4829e-04, 1.5240e-08,  ..., 7.2954e-07, 4.2623e-06,
         2.1234e-02],
        [5.6149e-06, 4.0460e-06, 1.2062e-08,  ..., 2.5165e-07, 4.8163e-05,
         7.7338e-03],
        [8.8663e-16, 9.9981e-01, 5.3086e-12,  ..., 3.6572e-13, 2.2034e-10,
         3.8960e-14],
        ...,
        [2.1038e-12, 3.8659e-07, 4.9536e-09,  ..., 1.4408e-11, 6.0145e-10,
         3.3382e-13],
        [1.5610e-15, 1.4336e-03, 1.2261e-06,  ..., 1.0272e-09, 1.6898e-10,
         4.1136e-11],
        [6.3378e-12, 1.6121e-16, 2.1668e-07,  ..., 7.7675e-08, 7.2152e-08,
         4.3300e-10]])

In [49]:
prob.shape

torch.Size([228146, 27])

In [51]:
prob[0].sum()

tensor(1.)

In [52]:
prob.shape[0]

228146

In [60]:
# Mean negative log-probability assigned to the correct next character:
# for each row, pick the column given by the matching entry of Y.
example_idx = torch.arange(prob.shape[0])
picked = prob[example_idx, Y]
loss = -picked.log().mean()

In [61]:
Y


tensor([25, 21,  8,  ..., 14,  9,  0])

In [62]:
loss

tensor(14.3079)

In [63]:
F.cross_entropy(logits,Y)

tensor(14.3079)

In [64]:
# Cleanup: re-create every parameter from a single seeded generator so the
# initial values are reproducible.
g = torch.Generator().manual_seed(2147483647)

# Shapes in draw order (the order matters for the generator's stream):
#   C  (27, 10)  - embedding table
#   W1 (30, 200), b1 (200,)  - hidden layer
#   W2 (200, 27), b2 (27,)   - output layer
param_shapes = [(27, 10), (30, 200), (200,), (200, 27), (27,)]
C, W1, b1, W2, b2 = [torch.randn(shape, generator=g) for shape in param_shapes]
parameters = [C, W1, b1, W2, b2]

In [65]:
sum(p.nelement() for p in parameters)

11897

In [66]:
# Turn on gradient tracking for every parameter tensor.
for p in parameters:
    p.requires_grad_(True)

In [68]:
# Train
# So far each iteration only runs the forward pass up to the hidden layer.
for _ in range(10):
    # Look up the embedding vector of every context index.
    emb = C[X]
    # Flatten each example's context embeddings into a single row. Keeping
    # the first dimension and inferring the second avoids a hard-coded
    # width, which must equal W1's input size (30 for the parameters
    # re-initialised above, not the earlier 6).
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)