In [98]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [99]:
# Read the dataset (one name per line). The context manager closes the file
# handle right away instead of leaving it open for the lifetime of the kernel.
with open('names.txt') as f:
    names = f.read().splitlines()
print(names[:10]) # first 10 names

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [100]:
# Vocabulary: every distinct character that occurs in the names, in sorted order
chars = sorted(set(''.join(names)))
# Characters are numbered from 1; index 0 is reserved for the special '.' token
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup (index -> character), built in the same insertion order as stoi
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [101]:
# Context length and dataset construction
block_size = 3 # context length: how many previous characters are used to predict the next one
# Build the dataset: X collects context windows, Y the index of the character that follows each window
X = []
Y = []
for name in names[:5]: # only the first 5 names, which keeps the printed trace short
    print(name)
    context = [0] * block_size # start from an all-'.' (index 0) context
    for ch in name + '.': # the trailing '.' adds an end-of-name target
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '->', itos[ix])
        context = context[1:] + [ix] # slide the window: drop the oldest index, append the new one (creates a new list)
# Convert to tensors
X = torch.tensor(X)
Y = torch.tensor(Y)
print(X.shape, Y.shape) # (number of examples, block_size), (number of examples,)

emma
... -> e
..e -> m
.em -> m
emm -> a
mma -> .
olivia
... -> o
..o -> l
.ol -> i
oli -> v
liv -> i
ivi -> a
via -> .
ava
... -> a
..a -> v
.av -> a
ava -> .
isabella
... -> i
..i -> s
.is -> a
isa -> b
sab -> e
abe -> l
bel -> l
ell -> a
lla -> .
sophia
... -> s
..s -> o
.so -> p
sop -> h
oph -> i
phi -> a
hia -> .
torch.Size([32, 3]) torch.Size([32])


In [102]:
X.shape, X.dtype, Y.shape, Y.dtype # X: (number of examples, block_size), Y: (number of examples,); both hold int64 indices

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [103]:
# NOTE(review): no random seed is set in the visible cells, so these values change from run to run
C = torch.randn((27, 2)) # embedding table: one random 2-dimensional row per vocabulary index (27 rows)

In [104]:
# What does C represent?
# C is a matrix of shape (27, 2) where each row corresponds to a character in the vocabulary.
# The first row corresponds to the padding character (index 0), and the rest correspond to the characters in the vocabulary.
# Each character is represented by a 2-dimensional vector, which can be thought of as an embedding for that character.
# The embedding is initialized randomly, and during training, it will be updated to capture the relationships between characters.
# The embedding allows the model to learn a representation of characters that captures their similarities and differences.
# The embedding can be used as input to a neural network, allowing the model to learn patterns in the data.
# The embedding can also be used for visualization, allowing us to see how characters are related in the embedding space.
# The embedding can be used for various tasks such as character-level language modeling, text generation, and more.
# Are they weights or biases?
# They are weights. In a neural network, weights are parameters that are learned during training.
# Weights are used to transform the input data into a different representation, allowing the model to learn patterns in the data.

In [105]:
C[5] # row 5 of C: the embedding for index 5, which itos maps to 'e'

tensor([ 0.1676, -0.3755])

In [106]:
F.one_hot(torch.tensor(5), num_classes=27) # one-hot vector for index 5 ('e'): length 27 with a single 1 at position 5


tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [107]:
F.one_hot(torch.tensor(5), num_classes=27).shape # (27,): one slot per vocabulary index

torch.Size([27])

In [108]:
C[5] # direct row lookup in the embedding matrix C for index 5 ('e')

tensor([ 0.1676, -0.3755])

In [109]:
# F.one_hot(torch.tensor(5), num_classes=27) @ C # embedding for character '5' using one-hot encoding

In [110]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C # one-hot row times C picks out row 5, i.e. the same values as C[5]; one_hot returns integers, hence the .float() cast

tensor([ 0.1676, -0.3755])

In [111]:
# C[5] == F.one_hot(torch.tensor(5), num_classes=27).float() @ C

In [112]:
C[torch.tensor([5, 6, 7,7,7,7,7])] # index with a tensor of indices: one row of C per entry (repeats allowed), giving shape (7, 2)

tensor([[ 0.1676, -0.3755],
        [ 0.1135,  0.5526],
        [-1.1090, -1.8569],
        [-1.1090, -1.8569],
        [-1.1090, -1.8569],
        [-1.1090, -1.8569],
        [-1.1090, -1.8569]])

In [113]:
C[X] # index C with the whole integer tensor X: one 2-dimensional embedding per context position of every example

tensor([[[-0.5505,  0.9653],
         [-0.5505,  0.9653],
         [-0.5505,  0.9653]],

        [[-0.5505,  0.9653],
         [-0.5505,  0.9653],
         [ 0.1676, -0.3755]],

        [[-0.5505,  0.9653],
         [ 0.1676, -0.3755],
         [-0.3721, -0.7949]],

        [[ 0.1676, -0.3755],
         [-0.3721, -0.7949],
         [-0.3721, -0.7949]],

        [[-0.3721, -0.7949],
         [-0.3721, -0.7949],
         [ 0.5172, -1.0980]],

        [[-0.5505,  0.9653],
         [-0.5505,  0.9653],
         [-0.5505,  0.9653]],

        [[-0.5505,  0.9653],
         [-0.5505,  0.9653],
         [ 0.6375,  0.2702]],

        [[-0.5505,  0.9653],
         [ 0.6375,  0.2702],
         [ 0.1712, -0.7044]],

        [[ 0.6375,  0.2702],
         [ 0.1712, -0.7044],
         [ 0.0332, -0.0345]],

        [[ 0.1712, -0.7044],
         [ 0.0332, -0.0345],
         [ 0.5857,  0.6051]],

        [[ 0.0332, -0.0345],
         [ 0.5857,  0.6051],
         [ 0.0332, -0.0345]],

        [[ 0.5857,  0

In [114]:
X.shape # (number of examples, block size)

torch.Size([32, 3])

In [115]:
X[:1] # first example kept as a 2-D slice: the all-padding context [0, 0, 0]

tensor([[0, 0, 0]])

In [116]:
C[X[0]] # embeddings of the first context: row C[0] repeated three times

tensor([[-0.5505,  0.9653],
        [-0.5505,  0.9653],
        [-0.5505,  0.9653]])

In [117]:
C.shape # (27, 2) embedding matrix for 27 characters with 2-dimensional embedding

torch.Size([27, 2])

In [118]:
C[X].shape # (32, 3, 2): examples x context positions x embedding dimensions

torch.Size([32, 3, 2])

In [119]:
names[13][2] # character at string position 2 of names[13] -- this indexes a name, not row 13 of X

'l'

In [120]:
C[stoi[names[13][2]]] # embedding for 'l', the character at position 2 of names[13]

tensor([ 0.1712, -0.7044])

In [121]:
C[X][13,2] # embedding at example index 13, context position 2 (the last of the 3 slots)

tensor([ 0.5172, -1.0980])

In [122]:
C[X[13][2]] # same lookup in two steps: read the index stored at X[13][2], then take that row of C

tensor([ 0.5172, -1.0980])

In [123]:
X[13] # the three token indices that make up the context at example index 13

tensor([0, 0, 1])

In [124]:
# Decode the context at example index 13 back into characters
for i in range(3): # 3 == block_size
    print(itos[X[13][i].item()], end='') # .item() turns the 0-d tensor into a plain int key for itos

..a

In [125]:
# X[13] is [0, 0, 1], so X[13][2] == 1 and C[X[13][2]] is the same row as C[1] (the embedding of 'a');
# X[13][1] is 0, which selects C[0] instead.
C.shape

torch.Size([27, 2])

Embedding and Weights

In [126]:
emb = C[X] # embedding for all characters in X
emb.shape # (number of examples, block size, embedding dimension)

torch.Size([32, 3, 2])

In [127]:
emb[0] # embedding of ...

tensor([[-0.5505,  0.9653],
        [-0.5505,  0.9653],
        [-0.5505,  0.9653]])

In [128]:
emb[1] # embedding of the ..e

tensor([[-0.5505,  0.9653],
        [-0.5505,  0.9653],
        [ 0.1676, -0.3755]])

In [129]:
emb[3] # embedding of the emm

tensor([[ 0.1676, -0.3755],
        [-0.3721, -0.7949],
        [-0.3721, -0.7949]])

An embedding is the vector assigned to each character. In the examples above, every input to the network consists of 3 embeddings because the block size is 3.

#### Let's start the main Implementation

In [130]:
# Each example has block_size = 3 context positions with a 2-dimensional embedding each, so the flattened input has 3 * 2 = 6 features
W1 = torch.randn((6, 100)) # weights: 6 input features -> 100 hidden units
b1 = torch.randn(100) # one bias per hidden unit

In [None]:
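# emb @ W1 + b1 # linear transformation from embedding to hidden layer -- fails as written: emb is (32, 3, 2) but W1 expects 6 input features, so emb must be flattened to (32, 6) first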

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [147]:
# Concatenate the embeddings of the three context positions along dimension 1 (the feature axis):
torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],1).shape  # three (32, 2) slices -> (32, 6)

torch.Size([32, 6])

In [148]:
torch.cat(torch.unbind(emb,1),1).shape # unbind splits emb along dim 1 into its per-position slices, so no slice is written out by hand

torch.Size([32, 6])

In [149]:
(torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],1)==torch.cat(torch.unbind(emb,1),1)).all() # Check if they are equal

tensor(True)

##### Advanced feature: flattening with view

In [153]:
a = torch.randn(18) # random 1-D tensor with 18 elements

In [154]:
a.shape

torch.Size([18])

In [157]:
a.view(3,3,2) # reinterpret the same 18 elements as shape (3, 3, 2)

tensor([[[ 0.5086, -0.6824],
         [ 0.7816, -0.8055],
         [-0.7320, -1.0116]],

        [[-0.1844,  0.1980],
         [-0.1459,  1.4125],
         [ 0.8560, -1.5703]],

        [[-1.1208,  0.0909],
         [-1.1916, -0.0298],
         [ 1.9229,  0.1539]]])

In [158]:
# The flat 1-D buffer behind `a`; the (3, 3, 2) view lists these 18 values in the same order.
# NOTE(review): Tensor.storage() returns a TypedStorage, which newer PyTorch releases deprecate -- confirm and consider untyped_storage().
a.storage()

  a.storage()


 0.5085802674293518
 -0.6823678016662598
 0.7815552949905396
 -0.8054583668708801
 -0.7320087552070618
 -1.0115565061569214
 -0.1844131499528885
 0.19803953170776367
 -0.1458822786808014
 1.4125443696975708
 0.8559510707855225
 -1.570260763168335
 -1.1207613945007324
 0.09086468070745468
 -1.1915953159332275
 -0.029782384634017944
 1.9229425191879272
 0.15391285717487335
[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 18]

In [159]:
emb.view(32, 6) # reshape to (32, 6)

tensor([[-0.5505,  0.9653, -0.5505,  0.9653, -0.5505,  0.9653],
        [-0.5505,  0.9653, -0.5505,  0.9653,  0.1676, -0.3755],
        [-0.5505,  0.9653,  0.1676, -0.3755, -0.3721, -0.7949],
        [ 0.1676, -0.3755, -0.3721, -0.7949, -0.3721, -0.7949],
        [-0.3721, -0.7949, -0.3721, -0.7949,  0.5172, -1.0980],
        [-0.5505,  0.9653, -0.5505,  0.9653, -0.5505,  0.9653],
        [-0.5505,  0.9653, -0.5505,  0.9653,  0.6375,  0.2702],
        [-0.5505,  0.9653,  0.6375,  0.2702,  0.1712, -0.7044],
        [ 0.6375,  0.2702,  0.1712, -0.7044,  0.0332, -0.0345],
        [ 0.1712, -0.7044,  0.0332, -0.0345,  0.5857,  0.6051],
        [ 0.0332, -0.0345,  0.5857,  0.6051,  0.0332, -0.0345],
        [ 0.5857,  0.6051,  0.0332, -0.0345,  0.5172, -1.0980],
        [-0.5505,  0.9653, -0.5505,  0.9653, -0.5505,  0.9653],
        [-0.5505,  0.9653, -0.5505,  0.9653,  0.5172, -1.0980],
        [-0.5505,  0.9653,  0.5172, -1.0980,  0.5857,  0.6051],
        [ 0.5172, -1.0980,  0.5857,  0.6

In [160]:
emb.view(32, 6) == torch.cat(torch.unbind(emb,1),1) # element-wise comparison; all True means both flattenings give the same (32, 6) tensor

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

#### Continue with our model

In [170]:
# Hidden layer: flatten each example's context embeddings into one row, apply the
# affine map (W1, b1), then squash with tanh.
# The row count is taken from emb and the feature width is inferred (-1), so a
# mismatch with W1 raises a shape error; view(-1, 6) would instead silently regroup
# rows whenever the total element count happens to be divisible by 6.
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # b1 has one entry per hidden unit and broadcasts across the rows

In [172]:
h

tensor([[-0.9964,  0.5287,  0.9678,  ..., -0.7641,  0.3293,  0.1134],
        [-0.9962, -0.3652,  0.7386,  ..., -0.9020, -0.1333,  0.0140],
        [-0.6333, -0.1956, -0.9738,  ..., -0.6146,  0.8490, -0.3872],
        ...,
        [ 0.8839,  0.8141, -0.9991,  ...,  0.9997, -0.1372,  0.9892],
        [ 0.9955, -0.9994, -0.8716,  ..., -0.0252,  0.8697,  0.9970],
        [-0.4507, -0.9166, -0.9165,  ..., -0.8108,  0.5001, -0.0135]])

In [171]:
h.shape # (number of examples, hidden layer size)

torch.Size([32, 100])

In [173]:
### Warning: check that the bias addition broadcasts the way you intend
# emb.view(-1, 6) @ W1 has shape (32, 100)
# b1 has shape (100,); broadcasting aligns shapes from the right and treats b1 as (1, 100), so the same bias row is added to each of the 32 examples

In [None]:
W2 = torch.randn((100, 27)) # 100-dimensional hidden layer to 27-dimensional output layer as we have 27 characters
b2 = torch.randn(27) # bias for output layer

In [179]:
logits = h @ W2 + b2 # linear transformation from hidden layer to output layer
logits.shape # (number of examples, output layer size)

torch.Size([32, 27])

##### Internal

In [None]:
# counts = logits.exp() # exponentiate the logits to get counts
# prob = counts / counts.sum(1, keepdim=True) # normalize to get probabilities
# prob.shape # (number of examples, output layer size)
# loss = -prob[torch.arange(32), Y].log().mean() # negative log likelihood loss
# loss

tensor(15.7007)

In [194]:
# F.cross_entropy gives the same mean negative log-likelihood as the manual exp / normalize / log version (both show 15.7007)
loss = F.cross_entropy(logits, Y) # preferred over the manual version: more efficient and numerically stable
loss

tensor(15.7007)

In [184]:
torch.arange(32) # 32 examples

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [None]:
# prob[torch.arange(32), Y] # probabilities of the true labels

tensor([4.8424e-08, 6.3380e-13, 1.9539e-09, 1.0754e-08, 2.6381e-08, 1.1545e-06,
        7.3593e-08, 7.4603e-10, 4.9412e-11, 5.2409e-07, 3.3851e-04, 1.3655e-05,
        1.3326e-07, 7.5386e-05, 1.8526e-12, 2.0684e-06, 2.9216e-04, 1.4561e-07,
        1.3808e-12, 2.5120e-07, 1.5755e-05, 2.6834e-11, 2.6447e-10, 3.3107e-08,
        1.2383e-06, 5.8305e-07, 1.1109e-05, 6.3519e-10, 7.9154e-01, 2.5525e-04,
        2.3577e-05, 9.5987e-04])

In [182]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])