In [15]:
# Implementation based on the following paper:

# https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [2]:
# Key concept: Words are embedded in an n-dimensional embedding space. The MLP learns to organize this space in a way that captures
#              semantics. The goal is to give similar embeddings to similar words. Each embedding dimension encodes part of the model's
#              compressed understanding. This allows the model to transfer knowledge and generalize to novel data.

In [3]:
import torch 
import torch.nn.functional as F
import matplotlib.pyplot as plt 
%matplotlib inline

In [4]:
# Load the dataset: one name per line of names.txt.
# The context manager closes the file handle as soon as the content is read
# (a bare open(...).read() leaves that to the garbage collector), and the explicit
# encoding makes the result independent of the platform's default locale.
with open('names.txt', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
# Number of names in the dataset
len(words)

32033

In [6]:
# Build the vocabulary: every distinct character that occurs in the dataset, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer id. enumerate(..., start=1) numbers the characters from 1,
# which keeps id 0 free for the start/end token added below.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
print(f'{stoi = }')  # Printed before '.' is added, so this output lists the letters only

# '.' marks both the start and the end of a word and takes id 0.
stoi['.'] = 0

# Inverse mapping (integer id -> character), built by swapping every (char, id) pair.
itos = dict((idx, ch) for ch, idx in stoi.items())
print(f'{itos = }')

stoi = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
itos = {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [7]:
# enumerate() pairs every character with its 0-based position; stoi shifts these by +1 so that index 0 stays free for '.'
list(enumerate(chars))

[(0, 'a'),
 (1, 'b'),
 (2, 'c'),
 (3, 'd'),
 (4, 'e'),
 (5, 'f'),
 (6, 'g'),
 (7, 'h'),
 (8, 'i'),
 (9, 'j'),
 (10, 'k'),
 (11, 'l'),
 (12, 'm'),
 (13, 'n'),
 (14, 'o'),
 (15, 'p'),
 (16, 'q'),
 (17, 'r'),
 (18, 's'),
 (19, 't'),
 (20, 'u'),
 (21, 'v'),
 (22, 'w'),
 (23, 'x'),
 (24, 'y'),
 (25, 'z')]

In [8]:
# Build the dataset: every example pairs a fixed-length context with the character that follows it.

block_size = 3 # Context length: how many preceding characters are used to predict the next one
X, Y = [], []

for w in words[:5]: # Only the first 5 words, so the printed examples stay readable
    print(w)
    context = [0 for _ in range(block_size)] # Start from a context made only of '.' (id 0), here [0, 0, 0]
    for ch in w + '.': # The trailing '.' turns the end of the word into a prediction target as well
        ix = stoi[ch]
        X += [context]
        Y += [ix]
        print(''.join(map(itos.__getitem__, context)), ' ----> ', itos[ix])

        # Slide the window one step to the right: drop the oldest id and append the newest.
        # A new list is built each time, so the lists already stored in X are never modified.
        context = [*context[1:], ix]

# Convert the collected Python lists into integer tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
...  ---->  e
..e  ---->  m
.em  ---->  m
emm  ---->  a
mma  ---->  .
olivia
...  ---->  o
..o  ---->  l
.ol  ---->  i
oli  ---->  v
liv  ---->  i
ivi  ---->  a
via  ---->  .
ava
...  ---->  a
..a  ---->  v
.av  ---->  a
ava  ---->  .
isabella
...  ---->  i
..i  ---->  s
.is  ---->  a
isa  ---->  b
sab  ---->  e
abe  ---->  l
bel  ---->  l
ell  ---->  a
lla  ---->  .
sophia
...  ---->  s
..s  ---->  o
.so  ---->  p
sop  ---->  h
oph  ---->  i
phi  ---->  a
hia  ---->  .


In [9]:
# Inspect the inputs: one row per training example, each holding block_size character indices
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [10]:
# Sanity-check the shapes and dtypes of the inputs X and the labels Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [11]:
# Create the lookup table "C" of embeddings per character

# Seed PyTorch's global random number generator first. Without a seed, every kernel
# restart draws a different table and the values printed below cannot be reproduced.
# Random tensors created afterwards (in execution order) become deterministic as well.
torch.manual_seed(2147483647)

C = torch.randn((27,2)) # 27 characters (26 letters + '.') embedded in a 2-dimensional space; one row per character index.
print(f'{C.shape = }, \n{C.dtype = }, \n{C = }')

C.shape = torch.Size([27, 2]), 
C.dtype = torch.float32, 
C = tensor([[-0.3624, -0.4966],
        [-1.2292,  0.1354],
        [ 0.7915, -0.2870],
        [ 0.2531, -1.1512],
        [ 0.1504, -2.2786],
        [-1.8882,  0.2456],
        [ 2.1242,  0.9396],
        [-1.4602, -1.1785],
        [ 0.6620,  0.2064],
        [-1.7284, -0.0120],
        [-1.6734, -1.3556],
        [-0.7919, -1.3280],
        [ 0.3607, -0.3684],
        [ 1.3874, -0.6661],
        [-1.2694, -0.9046],
        [-0.5850,  0.1864],
        [-1.0960,  0.5380],
        [ 0.2761,  0.2254],
        [ 0.2845,  1.1659],
        [ 2.5072,  1.6047],
        [ 0.5344, -1.1039],
        [ 0.4764, -0.5298],
        [-0.7484, -0.3143],
        [-0.2986, -0.3627],
        [ 0.8337,  0.0834],
        [ 0.0124, -0.5235],
        [-0.2294, -0.5578]])


In [12]:
C[X].shape # We can index into the tensor C using another (integer) tensor.

# The shape is (sample x character x embedding)
# Remember 1 "sample" is made of 3 characters (the preceding 3, to be exact), and that is why the first two dimensions are 32 x 3,
# and an "embedding" consists of 2 values, which is why the last dimension is 2.

torch.Size([32, 3, 2])

In [13]:
# Explanation of PyTorch advanced (integer-tensor) indexing

# What happens in C[X]?
# C[X] does not index C as if C were 3D. Instead, PyTorch's advanced indexing treats C as the lookup table
# and every entry of the integer tensor X as an index into it. Here's how it works:

# Indexing mechanism:
# X specifies which rows of C to select.
# Each ENTRY of X is an index that tells PyTorch which row of C to pick.
# The result keeps the shape of X and appends the row dimension of C: (32, 3) -> (32, 3, 2).

print(f'{C[0] = }') # This selects the first row of C, i.e. the embedding stored for index 0 ('.')
print(f'{C[X][0,:,0] = }') # This grabs the first embedding component of each character in the first sample (sample = context, made of 3 characters)
                           # The first context is [0, 0, 0], so all three values are equal to C[0][0].
                           # The last (3rd) dimension is the one that holds the embedding values of each character.


C[0] = tensor([-0.3624, -0.4966])
C[X][0,:,0] = tensor([-0.3624, -0.3624, -0.3624])


In [14]:
# Embed the whole dataset at once: every index in X is replaced by the matching row of C
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [31]:
# Build hidden layer

# Weight matrix: one row per input value, one column per neuron.
# The NN looks at the previous 3 characters to predict the next one, and every
# character is embedded with 2 values (2D embedding space), so each example
# provides 3 * 2 = 6 input values. The layer width of 100 neurons is a free choice.
W1 = torch.randn(6, 100)

# Bias vector: one bias per neuron.
b1 = torch.randn((100,))

In [17]:
# Inspect the embedded inputs: one 3 x 2 block per example (3 context characters, 2 embedding values each)
emb

tensor([[[-0.3624, -0.4966],
         [-0.3624, -0.4966],
         [-0.3624, -0.4966]],

        [[-0.3624, -0.4966],
         [-0.3624, -0.4966],
         [-1.8882,  0.2456]],

        [[-0.3624, -0.4966],
         [-1.8882,  0.2456],
         [ 1.3874, -0.6661]],

        [[-1.8882,  0.2456],
         [ 1.3874, -0.6661],
         [ 1.3874, -0.6661]],

        [[ 1.3874, -0.6661],
         [ 1.3874, -0.6661],
         [-1.2292,  0.1354]],

        [[-0.3624, -0.4966],
         [-0.3624, -0.4966],
         [-0.3624, -0.4966]],

        [[-0.3624, -0.4966],
         [-0.3624, -0.4966],
         [-0.5850,  0.1864]],

        [[-0.3624, -0.4966],
         [-0.5850,  0.1864],
         [ 0.3607, -0.3684]],

        [[-0.5850,  0.1864],
         [ 0.3607, -0.3684],
         [-1.7284, -0.0120]],

        [[ 0.3607, -0.3684],
         [-1.7284, -0.0120],
         [-0.7484, -0.3143]],

        [[-1.7284, -0.0120],
         [-0.7484, -0.3143],
         [-1.7284, -0.0120]],

        [[-0.7484, -0

In [16]:
# Now, the layer has to perform a matrix multiplication to operate on the inputs:

# Something like emb @ W1 + b1

# But this can't be done directly: emb has shape (32, 3, 2) while W1 has shape (6, 100), so the inner dimensions (2 vs. 6) do not match.
# Therefore, emb has to be rearranged into shape (32, 6) first, e.g. through concatenation.


torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1)    # This grabs the embeddings of the 1st char, 2nd char, and 3rd char (each of shape (32, 2))
                                                            # and joins them side by side along the columns (dim 1), giving shape (32, 6)

# The problem is that this code does not generalize: the tensors passed to cat are listed by hand,
# so the list would have to be rewritten whenever the context length (block_size) changes.

tensor([[-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -1.8882,  0.2456],
        [-0.3624, -0.4966, -1.8882,  0.2456,  1.3874, -0.6661],
        [-1.8882,  0.2456,  1.3874, -0.6661,  1.3874, -0.6661],
        [ 1.3874, -0.6661,  1.3874, -0.6661, -1.2292,  0.1354],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.5850,  0.1864],
        [-0.3624, -0.4966, -0.5850,  0.1864,  0.3607, -0.3684],
        [-0.5850,  0.1864,  0.3607, -0.3684, -1.7284, -0.0120],
        [ 0.3607, -0.3684, -1.7284, -0.0120, -0.7484, -0.3143],
        [-1.7284, -0.0120, -0.7484, -0.3143, -1.7284, -0.0120],
        [-0.7484, -0.3143, -1.7284, -0.0120, -1.2292,  0.1354],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -1.2292,  0.1354],
        [-0.3624, -0.4966, -1.2292,  0.1354, -0.7484, -0.3143],
        [-1.2292,  0.1354, -0.7484, -0.3

In [20]:
# Better way: torch.unbind(emb, 1) removes dimension 1 and returns a tuple of its slices,
# i.e. the same tensors as [emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], but it works for any context length.

torch.cat(torch.unbind(emb, 1), 1)

tensor([[-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -1.8882,  0.2456],
        [-0.3624, -0.4966, -1.8882,  0.2456,  1.3874, -0.6661],
        [-1.8882,  0.2456,  1.3874, -0.6661,  1.3874, -0.6661],
        [ 1.3874, -0.6661,  1.3874, -0.6661, -1.2292,  0.1354],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.5850,  0.1864],
        [-0.3624, -0.4966, -0.5850,  0.1864,  0.3607, -0.3684],
        [-0.5850,  0.1864,  0.3607, -0.3684, -1.7284, -0.0120],
        [ 0.3607, -0.3684, -1.7284, -0.0120, -0.7484, -0.3143],
        [-1.7284, -0.0120, -0.7484, -0.3143, -1.7284, -0.0120],
        [-0.7484, -0.3143, -1.7284, -0.0120, -1.2292,  0.1354],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -1.2292,  0.1354],
        [-0.3624, -0.4966, -1.2292,  0.1354, -0.7484, -0.3143],
        [-1.2292,  0.1354, -0.7484, -0.3

In [24]:
# But there is actually a more efficient way.

# Demonstration:

a = torch.arange(18)
print(f'{a = }, \n{a.shape = }, \n\n{a.storage() = }') # Note: a.storage() returns a TypedStorage object, which is deprecated (this line triggers a warning).
                                                        # PyTorch recommends .untyped_storage() instead.

# Note: In computer memory the data of a tensor is always stored as a flat, 1D sequence of numbers!
#       PyTorch then interprets this number sequence as a tensor of specific dimensions.

a = tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]), 
a.shape = torch.Size([18]), 

a.storage() =  0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]


  print(f'{a = }, \n{a.shape = }, \n\n{a.storage() = }')


In [29]:
# On the other hand,

print(f'{a.view(3,3,2) = }') # a.view(9, 2), a.view(6, 3), etc. work as well: any shape whose sizes multiply to 18 (the number of elements)

#  .view() is therefore more efficient than .cat() in PyTorch because .view() only alters the shape of the tensor by modifying its metadata, 
#  without changing the underlying data. This operation is computationally inexpensive since no new memory allocation occurs. 
#  In contrast, .cat() creates a new tensor by concatenating existing tensors, which involves copying data and allocating new memory, 
#  making it more resource-intensive.

# So instead we can do (32 samples, 3 characters * 2 embedding values = 6 columns):

print(f'\n{emb.view(32, 6) = }')


a.view(3,3,2) = tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

emb.view(32, 6) = tensor([[-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -1.8882,  0.2456],
        [-0.3624, -0.4966, -1.8882,  0.2456,  1.3874, -0.6661],
        [-1.8882,  0.2456,  1.3874, -0.6661,  1.3874, -0.6661],
        [ 1.3874, -0.6661,  1.3874, -0.6661, -1.2292,  0.1354],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.3624, -0.4966],
        [-0.3624, -0.4966, -0.3624, -0.4966, -0.5850,  0.1864],
        [-0.3624, -0.4966, -0.5850,  0.1864,  0.3607, -0.3684],
        [-0.5850,  0.1864,  0.3607, -0.3684, -1.7284, -0.0120],
        [ 0.3607, -0.3684, -1.7284, -0.0120, -0.7484, -0.3143],
        [-1.7284, -0.0120, -0.7484, -0.3143, -1.7284, -0.0120],
        [-0.7484, -0.3143, -1.7284, -0.0120, -1.2292,  0.1354],
        [-0.3624, -0

In [38]:
# Perform hidden layer operation

# emb.view(32, 6) @ W1 + b1
emb.view(emb.shape[0], 6) @ W1 + b1     # Even better, because the number of samples is not hardcoded (the 6 still is).
                                        # emb.shape[0] selects the first dimension given by emb.shape.
                                        # Can also use -1 as the first dim so PyTorch derives it itself, knowing that the number of elements has to
                                        # be equal to the original tensor's.

# These are the pre-activations of the hidden layer, i.e. its values before the activation function is applied

tensor([[ 2.1129, -2.6930,  1.4483,  ...,  0.9303,  0.6698,  0.1502],
        [-0.2639, -2.2789,  2.2134,  ...,  0.3233,  1.9377,  0.0224],
        [ 3.6410, -4.4634, -0.0771,  ...,  2.3118,  0.3872,  2.2425],
        ...,
        [ 1.3664, -2.8027,  0.3359,  ...,  1.9897,  0.5240,  3.0480],
        [ 0.1678, -1.4718,  1.6818,  ..., -0.2888,  1.5553,  0.0467],
        [-0.7058, -0.3824,  0.1012,  ...,  1.1909,  1.8748,  0.3961]])

In [41]:
# Calculate activations

# Flatten every example's 3 x 2 embeddings into 6 inputs, apply the affine map (weights W1, biases b1),
# then squash the result with tanh so that all activations lie between -1 and 1.
h = (torch.matmul(emb.view(emb.size(0), 6), W1) + b1).tanh()

print(f'{h = }, \n\n{h.shape = }')

h = tensor([[ 0.9712, -0.9909,  0.8954,  ...,  0.7307,  0.5849,  0.1491],
        [-0.2580, -0.9792,  0.9764,  ...,  0.3125,  0.9594,  0.0224],
        [ 0.9986, -0.9997, -0.0769,  ...,  0.9806,  0.3689,  0.9777],
        ...,
        [ 0.8779, -0.9927,  0.3238,  ...,  0.9633,  0.4808,  0.9955],
        [ 0.1662, -0.8999,  0.9331,  ..., -0.2810,  0.9147,  0.0466],
        [-0.6080, -0.3648,  0.1009,  ...,  0.8309,  0.9540,  0.3766]]), 

h.shape = torch.Size([32, 100])


In [45]:
# Broadcasting correctness check:

print(f'{(emb.view(emb.shape[0], 6) @ W1).shape = }')
print(f'\n{b1.shape = }')

# Internally, broadcasting first aligns the two shapes from the right:

# 32, 100
#     100

# The missing leading dimension of b1 is then treated as a dimension of size 1 (this turns b1 into a row vector):

# 32, 100
#  1, 100

# Finally, this row vector is (virtually) copied along dim 0, i.e. vertically,
# to match the 32, 100 shape that the element-wise addition needs.

# Conclusion: broadcasting in this case achieves the desired operation, as the element-wise addition uses the same bias values for every row,
# meaning each neuron (dim 1) adds its own bias, and that bias is identical for every sample (dim 0).

(emb.view(emb.shape[0], 6) @ W1).shape = torch.Size([32, 100])

b1.shape = torch.Size([100])
