In [1]:
#Hyper parameter --> a designer's choice parameter, can be as small or as big as you like
import torch
from torch import nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
print(device)

cuda


In [2]:

with open("names.txt", 'r', encoding="UTF-8") as f:
    text = f.read()
    words = text.splitlines()
    chars = sorted(set("".join(words)))

stoi = {char:integer+1 for integer,char in enumerate(chars)}
itos = {integer+1:char for integer,char in enumerate(chars)}

stoi["."] = 0
itos[0] = "."

vocab_size = len(stoi)


In [4]:
#Generating data
inputs, labels = [], []
context_size = 3

for w in words[:1]:
    context = [0] * context_size
    for char in w + ".":
        ix = stoi[char]
        inputs.append(context)
        labels.append(ix)
        context = context[1:] + [ix]

inputs = torch.tensor(inputs)
labels = torch.tensor(labels)

#print(inputs)
print(inputs.shape) #number of total characters in the words X block size (context size) --> there is a 1 new character for each dimension added, thus why it's n_c X b_s
print(labels.shape) #number of total characters in the words X 1


[[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1]]
torch.Size([5, 3])
torch.Size([5])


In [4]:
#The embedding is pretty handy because it can generalize to new scenarios never seen by the model
g = torch.Generator(device=device).manual_seed(2718472123)
emb = torch.randn((vocab_size, 2), generator=g)
print(emb.shape) #each character basically have 2 values, the neural network can prioritize one of the values depending on the context so to give a different answer as the next character


print(inputs[:5]) #grab the first word
print(emb[torch.tensor([[0, 0, 0]])]) #indexing at [0, 0, 0] is the same as taking the value emb[0, :] three times and stacking those tensors
print(emb[torch.tensor([[0, 0, 0]])].shape)
print(torch.stack((emb[0, :], emb[0, :], emb[0, :])))
print("")

visualization = emb[torch.tensor(
    [
        [0, 0, 0],
        [0, 0, 5],
        [0, 5, 13],
        [5, 13, 13],
        [13, 13, 1]
    ]
)]

print("\n\n Visualizing")
print(visualization)
print(visualization.shape) # 5 X 3 X 2 (5 inputs, 3 context characters (block size), 2 values per character) total of 5*3*2=30 values
print(torch.eq(visualization, emb[inputs[:5]]))
#they are the same

embedded = emb[inputs]
#print(embedded.shape)

torch.Size([27, 2])
tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]], device='cuda:0')
tensor([[[-0.4136,  0.0690],
         [-0.4136,  0.0690],
         [-0.4136,  0.0690]]], device='cuda:0')
torch.Size([1, 3, 2])
tensor([[-0.4136,  0.0690],
        [-0.4136,  0.0690],
        [-0.4136,  0.0690]], device='cuda:0')



 Visualizing
tensor([[[-0.4136,  0.0690],
         [-0.4136,  0.0690],
         [-0.4136,  0.0690]],

        [[-0.4136,  0.0690],
         [-0.4136,  0.0690],
         [ 1.3629,  0.3710]],

        [[-0.4136,  0.0690],
         [ 1.3629,  0.3710],
         [ 0.1636,  0.1639]],

        [[ 1.3629,  0.3710],
         [ 0.1636,  0.1639],
         [ 0.1636,  0.1639]],

        [[ 0.1636,  0.1639],
         [ 0.1636,  0.1639],
         [ 0.9522,  0.2234]]], device='cuda:0')
torch.Size([5, 3, 2])
tensor([[[True, True],
         [True, True],
         [True, True]],

        [[True, True],
         [True, True],
     

In [1]:
#print(embedded.shape)
embedded_for_forwardpass = embedded.view(embedded.shape[0], context_size*2)
print(embedded_for_forwardpass.shape)
print(embedded_for_forwardpass[0]) #This is the same as emb[[0, 0 ,0]] crammed together to form a single vector
print(emb[[0,0,0]])
print()
#Or can think of it as embedded[0, :] crammed together for form a single vector of 1 X 3*2 (3 context characters in total with 2 values each)
print(embedded[0, :])
#This way it can be passed to the hidden layer for the forward pass


NameError: name 'embedded' is not defined

In [6]:
#DOING THE FORWARD PASS


#Hidden layer
print(embedded_for_forwardpass.shape)
w1_n_neurons = 2000
W1 = torch.randn((embedded_for_forwardpass.shape[1], w1_n_neurons), generator=g)
b1 = torch.randn((w1_n_neurons), generator=g) #One bias per neuron

hidden_layer_output = torch.tanh(embedded_for_forwardpass @ W1 + b1) #torch.tanh is the activation function, very similar to sigmoid but quite different

print(hidden_layer_output)
print(hidden_layer_output.shape)

#Output layer
W2 = torch.randn((w1_n_neurons, vocab_size), generator=g)
#Number of features X vocab size (it will pull which character from the vocab size is expected)
b2 = torch.randn((vocab_size), generator=g) #One bias per neuron in the output layer

logits = hidden_layer_output @ W2 + b2
probs = torch.softmax(logits, dim=1)
print(probs.sum(dim=1)) #Every row sums to 1
print(f"PROBS SHAPE --> {probs.shape}") #228146 character predictions using a vocab of 27 characters

preds = torch.argmax(probs, dim=1)
print(preds)
print(f"PREDS SHAPE --> {preds.shape}")
print(f"LABELS SHAPE --> {labels.shape}")


torch.Size([228146, 6])
tensor([[ 0.9726,  0.4431, -0.0864,  ...,  0.0149, -0.7072,  0.1730],
        [ 0.6922, -0.8180,  0.9886,  ..., -0.0238,  0.2132,  0.9690],
        [ 0.7751,  0.7703,  0.8652,  ..., -0.8317, -0.9067, -0.3992],
        ...,
        [ 0.7867, -0.6697,  0.9998,  ..., -0.6898, -0.5859,  0.9968],
        [ 0.9975,  0.9968,  0.3273,  ..., -0.9427, -0.8403, -0.7195],
        [ 0.9586, -0.9914,  0.9985,  ..., -0.9999,  0.9998,  0.9991]],
       device='cuda:0')
torch.Size([228146, 2000])
tensor([1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')
PROBS SHAPE --> torch.Size([228146, 27])
tensor([25, 17, 19,  ...,  5,  0,  5], device='cuda:0')
PREDS SHAPE --> torch.Size([228146])
LABELS SHAPE --> torch.Size([228146])


In [7]:
loss = F.cross_entropy(logits, labels)
print(logits)
print(loss)

tensor([[  5.5468,  14.0438,   4.0803,  ...,  21.6168,  64.5432,  25.1775],
        [-43.1634,  -8.5669,   1.7233,  ...,  52.6536, -25.9140,   8.2162],
        [ 14.4222,   0.4754, -48.1010,  ..., -41.6979, -19.8595,  -3.8052],
        ...,
        [-11.2490,  -2.4094, -48.5172,  ...,  26.2349,  19.9011,  26.8131],
        [ 95.0431,  27.8424, -34.6679,  ..., -67.1100,  22.7324, -64.7000],
        [ 26.4629, -38.3026,  31.6897,  ...,  40.6187, -70.5282,  -5.8369]],
       device='cuda:0')
tensor(71.9009, device='cuda:0')


In [200]:
#Training
lr = -0.001
parameters = [emb, W1, b1, W2, b2]
embedded_for_forwardpass = emb[inputs].view(embedded.shape[0], context_size*2)
h = torch.tanh(embedded_for_forwardpass @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, labels)
for p in parameters:
    p.requires_grad = True
    p.grad = None

loss.backward()
for p in parameters:
    p.data += lr*p.grad

print(loss)

tensor(6.3359, device='cuda:0', grad_fn=<NllLossBackward0>)
