In [2]:
import torch
from torch import nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
print(f"Default device set to {device}")

Default device set to cuda


In [3]:
with open("names.txt", "r", encoding="UTF-8") as f:
    text = f.read()
    words = text.splitlines()
    chars = sorted(set(("".join(words))))

stoi = {char:integer+1 for integer,char in enumerate(chars)}
itos = {integer+1:char for integer,char in enumerate(chars)}

stoi["."] = 0
itos[0] = "."

vocab_size = len(stoi)

In [4]:
inputs, labels = [], []
context_size = 3 #Block size

for w in words[:]:
    context = [0] * context_size
    for char in w + ".":
        ix = stoi[char]
        inputs.append(context)
        labels.append(ix)
        context = context[1:] + [ix]

inputs = torch.tensor(inputs)
labels = torch.tensor(labels)



In [5]:
embedding = torch.randn((vocab_size, 2))
embedded = embedding[inputs]
print(embedded[:1])
embedded_feedforward = embedded.view(embedded.shape[0], context_size*2)
print(embedded_feedforward[:1])

tensor([[[-1.1948, -1.1598],
         [-1.1948, -1.1598],
         [-1.1948, -1.1598]]], device='cuda:0')
tensor([[-1.1948, -1.1598, -1.1948, -1.1598, -1.1948, -1.1598]],
       device='cuda:0')


In [9]:
#Defning params for forward pass

# Inputs @ Weights + biases
# m X n    n X p

# Result --> embedded_feedforward.shape[1] X p

w1_n_neurons = 2000
w2_n_neurons = vocab_size #27 different characters to be predicted

W1 = torch.randn((embedded_feedforward.shape[1], w1_n_neurons))
b1 = torch.randn((w1_n_neurons))

# Hidden Layer @ W2 + biases2
# n X p            p X vocab_size

W2 = torch.randn((w1_n_neurons, w2_n_neurons))
b2 = torch.randn((w2_n_neurons))

params = [embedding, W1, b1, W2, b2]

In [10]:
from tqdm import tqdm

for param in params:
    param.requires_grad = True

In [350]:

lr = -0.01
epochs = 1000

for epoch in tqdm(range(epochs)):
    for param in params:
        param.grad = None

    embedded_feedforward = embedding[inputs].view(embedded.shape[0], context_size*2)
    hidden_layer = torch.tanh(embedded_feedforward @ W1 + b1)
    logits = hidden_layer @ W2 + b2
    loss = F.cross_entropy(logits, labels)

    loss.backward()

    for param in params:
        param.data += lr * param.grad

loss = F.cross_entropy(logits, labels)
print(loss)



100%|██████████| 1000/1000 [02:34<00:00,  6.49it/s]


tensor(2.6537, device='cuda:0', grad_fn=<NllLossBackward0>)


In [357]:
print(inputs.shape)
print(torch.randint(0, inputs.shape[0], (32,))) #Random location for a batch

print(inputs[2989]) #a random input for a batch
print(labels[2989]) #a random label for a batch
print(inputs[2990])

torch.Size([228146, 3])
tensor([189255,  45137, 132600, 127339, 188739, 103533,  10022, 199029, 115773,
        118999, 205292, 162118,  41799, 183687, 135551,   2220,  39997, 183070,
        168301, 119452, 161958,  16972,  61822,  30158, 218652,  70642, 226070,
        195489,  14589, 191483, 158296,  45816], device='cuda:0')
tensor([ 2, 18,  9], device='cuda:0')
tensor(14, device='cuda:0')
tensor([18,  9, 14], device='cuda:0')


In [360]:
#Optimized for mini batches

batch_size = 32

lr = -0.01
epochs = 1

for epoch in tqdm(range(epochs)):
    batch = torch.randint(0, inputs.shape[0], (batch_size,))

    for param in params:
        param.grad = None

    embedded_feedforward = embedding[inputs[batch]].view(-1, context_size*2)
    hidden_layer = torch.tanh(embedded_feedforward @ W1 + b1)
    logits = hidden_layer @ W2 + b2
    loss = F.cross_entropy(logits, labels[batch])

    loss.backward()

    for param in params:
        param.data += lr * param.grad

loss = F.cross_entropy(logits, labels)
print(loss)



100%|██████████| 1/1 [00:00<00:00, 57.73it/s]

tensor([183618,  90314, 118532, 208236, 223653,  18869, 132579,  14748,  65795,
        129448,  78632,  24806,  61435, 209417,  86837,  66943,  53227, 173939,
         14043, 151076,  26669,  63205, 167467,  34925, 110213,  88982, 207545,
        103525, 178633, 215949, 109798, 221257], device='cuda:0')





ValueError: Expected input batch_size (32) to match target batch_size (228146).