In [2]:
import torch
from torch import nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "gpu"
torch.set_default_device(device)
print(f"current device --> {device}")

current device --> cuda


In [3]:
with open("names.txt", 'r', encoding="UTF-8") as f:
    text = f.read()
    words = text.splitlines()
    chars = sorted(set("".join(words)))

stoi = {char:integer+1 for integer, char in enumerate(chars)}
itos = {integer+1:char for integer, char in enumerate(chars)}

stoi["."] = 0
itos[0] = "."

vocab_size = len(stoi)

In [13]:
import random
random.shuffle(words)

train_percent = 0.8
eval_percent = 0.1
test_percent = 0.1

inputs, labels = [], []


context_size = 3

for w in words[:]:
    context = [0] * context_size
    for char in w + ".":
        ix = stoi[char]
        inputs.append(context)
        labels.append(ix)
        context = context[1:] + [ix]

train_split = int(len(inputs)*train_percent)
eval_split = int(len(inputs)*eval_percent)
test_split = int(len(inputs)*test_percent)

Xtrain = torch.tensor(inputs[:train_split])
Ytrain = torch.tensor(labels[:train_split])

Xeval = torch.tensor(inputs[train_split:train_split+eval_split])
Yeval = torch.tensor(labels[train_split:train_split+eval_split])

Xtest = torch.tensor(inputs[train_split+eval_split:])
Ytest = torch.tensor(labels[train_split+eval_split:])

In [14]:
n_crammed_dimensions = 10

C = torch.randn((vocab_size, n_crammed_dimensions))

In [None]:
class Linear():
    def __init__(self, in_features, out_features, bias):
        self.weights = torch.randn((in_features, out_features)) / in_features**0.5 #Kaiming initialization
        self.biases = torch.zeros((out_features)) if bias else None
    
    def __call__(self, inputs):
        self.output = inputs @ self.weights
        if self.biases is not None:
            self.output += self.biases
        return self.output
    
    def params(self):
        if self.biases is not None:
            return [self.weights, self.biases]
        else:
            return [self.weights]

In [44]:
print(C[Xtrain].shape)

#  Emb   X   W1
# m x n     n x p

w1_n_neurons = 2000

#W1 = torch.randn((context_size*n_crammed_dimensions, w1_n_neurons)) * ((5/3) / (vocab_size*n_crammed_dimensions**0.5))
linear1 = nn.Linear(in_features=context_size*n_crammed_dimensions, out_features=w1_n_neurons, bias=True, device=device)
#b1 = torch.randn((w1_n_neurons)) * 0.01
# #No need for b1 because of batch normalization

#W2 = torch.randn((w1_n_neurons, vocab_size)) * 0.01
#b2 = torch.randn((vocab_size)) * 0
linear2 = nn.Linear(in_features=w1_n_neurons, out_features=vocab_size, bias=True)

params = [C, linear1, linear2]
for param in params:
    param.requires_grad = True

torch.Size([182516, 3, 10])
tensor([[ 3.0407e+00, -9.4090e-01, -7.7606e+00, -4.4884e-01,  1.2144e+00,
          2.0335e+00,  2.3982e-01, -3.1188e-01, -4.6000e+00, -1.2278e+00],
        [ 3.8857e+00, -2.1588e-01, -8.0756e+00,  1.7754e+00, -1.6461e+00,
          2.5144e+00, -1.5102e+00, -3.9295e-01, -5.1568e+00,  2.5783e-01],
        [ 2.4733e+00,  5.2570e-01, -3.2230e+00, -2.4308e-01, -4.6965e-01,
          3.3051e-01, -1.0441e+00,  4.7854e-01,  2.2393e-01,  8.5947e-01],
        [ 3.1372e+00, -2.8095e-01, -3.4550e+00,  1.4800e+00, -4.3365e-01,
         -8.5897e-01, -2.1755e-01, -6.9670e-01, -4.9960e-01, -1.3064e+00],
        [ 1.8108e+00,  5.0826e-01, -2.3793e+00,  5.5754e-01, -2.6300e-01,
          9.5574e-01, -2.8494e-02, -1.9921e+00, -4.1166e+00, -7.6748e-01],
        [ 4.5352e+00,  7.6271e-01, -7.0911e+00,  3.0721e+00,  1.7022e-01,
         -2.3601e-01, -1.6577e+00, -1.6835e+00, -3.3861e+00,  9.7677e-01],
        [ 2.7822e+00, -1.0838e+00, -2.0333e+00, -5.6300e-01, -6.1111e-01,
    

In [41]:
batch_size = 32
epochs = 1000

lr = -0.01

for epoch in range(epochs):
    for param in params:
        param.grad = None

    batch = torch.randint(0, Xtrain.shape[0], (batch_size,))
    emb = C[Xtrain[batch]].view(-1, context_size*n_crammed_dimensions)
    #hpreact = emb @ W1 + b1
    hpreact = linear1(emb)
    h = torch.tanh(hpreact)
    #logits = h @ W2 + b2
    logits = linear2(h)
    loss = F.cross_entropy(logits, Ytrain[batch])
    loss.backward()

    #for param in params:
        #param.data += lr * param.grad
    C.data += lr * C.grad
    linear1.weight.grad

AttributeError: 'Linear' object has no attribute 'data'

In [38]:
#Eval loss
with torch.no_grad():
    emb = C[Xeval].view(-1, context_size*n_crammed_dimensions)
    hpreact = emb @ W1 + b1
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yeval)
print(loss)

tensor(2.4326, device='cuda:0')
