In [40]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Read the dataset: one name per line.
# Use a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open('names.txt', 'r') as names_file:
	words = names_file.read().splitlines()
print(len(words))

32033


In [42]:
# Character vocabulary: the sorted distinct characters of the dataset get
# indices 1..N, and index 0 is reserved for the '.' boundary token.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0  # added last, so itos lists the letters first and '.' at the end
# Inverse mapping: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [62]:
# Build (context, next-character) pairs over the whole word list.
block_size = 3  # number of preceding characters used to predict the next one
X, Y = [], []
for w in words:
	# Left-pad with block_size zeros (index 0 is '.') and append the closing '.',
	# then slide a window of width block_size over the encoded word.
	encoded = [0] * block_size + [stoi[ch] for ch in w + '.']
	for start in range(len(w) + 1):
		X.append(encoded[start:start + block_size])  # context indices
		Y.append(encoded[start + block_size])        # index of the character that follows
X = torch.tensor(X)
Y = torch.tensor(Y)

In [97]:
# Sanity-check the training tensors.
# NOTE: Xtr / Ytr are created by the train/dev/test split cell (build_dataset),
# so that cell has to be executed before this one.
tuple(t.shape for t in (Xtr, Ytr))

(torch.Size([182625, 3]), torch.Size([182625]))

In [104]:
# Model parameters, all drawn from one seeded generator for reproducibility.
# The order of the randn calls below determines the values each tensor gets,
# so it must not be changed.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)      # embedding table: 27 symbols -> 2 dims
W1 = torch.randn(6, 100, generator=g)    # hidden layer weights (6 = 3 context chars * 2 dims)
W2 = torch.randn(100, 27, generator=g)   # output layer weights: 100 hidden units -> 27 logits
b1 = torch.randn(100, generator=g)       # hidden layer bias
b2 = torch.randn(27, generator=g)        # output layer bias
parameters = [C, W1, b1, W2, b2]

In [105]:
# Total number of scalar parameters in the model.
sum(map(torch.Tensor.nelement, parameters))

3481

In [106]:
# Turn on gradient tracking for every parameter so loss.backward() fills .grad.
for param in parameters:
	param.requires_grad_(True)

In [101]:
# Candidate learning rates for a sweep: 1000 exponents evenly spaced in [-3, 0],
# i.e. learning rates spaced logarithmically between 0.001 and 1.
lre = torch.linspace(-3, 0, steps=1000)
lrs = 10 ** lre

In [102]:
# lri = []
# lossi = []
# for i in range(1000):
# 	# forward pass
# 	# minibatch
# 	ix = torch.randint(0, X.shape[0], (32,))
# 	emb = C[X[ix]]
# 	h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
# 	logits = h @ W2 + b2
# 	loss = F.cross_entropy(logits, Y[ix])
# 	# backward pass
# 	for p in parameters:
# 		p.grad = None
# 	loss.backward()
# 	# update
# 	lr = lrs[i]
# 	for p in parameters:
# 		p.data += -lr * p.grad
# 	lri.append(lr)
# 	lossi.append(loss.item())


# print(loss.item())

In [111]:

# Mini-batch SGD on the training split.
# NOTE: this cell needs Xtr / Ytr from the train/dev/test split cell and
# C, W1, b1, W2, b2, parameters and g from the parameter-initialisation cell.
max_steps = 10000
batch_size = 32
lr = 0.01  # constant step size

for i in range(max_steps):
	# minibatch: draw row indices from the seeded generator so a fresh
	# run of the notebook reproduces the same sequence of batches
	ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

	# forward pass
	emb = C[Xtr[ix]]                            # look up an embedding for each context index
	h = torch.tanh(emb.view(-1, 6) @ W1 + b1)   # flatten the per-character embeddings to 6 features
	logits = h @ W2 + b2
	loss = F.cross_entropy(logits, Ytr[ix])

	# backward pass: clear stale gradients before accumulating new ones
	for p in parameters:
		p.grad = None
	loss.backward()

	# update: plain SGD step, done under no_grad so the in-place update is
	# not recorded by autograd (preferred over writing through p.data)
	with torch.no_grad():
		for p in parameters:
			p -= lr * p.grad

# Loss of the last minibatch only (a noisy estimate, not the full training loss).
print(loss.item())

1.9237751960754395


In [92]:
# training split, dev/validation split, test split
# 80%, 10%, 10%

def build_dataset(words, block_size=3, char_to_ix=None):
	"""Turn a list of words into (context, next-character) index tensors.

	Each word is extended with a closing '.' and scanned left to right. For
	every character, the indices of the `block_size` preceding characters
	(left-padded with index 0) form one row of X, and the index of the
	character itself is the matching entry of Y.

	Args:
		words: iterable of strings to encode.
		block_size: number of context characters per example (default 3).
		char_to_ix: mapping from character to integer index; it must contain
			'.' and every character in `words`. Defaults to the notebook-level
			`stoi` mapping.

	Returns:
		(X, Y): X holds one row of `block_size` context indices per example,
		Y holds the index of the character that follows each context.

	Raises:
		ValueError: if `block_size` is smaller than 1.
		KeyError: if a character is missing from the mapping.

	Side effects:
		Prints the shapes of X and Y.
	"""
	if block_size < 1:
		raise ValueError("block_size must be >= 1")
	if char_to_ix is None:
		char_to_ix = stoi  # fall back to the vocabulary built earlier in the notebook
	X, Y = [], []
	for w in words:
		context = [0] * block_size  # start every word from an all-padding context
		for ch in w + '.':
			ix = char_to_ix[ch]
			X.append(context)
			Y.append(ix)
			context = context[1:] + [ix]  # slide the window; builds a new list, so stored rows are not aliased
	X = torch.tensor(X)
	Y = torch.tensor(Y)
	print(X.shape, Y.shape)
	return X, Y
import random

# Shuffle a *copy* of the word list with a fixed seed. Shuffling `words` in
# place would make this cell non-idempotent: re-running it would permute an
# already-permuted list and yield a different split, leaking dev/test words
# into the training set of an already-trained model.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)

# Split boundaries at 80% and 90% of the words.
n1 = int(0.8*len(shuffled_words))
n2 = int(0.9*len(shuffled_words))
Xtr,  Ytr  = build_dataset(shuffled_words[:n1])     # 80% training
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])   # 10% dev/validation
Xte,  Yte  = build_dataset(shuffled_words[n2:])     # 10% test


torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [112]:
# evaluate: loss on the dev/validation split
# Evaluation needs no gradients, so run the forward pass under no_grad to
# avoid building an autograd graph over the whole split.
with torch.no_grad():
	emb = C[Xdev]
	h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
	logits = h @ W2 + b2
	loss = F.cross_entropy(logits, Ydev)
print(loss.item())

2.309002637863159


In [113]:
# evaluate: loss on the held-out test split
# Evaluation needs no gradients, so run the forward pass under no_grad to
# avoid building an autograd graph over the whole split.
with torch.no_grad():
	emb = C[Xte]
	h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
	logits = h @ W2 + b2
	loss = F.cross_entropy(logits, Yte)
print(loss.item())

2.307586669921875
