In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
words = open('names.txt').read().splitlines()

#build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

vocab_size = len(itos)

In [4]:
len(words)

32033

In [5]:
# trian/ val/ test set splits
def build_dataset(words):
  block_size = 4 # context length: how many words do we take in to predict the next?
  X, Y = [], []

  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix]
  X = torch.tensor(X)
  Y = torch.tensor (Y)
  return X,Y

import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[:n2])

In [6]:
Xtr.shape, Ytr.shape , Xdev.shape, Ydev.shape, Xte.shape, Yte.shape , n1, n2

(torch.Size([182625, 4]),
 torch.Size([182625]),
 torch.Size([22655, 4]),
 torch.Size([22655]),
 torch.Size([205280, 4]),
 torch.Size([205280]),
 25626,
 28829)

In [7]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,20), generator = g, requires_grad = True)
W1 = torch.randn((80,300), generator = g, requires_grad = True)
b1 = torch.randn(300, generator = g, requires_grad = True)
W2 = torch.randn((300,27), generator = g, requires_grad = True)
b2 = torch.randn(27, generator = g, requires_grad = True)

parameters = [C, W1, b1, W2, b2]

In [8]:
print(sum(p.nelement() for p in parameters))

32967


In [9]:
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre

In [10]:
lri = []
lossi = []
stepi = []

In [11]:

for i in range(200000):

    #mini-batch construct ; overfitting a random mini-batch size = 32
    # mini-batch ----> lower quality gradient
    # its better to calculate an estimate of gradient and make a few more steps
    ix =  torch.randint(0, Xtr.shape[0], (64,))


    #forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1, 80) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])
    #backward pass
    for p in parameters:
        p.grad = None

    loss.backward()

    lr = 0.1 if i < 100000 else 0.01

    for p in parameters:
        p.data += -lr * p.grad

    #keeping track of stats
    # lri.append(lre[i])
    lossi.append(loss.log10().item())
    stepi.append(i)

In [12]:
print(loss.item())

2.2356367111206055


In [None]:
plt.plot(stepi, lossi) # thickness because mini-batch creates a little bit of noise

#one possible case is the batch size is too low that there is way too much noise in the training ---> play around with some batch size numbers

[<matplotlib.lines.Line2D at 0x1c92f3cd6a0>]

In [None]:
# emb = C[Xtr]
# #forward pass
# h = torch.tanh(emb.view(-1,6) @ W1 + b1)
# logits = h @ W2 + b2
# loss = F.cross_entropy(logits, Ytr)
# print(loss.item())

In [None]:
emb = C[Xdev]
#forward pass
h = torch.tanh(emb.view(-1,60) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev)
print(loss.item())

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data from the provided configurations
data = [
    {"iterations": 400000, "parameters": 3487, "loss": 2.2, "config": "embed_dim=2, batch=32"},
    {"iterations": 400000, "parameters": 11987, "loss": 2.23, "config": "embed_dim=10, batch=32"},
    {"iterations": 400000, "parameters": 17697, "loss": 1.98, "config": "embed_dim=10, batch=32"},
    {"iterations": 600000, "parameters": 26967, "loss": 1.67, "config": "embed_dim=20, batch=32"},
    {"iterations": 400000, "parameters": 26967, "loss": 2.03, "config": "embed_dim=20, batch=64"},
    {"iterations": 200000, "parameters": 26967, "loss": 1.92, "config": "embed_dim=20, batch=64 (200k)"}
]

# Create subplots with better spacing
plt.figure(figsize=(15, 6))

# Plot 1: Iterations vs Loss
plt.subplot(1, 2, 1)
plt.scatter([d["iterations"] for d in data], [d["loss"] for d in data], c='blue', s=150, edgecolors='black')
plt.title('Iterations vs Loss', fontsize=16)
plt.xlabel('Iterations', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
for d in data:
    plt.annotate(d["config"], (d["iterations"], d["loss"]),
                 xytext=(5, 5), textcoords='offset points', fontsize=10, color='black')

# Plot 2: Parameters vs Loss
plt.subplot(1, 2, 2)
plt.scatter([d["parameters"] for d in data], [d["loss"] for d in data], c='red', s=150, edgecolors='black')
plt.title('Parameters vs Loss', fontsize=16)
plt.xlabel('Number of Parameters', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
for d in data:
    plt.annotate(d["config"], (d["parameters"], d["loss"]),
                 xytext=(5, 5), textcoords='offset points', fontsize=10, color='black')

plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].data, C[:,1].data, s=200)
for i in range(C.shape[0]):
  plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')

plt.grid('minor')