<a href="https://colab.research.google.com/github/Vihaan3/Custom-Search-Engine-Demo/blob/main/Karpathy_Exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import torch
import torch.nn.functional as F

### Makemore Part 1

In [59]:
words = open('/names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
g = torch.Generator().manual_seed(2147483647)

#### A pared down version of the neural network in the original video.

In [60]:
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of examples:  228146


In [61]:
for k in range(100):

  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()

  if k % 10 == 0:
    print(loss.item())

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()

  # update
  W.data += -50 * W.grad

print(loss.item())

3.7686190605163574
2.696505546569824
2.5822560787200928
2.5413522720336914
2.52126407623291
2.509854555130005
2.5027060508728027
2.4978790283203125
2.4944381713867188
2.4918932914733887
2.4901304244995117


In [62]:
for i in range(5):

  out = []
  ix = 0
  while True:

    xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(1, keepdims=True) # probabilities for next character

    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print(''.join(out))

junide.
janasah.
p.
cfay.
a.


Loss: 2.49

Text Generated:
- junide.
- janasah.
- p.
- cfay.
- a.

#### E01: Trigram
I wasn't sure at first whether increasing the right way to go was increasing the size of the input dimension or adding another dimension so I decided to try both. Also, note that my code here is meant to resemble Andrej Karpathy's, which is intentonally a little bit hacky for educational purposes. This is not what a "production" implementation would look like.

##### V1: Additional Dimension

In [63]:
import einops

In [94]:
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.', '.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement() // 2 # dividing by 2 now because the shape of xs has changed
print('number of examples: ', num)

W1 = torch.randn((27, 27, 27), generator=g, requires_grad=True) # 3D now

number of examples:  228146


In [95]:
for k in range(100):

  # forward pass
  xenc = F.one_hot(xs[:, 0], num_classes=27).float() # character 1
  xenc2 = F.one_hot(xs[:, 1], num_classes=27).float() # character 2
  logits = einops.einsum(xenc, W1, xenc2, "b i, i j k, b j -> b k") # you might have to ask GPT about this line. Understand what's happening but don't stress it, it'll make more sense soon.
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True)
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W1**2).mean()

  if k % 10 == 0:
    print(loss.item())

  # backward pass
  W1.grad = None # set to zero the gradient
  loss.backward()

  # update
  W1.data += -50 * W1.grad

print(loss.item())

3.6517415046691895
2.8801517486572266
2.6203973293304443
2.471735715866089
2.373619794845581
2.3028182983398438
2.2486462593078613
2.205559015274048
2.1703648567199707
2.1410269737243652
2.1184768676757812


In [99]:
for i in range(5):

  out = []
  # couldn't decide on good var names so I thought I would be funny
  uno = 0
  dos = 0
  while True:

    xenc = F.one_hot(torch.tensor(uno), num_classes=27).float().unsqueeze(0)
    xenc2 = F.one_hot(torch.tensor(dos), num_classes=27).float().unsqueeze(0)
    logits = einops.einsum(xenc, W1, xenc2, "b i, i j k, b j -> b k")
    counts = logits.exp()
    p = counts / counts.sum(1, keepdims=True)

    uno = dos
    dos = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[dos])
    if dos == 0:
      break
  print(''.join(out))

ni.
makestobfzwruvbvjxpxdznleanczqwmdemyw.
mkzdnseira.
na.
umjoebvsdfyhoujfodpmvuhdgdohupy.


Loss: 2.12

Text Generated
- coud
- qipny
- hafijcpydhrdqzieqiupfdcaanvinegnhiyah
- dae
- hatea

Lower loss but also lower quality outputs than the original. Seems like overfitting.

Curiously, when I train for 1000 steps instead, I get these results:

Loss: 1.49

Text Generated
- ouwade
- ilyasid
- prelay
- ocnzi
- ohr

These seem much higher quality than the original.  

##### V2: Bigger input dimension

In [103]:
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.', '.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append((ix1, ix2))
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement() // 2 # dividing by 2 now because the shape of xs has changed
print('number of examples: ', num)

W2 = torch.randn((27*2, 27), generator=g, requires_grad=True) # still 2D but now the input dimension is double

number of examples:  228146


In [105]:
for k in range(100):

  # forward pass
  '''Edit: a cleaner implementation would be:
  xenc = F.one_hot(xs, num_classes=27).float()
  logits = xenc.view(-1, 27*2) @ W2
  '''
  xenc = F.one_hot(xs[:, 0], num_classes=27).float() # character 1
  xenc2 = F.one_hot(xs[:, 1], num_classes=27).float() # character 2
  xcat = torch.cat ((xenc, xenc2), 1)
  logits = xcat @ W2
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True)
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W2**2).mean()

  if k % 10 == 0:
    print(loss.item())

  # backward pass
  W2.grad = None # set to zero the gradient
  loss.backward()

  # update
  W2.data += -50 * W2.grad

print(loss.item())

4.5374555587768555
2.16996693611145
2.063832998275757
2.020979404449463
1.9984627962112427
1.9848086833953857
1.9757133722305298
1.9692426919937134
1.9644166231155396
1.960689663887024
1.958001732826233


In [98]:
for i in range(5):

  out = []
  # couldn't decide on good var names so I thought I would be funny
  uno = 0
  dos = 0
  while True:

    xenc = F.one_hot(torch.tensor(uno), num_classes=27).float().unsqueeze(0)
    xenc2 = F.one_hot(torch.tensor(dos), num_classes=27).float().unsqueeze(0)
    xcat = torch.cat((xenc, xenc2), 1)
    logits = xcat @ W2
    counts = logits.exp()
    p = counts / counts.sum(1, keepdims=True)

    uno = dos
    dos = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[dos])
    if dos == 0:
      break
  print(''.join(out))

.
.
mefilayniylenn.
.
.


Loss: 1.95

Text Generated:
- blank
- dena
- blank
- blank
- blank

Much lower loss but also lower quality generations.

#### E02: Splitting
Basic implementation of the splitting. Play around by yourself to answer Andrej Karpathy's question.

In [108]:
import random

random.shuffle(words)

num_words = len(words)
train_split = int(0.8*num_words)
dev_split = int(0.9*num_words)

train_words = words[:train_split]
dev_words = words[train_split:dev_split]
test_words = words[dev_split:]

print(f"Train Size: {len(train_words)}")
print(f"Dev Size: {len(dev_words)}")
print(f"Test Size: {len(test_words)}")

Train Size: 25626
Dev Size: 3203
Test Size: 3204


#### E05: F.cross_entropy


In [None]:
# From the original. You can turn snippet 1 into snippet 2 using F.cross_entropy
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()

# F.cross_entropy
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
loss = F.cross_entropy(logits, ys) + 0.01*(W**2).mean()

# Why you want to use F.cross_entropy instead: https://youtu.be/TCH_1BHY58I?t=1979

### Makemore Part 2: MLP

#### E03

### Makemore Part 3: Activations and Gradients, Batchnorm

#### E02

### GPT

#### EX1

#### EX2

#### EX3

#### EX4