<a href="https://colab.research.google.com/github/TanayPhatak/Google_Colab_Projects/blob/main/build_MakeMore_bigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a mock of MakeMore by Andrej Karpathy
Video Link: https://www.youtube.com/watch?v=PaCmpygFfXo

In [None]:
import io
import requests
import torch
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Download Karpathy's names dataset and turn it into a list of str (one name per entry).
url = 'https://raw.githubusercontent.com/karpathy/makemore/master/names.txt'
response = requests.get(url, timeout=30)  # bound the wait instead of hanging on a stalled connection
response.raise_for_status()  # fail loudly on 4xx/5xx rather than parsing an error page as names
read_data = response.content
raw_words = read_data.splitlines()
words = [w.decode('ascii') for w in raw_words]

In [None]:
# Peek at the first ten names to confirm the download was parsed into separate strings.
words[:10]

## Building a Bigram language model for MakeMore

A bigram is a pair of consecutive characters. A bigram model takes one character and predicts the character that follows it.

In [None]:
# Tally every adjacent character pair; '<S>' and '<E>' mark the start and end of a word.
b = {}
for w in words:
  chs = ['<S>', *w, '<E>']
  # zip pairs each symbol with its successor, yielding (first, second) tuples
  for bigram in zip(chs, chs[1:]):
    b[bigram] = b.get(bigram, 0) + 1

In [None]:
# Bigrams ordered from most to least frequent (ties keep their insertion order).
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# 27x27 integer table of bigram counts, initially all zero.
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Vocabulary: every distinct character in the dataset, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> integer id. Ids start at 1 so that 0 is free for the '.' boundary token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

In [None]:
# Fill the count table: N[i, j] is how often the character with id j follows the
# character with id i. '.' (id 0) stands for both the start and the end of a word.
N.zero_()  # reset first so re-running this cell does not double every count
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    N[ix1, ix2] += 1

In [None]:
# Inverse of stoi: integer id -> character.
itos = dict(zip(stoi.values(), stoi.keys()))

In [None]:
# Heat-map of the bigram counts, annotating each cell with its bigram and its count.
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')
for row in range(27):
  for col in range(27):
    # bigram label sits in the upper half of the cell, the count in the lower half
    plt.text(col, row, itos[row] + itos[col], ha="center", va="bottom", color='gray')
    plt.text(col, row, N[row, col].item(), ha="center", va="top", color="gray")
plt.axis('off')

In [None]:
# Deterministic toy distribution: three seeded random numbers rescaled to sum to 1.
g = torch.Generator()
g.manual_seed(2147483647)
raw = torch.rand(3, generator=g)
p = raw / raw.sum()
p

In [None]:
# Draw 200 indices (with replacement) according to the probabilities in p.
torch.multinomial(p, 200, replacement=True, generator=g)

### PyTorch sum function

`torch.sum()` has a `keepdim` parameter: with `keepdim=True` the summed dimension is kept (with size 1) instead of being removed from the result.

* `x.sum(1, keepdim=True)` sums each row (across its columns), giving a column of row totals

* `x.sum(0, keepdim=True)` sums each column (down its rows), giving a row of column totals

**Also see broadcasting rules of PyTorch: https://pytorch.org/docs/stable/notes/broadcasting.html**

In [None]:
# Add-one smoothing, then turn every row of counts into a probability distribution.
P = (N + 1).float()
row_totals = P.sum(dim=1, keepdim=True)
P = P / row_totals

# Shapes involved in the division:
#            P -> 27, 27
#   row_totals -> 27,  1
# These are broadcastable under PyTorch's broadcasting rules (the size-1 column is
# stretched across all 27 columns), so the result again has shape 27, 27.

In [None]:
# Sanity check: each row of P was divided by its own total, so a row should sum to (approximately) 1.
P[0].sum()

In [None]:
# Generate 20 names from the count-based bigram model.
g = torch.Generator().manual_seed(2147483647)

for _ in range(20):
  name_chars = []
  ix = 0  # id 0 is '.', used here as the start token
  finished = False
  while not finished:
    # row `ix` of P is the next-character distribution given the current character
    next_dist = P[ix]
    ix = torch.multinomial(next_dist, num_samples=1, replacement=True, generator=g).item()
    name_chars.append(itos[ix])
    finished = (ix == 0)  # drawing '.' again means the name has ended

  print(''.join(name_chars))

**Negative log-likelihood** is a very useful loss function in this case.
It is the negative of the log of the product of the probabilities that the model assigns to each bigram in the data.

*It is common to take the average negative log-likelihood to give an overall picture of the loss of the model.*

--- Minimizing the negative log-likelihood is the goal (the lower it is, the better the model).

Here, the loss contributed by a single bigram is -log(P[ix1, ix2]).

If the model assigns 0% probability to an outcome that actually occurs, the loss becomes infinite because log(0) = -∞.
* Therefore, we add a small initial count to every entry to avoid such a situation (this is called model smoothing).

In [None]:
# Evaluate the bigram model on the first three words: accumulate the log-probability of
# every bigram, then report the total, its negative (NLL) and the average NLL per bigram.
log_likehood = 0.0
n = 0

for w in words[:3]:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likehood += logprob
    n += 1
    # '.4f' prints four decimal places (' 4f' would mean "space sign, width 4" and show six)
    print(f'{ch1}{ch2}: {prob=:.4f}, {logprob=:.4f}')

print(f'{log_likehood=}')
nll = -log_likehood
print(f'{nll=}')
print(f'{(nll/n)=}')

---
---
---

## Creating a neural network for the same task...

In [None]:
# Indexing gives the first name as a string; slicing gives a one-element list containing it.
words[0], words[:1]

In [None]:
# Training set of bigrams (x, y), built from the first word only:
# x is the id of a character, y is the id of the character that follows it.
xs, ys = [], []

for w in words[:1]:
  ids = [stoi[ch] for ch in ['.', *w, '.']]
  xs.extend(ids[:-1])  # every symbol except the last is an input ...
  ys.extend(ids[1:])   # ... and its successor is the matching label

xs = torch.tensor(xs)
ys = torch.tensor(ys)

print(f'{xs=}, {ys=}')

In [None]:
# One-hot encode the input ids xs; cast to float so they can be multiplied by float weights.
xenc = torch.nn.functional.one_hot(xs, num_classes=27).to(torch.float32)
(xenc, xenc.shape)

In [None]:
# Visualise the one-hot matrix (one row per example) and show its dtype after the float cast.
plt.imshow(xenc), xenc.dtype

In [None]:
# Unseeded random 27x27 weight matrix.
W = torch.randn(27, 27)
# '@' is the matrix-multiplication operator: (examples, 27) @ (27, 27) -> (examples, 27)
xenc @ W

In [None]:
# Interpret the linear outputs as log-counts, exponentiate, and normalise each row.
logits = xenc @ W                                 # log-counts
counts = torch.exp(logits)                        # positive values, playing the role of N
probs = counts / counts.sum(dim=1, keepdim=True)  # each row now sums to 1
probs

---
## Summary so far :-

In [None]:
# Input character ids of the training bigrams.
xs

In [None]:
# Target (next-character) ids, aligned element-wise with xs.
ys

In [None]:
# Randomly initialize 27 neurons' weights, each neuron receives 27 inputs.
# The result must be bound to `W` (capital): that is the name every later cell reads.
# Binding it to lowercase `w` would leave the earlier unseeded `W` in use, so this
# seed would have no effect on the forward pass.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g)

In [None]:
# Forward pass of the single linear layer.
# input to the network: one-hot encoding of the character ids
xenc = torch.nn.functional.one_hot(xs, num_classes=27).to(torch.float32)
# predicted log-counts
logits = torch.matmul(xenc, W)
# counts, equivalent to N
counts = torch.exp(logits)
# probabilities for the next character
probs = counts / counts.sum(dim=1, keepdim=True)
# The exponentiation and the row-wise normalisation together are called 'softmax':
#   Softmax = ((xenc @ W).exp()) / ((xenc @ W).exp()).sum(1, keepdims=True)
# Softmax is an activation function for neurons in a neural network
# (generally used on the output layer).

In [None]:
# One row of next-character probabilities per training example, 27 columns (one per character id).
probs.shape

In [None]:
# Walk through every training bigram and show how its loss is computed from `probs`:
# probability given to the correct next character -> log -> negated.
print('Raw input: ', words[:1])
num_examples = xs.nelement()  # derived from the data, so this works for any word length
nlls = torch.zeros(num_examples)
for i in range(num_examples):
  # i-th bigram
  x = xs[i].item() # input character index
  y = ys[i].item() # label character index
  print('--------------')
  print(f'Bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x}, {y})')
  print('Input to the neural network: ', x)
  print('Output of the neural network: ', probs[i])
  print('Label (actual next character): ', y)
  p = probs[i, y]  # probability the network gave to the true next character
  print('Prbability assignaed by the network to the correct character: ', p.item())
  logp = torch.log(p)
  print('Log likehood: ', logp.item())
  nll = -logp
  print('Negative log likehood: ', nll.item())
  nlls[i] = nll

print('===============')
print('Average negative log likehood, i.e. loss = ', nlls.mean().item())

## *--------- !!! OPTIMIZATION !!! ---------*

In [None]:
# Training inputs and their target next-character ids, shown side by side.
xs, ys

In [None]:
# Seeded 27x27 weights, flagged so autograd tracks gradients with respect to them.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# Forward pass: one-hot inputs -> logits -> softmax probabilities -> average NLL loss.
xenc = torch.nn.functional.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
# For each example i pick probs[i, ys[i]], the probability given to the correct next
# character. The row index is sized from xs, so it works for any number of examples.
loss = -probs[torch.arange(xs.nelement()), ys].log().mean()

In [None]:
# Show the loss from the forward pass as a plain Python number.
print(loss.item())

In [None]:
# Backward pass: compute the gradient of the loss with respect to W.
W.grad = None # clear any previous gradient; setting it to None is preferred over filling it with zeros
loss.backward() # populates W.grad for the update step that follows

In [None]:
# One gradient-descent step with learning rate 0.1: move W against its gradient.
W.data -= 0.1 * W.grad

---
---
## Compiling everything done till now to work in a flow

In [None]:
# Create the dataset: (input id, target id) pairs for every bigram of every word.
xs, ys = [], []
for w in words:
  ids = [stoi[ch] for ch in ['.', *w, '.']]
  xs.extend(ids[:-1])  # each symbol except the last is an input
  ys.extend(ids[1:])   # its successor is the label

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()
print('Number of examples: ', num)

In [None]:
# Initialize the network: seeded 27x27 weight matrix that autograd will track.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g).requires_grad_()

In [None]:
# Gradient descent on the average negative log-likelihood of the training bigrams.
# The one-hot inputs and the row indices are the same on every step, so they are
# built once here instead of re-encoding the whole dataset on each iteration.
xenc = torch.nn.functional.one_hot(xs, num_classes=27).float()
example_rows = torch.arange(num)

for k in range(10):

  # Forward pass
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True)
  # probs[i, ys[i]] is the probability given to the correct next character of example i
  loss = -probs[example_rows, ys].log().mean()
  print(loss.item())

  # Gradient-based optimization:
  # Backward pass
  W.grad = None  # drop the previous step's gradient before computing the new one
  loss.backward()

  # Update weights (learning rate 100)
  W.data += -100 * W.grad

In [None]:
# Finally, sample from the 'neural network' model.
# Re-seed the generator so this cell prints the same names every time it is run
# (same seed as the count-based sampling cell), instead of depending on whatever
# state `g` was left in by earlier cells.
g = torch.Generator().manual_seed(2147483647)

# Sampling needs no gradients, so autograd is disabled to avoid building graphs through W.
with torch.no_grad():
  for i in range(5):
    out = []
    ix = 0  # id 0 is '.', used here as the start token
    while True:
      # --------------
      # BEFORE (count-based model):
      # p = P[ix]
      # --------------
      # NOW: one-hot the current id and apply softmax(xenc @ W)
      xenc = torch.nn.functional.one_hot(torch.tensor([ix]), num_classes=27).float()
      logits = xenc @ W
      counts = logits.exp()
      p = counts / counts.sum(1, keepdims=True)
      # --------------

      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      if ix == 0:  # drawing '.' again ends the name
        break

    print(''.join(out))
