# Shifting to Neural Network Framework

In [455]:
import torch

In [456]:
# Read the dataset, one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [457]:
# Vocabulary: every distinct character that appears in the words, in sorted order
chars = sorted(set(''.join(words)))

# Character -> index. Indices start at 1 because 0 is reserved for the '.' boundary token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Index -> character (the inverse mapping of stoi)
itos = {idx: ch for ch, idx in stoi.items()}


In [458]:
# Build the bigram training pairs (current character index -> next character index)
# for the first word only, printing each bigram as it is added.
xs, ys = [], []

for word in words[:1]:
    # Surround the word with the '.' boundary token on both sides
    padded = ['.', *word, '.']
    for cur, nxt in zip(padded, padded[1:]):
        print(f"{cur} : {nxt}")
        cur_ix, nxt_ix = stoi[cur], stoi[nxt]
        xs.append(cur_ix)
        ys.append(nxt_ix)

# Turn the index lists into tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)



. : e
e : m
m : m
m : a
a : .


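xs and ys hold the integer indices of the input characters and of the characters that follow them.

We cannot feed these raw indices directly into a neural network, because an index is only a label and its numeric size carries no meaning.

So we one-hot encode the indices before feeding the character data to the net.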

In [459]:
from torch.nn import functional as F

# One-hot encode the input indices into 27 classes, then cast to float so the
# rows can be matrix-multiplied with the (float) weights.
one_hot_rows = F.one_hot(xs, num_classes=27)
xenc = one_hot_rows.float()
xenc.shape

torch.Size([5, 27])

In [460]:
# torch.randn() returns a tensor filled with random numbers drawn from a normal
# distribution with mean = 0 and variance = 1.
#
# Take the word "emma": it yields 5 bigrams, so xenc has shape (5, 27).
# W has shape (27, 27), i.e. 27 neurons that each receive the 27 one-hot inputs,
# so xenc @ W is (5, 27) @ (27, 27), which results in (5, 27).

g = torch.Generator().manual_seed(2147483647)   # fixed seed for reproducible weights
W = torch.randn((27,27), generator=g, requires_grad=True)
xenc @ W

tensor([[ 1.5674e+00, -2.3729e-01, -2.7385e-02, -1.1008e+00,  2.8588e-01,
         -2.9643e-02, -1.5471e+00,  6.0489e-01,  7.9136e-02,  9.0462e-01,
         -4.7125e-01,  7.8682e-01, -3.2843e-01, -4.3297e-01,  1.3729e+00,
          2.9334e+00,  1.5618e+00, -1.6261e+00,  6.7716e-01, -8.4039e-01,
          9.8488e-01, -1.4837e-01, -1.4795e+00,  4.4830e-01, -7.0730e-02,
          2.4968e+00,  2.4448e+00],
        [ 4.7236e-01,  1.4830e+00,  3.1748e-01,  1.0588e+00,  2.3982e+00,
          4.6827e-01, -6.5650e-01,  6.1662e-01, -6.2197e-01,  5.1007e-01,
          1.3563e+00,  2.3445e-01, -4.5585e-01, -1.3132e-03, -5.1161e-01,
          5.5570e-01,  4.7458e-01, -1.3867e+00,  1.6229e+00,  1.7197e-01,
          9.8846e-01,  5.0657e-01,  1.0198e+00, -1.9062e+00, -4.2753e-01,
         -2.1259e+00,  9.6041e-01],
        [ 1.9359e-01,  1.0532e+00,  6.3393e-01,  2.5786e-01,  9.6408e-01,
         -2.4855e-01,  2.4756e-02, -3.0404e-02,  1.5622e+00, -4.4852e-01,
         -1.2345e+00,  1.1220e+00, -6.73

Explanation of the above step

Let's say we index into xenc @ W in the following way:

(xenc @ W)[3, 13] => this gives the firing rate of the neuron at index 13 on the input at index 3 (both 0-based, i.e. the 14th neuron looking at the 4th input)

For every one of the 27 neurons we created, xenc @ W holds the firing rate of that neuron on every one of the 5 examples (the 5 bigrams of "emma")

This is the dumbest, smallest, simplest neural network with a single linear layer

Intuitively, we are trying to create a probability distribution for the next character in the sequence

In [461]:
# Note-only cell: the bare string is the cell's last expression, so it is echoed as the output.
"""We are now treating these 27 outputs as log_counts and so we exponentiate every element"""

'We are now treating these 27 outputs as log_counts and so we exponentiate every element'

# Softmax !! :)

In [462]:
# Softmax written out step by step
logits = xenc @ W                               # raw outputs, interpreted as log-counts
counts = logits.exp()                           # positive "counts", equivalent to the N matrix in the bigram language model using BOW
row_totals = counts.sum(dim=1, keepdim=True)    # one normaliser per example (row)
probs = counts / row_totals                     # normalise so every row sums to 1

In [463]:
# Row 0 of probs: the predicted distribution over the 27 classes for the first example
probs[0]

tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459],
       grad_fn=<SelectBackward0>)

In [464]:
"""This step tells what probabilty is the input giving for the expected output character
"""

# Pair row i of probs with column ys[i]: for each example, pick out the
# probability the model assigned to the character that actually came next.
probs[torch.arange(len(xs)), ys]

tensor([0.0123, 0.0181, 0.0267, 0.0737, 0.0150], grad_fn=<IndexBackward0>)

In [465]:
# Loss: average negative log-likelihood of the correct next characters
row_idx = torch.arange(len(xs))
target_probs = probs[row_idx, ys]       # probability assigned to each true target
loss = -target_probs.log().mean()

loss

tensor(3.7693, grad_fn=<NegBackward0>)

# Initialisation

In [466]:
# torch.randn() returns a tensor filled with random numbers drawn from a normal
# distribution with mean = 0 and variance = 1.
#
# Take the word "emma": it yields 5 bigrams, so xenc has shape (5, 27).
# W has shape (27, 27), i.e. 27 neurons that each receive the 27 one-hot inputs,
# so xenc @ W is (5, 27) @ (27, 27), which results in (5, 27).

g = torch.Generator().manual_seed(2147483647)   # fixed seed for reproducible weights
W = torch.randn((27,27), generator=g, requires_grad=True)


# Forward Pass

In [467]:
# One-hot inputs -> logits -> softmax probabilities
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W                                       # log-counts
counts = logits.exp()                                   # equivalent to the N matrix in the bigram language model using BOW
probs = counts / counts.sum(dim=1, keepdim=True)        # normalise each row of counts

# Average negative log-likelihood of the true next characters
target_probs = probs[torch.arange(len(xs)), ys]
loss = -target_probs.log().mean()

print(loss.item())



3.7693049907684326


# Backward Pass

In [468]:
# Clear any previously stored gradient (set to None rather than zeroing) before backpropagating
W.grad = None
# Backpropagate from the scalar loss; the next cell reads the result from W.grad
loss.backward()

In [469]:
# One gradient-descent step with learning rate 0.1.
# The in-place update runs under torch.no_grad() so autograd does not record it,
# instead of writing through W.data, which bypasses autograd's bookkeeping.
with torch.no_grad():
    W -= 0.1 * W.grad

In [470]:
# This is still the loss tensor computed in the forward pass: updating W does not
# recompute it, so the displayed value is unchanged until the forward pass is re-run.
loss

tensor(3.7693, grad_fn=<NegBackward0>)

# Putting everything together

In [471]:
# Create the dataset: every bigram of every word, with '.' marking both word boundaries
bigram_indices = [
    (stoi[first], stoi[second])
    for name in words
    for first, second in zip('.' + name, name + '.')
]

xs = torch.tensor([first_ix for first_ix, second_ix in bigram_indices])
ys = torch.tensor([second_ix for first_ix, second_ix in bigram_indices])
num = xs.nelement()

print(f"Number of examples: {num}")


# Initialize the network: a 27x27 weight matrix drawn from a seeded standard normal

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27,27), generator=g, requires_grad=True)


Number of examples: 228146


In [472]:
# Gradient Descent
regularization_strength = 0.01   # weight of the mean-squared-weight penalty that pulls W towards 0
learning_rate = 50               # step size of each update
num_steps = 100                  # number of full-batch iterations

# The one-hot inputs and the row indices do not depend on W, so they are built
# once here instead of being rebuilt on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()     # input to the network : one-hot encoding
rows = torch.arange(num)

for k in range(num_steps):

    # Forward pass
    logits = xenc @ W                                # predict log-counts
    counts = logits.exp()                            # counts, equivalent to N
    probs  = counts / counts.sum(1, keepdims=True)   # probabilities for next character

    # Loss + regularization loss
    loss = -probs[rows, ys].log().mean() + regularization_strength * (W**2).mean()
    print(loss.item())

    # Backward pass
    W.grad = None       # clear the gradient from the previous iteration
    loss.backward()

    # Update: in-place under no_grad so autograd does not track the step
    with torch.no_grad():
        W -= learning_rate * W.grad

        

3.768618583679199
3.3788068294525146
3.161090850830078
3.0271859169006348
2.9344840049743652
2.867231607437134
2.8166542053222656
2.777146577835083
2.745253801345825
2.7188305854797363
2.696505308151245
2.6773719787597656
2.6608052253723145
2.6463515758514404
2.633665084838867
2.622471570968628
2.6125476360321045
2.6037068367004395
2.595794916152954
2.5886807441711426
2.5822560787200928
2.576429843902588
2.5711236000061035
2.566272735595703
2.5618226528167725
2.5577261447906494
2.5539445877075195
2.550442695617676
2.547192335128784
2.5441696643829346
2.5413522720336914
2.538722038269043
2.536262035369873
2.5339579582214355
2.531797409057617
2.529768228530884
2.527860164642334
2.5260636806488037
2.5243704319000244
2.522773265838623
2.52126407623291
2.519836664199829
2.5184857845306396
2.5172054767608643
2.515990734100342
2.5148372650146484
2.5137410163879395
2.512697696685791
2.511704921722412
2.5107579231262207
2.509855031967163
2.5089924335479736
2.5081686973571777
2.507380247116089
2

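In the count-based (BOW) bigram model we smoothed the model by adding fake counts.

In the NN framework we get the same effect by pushing W close to 0, which is what the regularization term added to the loss does.

If W is 0, then logits = 0,
counts are all 1,
and the probability distribution is uniform, i.e. as smooth as possible.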

# Sampling from the Model

In [473]:
# Fixed seed so the sampled names are reproducible
g = torch.Generator().manual_seed(2147483647)

for sample_no in range(15):
    out = []
    ix = 0              # start every name from index 0, the '.' boundary token
    finished = False
    while not finished:
        # Forward pass for the single current character
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(dim=1, keepdim=True)

        # Draw the next character index from the predicted distribution
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        # Sampling index 0 ('.') ends the name
        finished = (ix == 0)

    print(''.join(out))



junide.
janasah.
p.
cfay.
a.
nn.
kohin.
tolian.
juwe.
kalanaauranilevias.
dedainrwieta.
ssonielylarte.
faveumerifontume.
phynslenaruani.
core.
