In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F

%matplotlib inline

In [2]:
# Load the dataset: one entry per line of names.txt.
# The context manager closes the file handle as soon as the contents have been
# read, instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
# Bigram count table: the row is the first character, the column the second.
# 27 symbols = 26 letters + 1 special character ('.').
N = torch.zeros(27, 27, dtype=torch.int32)

In [4]:
# Collect every distinct character used anywhere in the dataset: join all the
# words into one long string, let `set` throw away the duplicates, and sort the
# survivors so the ordering is stable.
# (presumably this yields the 26 lowercase letters -- verify against names.txt)
chars = sorted(set(''.join(words)))

# Iterating over a word yields characters as strings, so a lookup table from
# character to integer index is needed ('s' -> 'i').
# Numbering starts at 1, one-to-one with the sorted characters.
stoi = dict(zip(chars, range(1, len(chars) + 1)))

In [5]:
# Register the special character '.' under index 0.
stoi.update({'.': 0})
# Inverse table ('i' -> 's'), for a more readable representation of the matrix.
itos = dict((idx, ch) for ch, idx in stoi.items())

In [6]:
# Count how often each character is followed by each other character.
# Every word is wrapped in '.' markers so that the first and last characters
# of a word also form bigrams (with the special character).
# N is updated in place, so it is cleared first: re-running this cell then
# rebuilds the counts instead of adding every bigram a second time.
N.zero_()
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [7]:
# Turn row 0 of the count table (the characters that follow '.') into a
# probability distribution and draw a single index from it.
p = N[0].float()
p /= p.sum()  # normalize so the entries sum to 1
g = torch.Generator()
g.manual_seed(2147483647)  # fixed seed so the draw is reproducible
ix = torch.multinomial(p, 1, replacement=True, generator=g).item()
itos[ix]

'm'

In [8]:
# Normalize the whole count matrix in one go, row by row.
# Unsmoothed variant would be: P = N.float()
# Adding 1 to every count is a smoothing step: no bigram is left with a count
# (and hence a probability) of exactly zero.
P = N.float() + 1
P = P / P.sum(dim=1, keepdim=True)   # each row now sums to 1
P[0].sum()

tensor(1.)

In [9]:
# Assess the probabilities the table assigns to the bigrams of the dataset
# using the likelihood. A product of many small probabilities shrinks towards
# zero very quickly, so the log-probabilities are summed instead.
log_likelihood = 0.0
n = 0

for w in words:
    chs = ['.', *w, '.']
    for ch1, ch2 in zip(chs, chs[1:]):
        # log-probability of this one bigram under P
        log_likelihood += torch.log(P[stoi[ch1], stoi[ch2]])
        n += 1

print(f'{log_likelihood=}')

log_likelihood=tensor(-559951.5625)


In [10]:
# The ideal model assigns probability 1 to every bigram: each log-probability
# is then 0 and so is the log-likelihood.
# A loss is a quantity to be reduced, so the sign is flipped and the negative
# log-likelihood is used; dividing by the bigram count gives a per-bigram
# average.
nll = -log_likelihood
average_nll = nll / n
print(f'{nll = }')
print(f'Normalized_nll ={average_nll}')

nll = tensor(559951.5625)
Normalized_nll =2.4543561935424805


### Let's build a neural network (NN) based on the same bigram analysis of words

In [11]:
# Build a training set of bigrams (x, y): x is the index of the first
# character of the bigram and y the index of the character that follows it.
xs, ys = [], []
for w in words[:1]:   # only the first word for now
    chs = ['.', *w, '.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1, ix2 = stoi[ch1], stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)
xs, ys = torch.tensor(xs), torch.tensor(ys)
# Even the single first word ("emma") yields 5 training examples for the
# bigram NN.
xs, ys

. e
e m
m m
m a
a .


(tensor([ 0,  5, 13, 13,  1]), tensor([ 5, 13, 13,  1,  0]))

In [12]:
# One-hot encode the input indices: every example becomes a 27-dimensional
# vector with a single 1 at the position of its character. These 27 values are
# what the 27 input neurons of the network receive.
# The first word contributes 5 examples, so the input matrix is 5 x 27.
# (`F` is torch.nn.functional, imported with the other imports at the top of
# the notebook.)
xenc = F.one_hot(xs, num_classes=27).float()
xenc.shape

torch.Size([5, 27])

In [13]:
# Weights for a layer of 27 neurons, each fed by the 27 input values.
g = torch.Generator()
g.manual_seed(2147483647)   # fixed seed so the random weights are reproducible
W = torch.randn(27, 27, generator=g)
# One matrix product evaluates all 27 neurons on all examples at once
# (vectorized): the result has one row per example and one column per neuron.
xenc @ W

tensor([[ 1.5674e+00, -2.3729e-01, -2.7385e-02, -1.1008e+00,  2.8588e-01,
         -2.9643e-02, -1.5471e+00,  6.0489e-01,  7.9136e-02,  9.0462e-01,
         -4.7125e-01,  7.8682e-01, -3.2843e-01, -4.3297e-01,  1.3729e+00,
          2.9334e+00,  1.5618e+00, -1.6261e+00,  6.7716e-01, -8.4039e-01,
          9.8488e-01, -1.4837e-01, -1.4795e+00,  4.4830e-01, -7.0730e-02,
          2.4968e+00,  2.4448e+00],
        [ 4.7236e-01,  1.4830e+00,  3.1748e-01,  1.0588e+00,  2.3982e+00,
          4.6827e-01, -6.5650e-01,  6.1662e-01, -6.2197e-01,  5.1007e-01,
          1.3563e+00,  2.3445e-01, -4.5585e-01, -1.3132e-03, -5.1161e-01,
          5.5570e-01,  4.7458e-01, -1.3867e+00,  1.6229e+00,  1.7197e-01,
          9.8846e-01,  5.0657e-01,  1.0198e+00, -1.9062e+00, -4.2753e-01,
         -2.1259e+00,  9.6041e-01],
        [ 1.9359e-01,  1.0532e+00,  6.3393e-01,  2.5786e-01,  9.6408e-01,
         -2.4855e-01,  2.4756e-02, -3.0404e-02,  1.5622e+00, -4.4852e-01,
         -1.2345e+00,  1.1220e+00, -6.73

In [14]:
# Shape check: one row per example, one column per neuron.
torch.matmul(xenc, W).shape

torch.Size([5, 27])

###  SOFTMAX Formulation (exponentiation of elements and normalization) 

In [15]:
# The raw neuron outputs are arbitrary positive and negative numbers; they are
# interpreted as log-counts. Exponentiating turns them into positive "counts",
# which are easier to interpret, and dividing each row by its total turns the
# counts into probabilities.
logits = xenc @ W
counts = torch.exp(logits)
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals          # normalize the count matrix row-wise
probs[0].sum(), probs.shape

(tensor(1.0000), torch.Size([5, 27]))

In [16]:
# Probability distribution the net assigns to the 1st example (bigram), given
# the randomly generated weight matrix.
probs[0, :]

tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459])

##### Every training word produces a different number of bigrams, and each of those bigrams becomes one training example. For instance, if we train on two words, one producing 5 bigrams and the other producing 3, we simply treat all of them as 8 individual examples; there is no need to distinguish which word a bigram came from.

##### Furthermore, every bigram uses its first character as the input and its second character as the output (label).
##### The input for each example is the 27-dimensional one-hot encoding of the bigram's first character.

##### Another point to note is that the network has only a single layer of 27 neurons (there is no separate hidden layer), with "Softmax" as the activation function.

In [17]:
# Let's summarize what we have been doing with the NN so far.

In [18]:
# Walk through every bigram example and report the probability the net
# currently assigns to the correct next character, together with the resulting
# log likelihood and negative log likelihood. The mean of the per-example
# negative log likelihoods is the loss.
# The buffer is sized from the data rather than hard-coded to 5, so it always
# matches the number of examples the loop below visits.
nlls = torch.zeros(xs.shape[0])
for i in range(xs.shape[0]):
    # i-th bigram:
    x = xs[i].item()   # input character index
    y = ys[i].item()   # label character index
    print('-------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]}({x}, {y})')
    print('input to the neural net:', x)
    print('output label to the neural net:', y)
    p = probs[i, y]    # probability given to the correct label for example i
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log lieklihood:', nll.item())
    nlls[i] = nll

print('=======')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

-------
bigram example 1: .e(0, 5)
input to the neural net: 0
output label to the neural net: 5
probability assigned by the net to the correct character: 0.01228625513613224
log likelihood: -4.399273872375488
negative log lieklihood: 4.399273872375488
-------
bigram example 2: em(5, 13)
input to the neural net: 5
output label to the neural net: 13
probability assigned by the net to the correct character: 0.018050700426101685
log likelihood: -4.014570713043213
negative log lieklihood: 4.014570713043213
-------
bigram example 3: mm(13, 13)
input to the neural net: 13
output label to the neural net: 13
probability assigned by the net to the correct character: 0.026691533625125885
log likelihood: -3.623408794403076
negative log lieklihood: 3.623408794403076
-------
bigram example 4: ma(13, 1)
input to the neural net: 13
output label to the neural net: 1
probability assigned by the net to the correct character: 0.07367686182260513
log likelihood: -2.6080665588378906
negative log lieklihood:

### To improve the predictions, use gradient descent and backpropagation
                                    ## Optimization

In [19]:
# Reminder of the inputs and labels currently in use.
(xs, ys)

(tensor([ 0,  5, 13, 13,  1]), tensor([ 5, 13, 13,  1,  0]))

In [20]:
# Re-create the weights from the same seed, this time asking autograd to
# track them so that gradients of the loss with respect to W can be computed.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [21]:
# Forward pass: one-hot inputs -> logits -> softmax probabilities.
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [22]:
# The entries of `probs` that need to be optimized are the probabilities given
# to the correct label of each example, i.e. row i, column ys[i]:
#   probs[0, ys[0]], probs[1, ys[1]], probs[2, ys[2]], probs[3, ys[3]], ...
# The row indices are generated from the number of labels instead of a
# hard-coded 5, so the expression works for any number of examples.
# loss = average negative log likelihood of the correct labels.
loss = -probs[torch.arange(ys.shape[0]), ys].log().mean()

In [23]:
# Report the loss obtained with the current (still random) weights.
message = f'loss with current parameters are:{loss}'
print(message)

loss with current parameters are:3.7693049907684326


In [24]:
# backward_pass
# backward() accumulates into an existing .grad, so any gradient left over
# from a previous pass is discarded first.
W.grad = None   # reset: None means "no gradient stored", not a tensor of zeros
loss.backward()  # fills W.grad with d(loss)/dW

In [25]:
# The gradient has exactly the shape of W: every element of W.grad is the
# gradient used to update the matching weight in the next optimization step.
W.grad.size(), W.size()

(torch.Size([27, 27]), torch.Size([27, 27]))

In [26]:
# Update the parameters with one gradient-descent step (learning rate 0.1).
# The in-place update runs under torch.no_grad() rather than through W.data:
# the step itself is not recorded by autograd, but autograd still sees that W
# was modified, so a stale graph can no longer be silently reused.
with torch.no_grad():
    W += -0.1 * W.grad

In [27]:
# Repeat the forward pass with the updated weights to confirm that the loss
# went down. Row indices come from the number of labels instead of a
# hard-coded 5, so the cell works for any number of examples.
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(ys.shape[0]), ys].log().mean()
print(f'updated loss is:{loss}')

updated loss is:3.7492127418518066


### Let's clean up and consolidate the code
                                Crystal Clear

In [48]:
# Create the dataset: one (input index, label index) pair per bigram,
# this time taken from all the words.
xs, ys = [], []
for w in words:
    chs = ['.', *w, '.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1, ix2 = stoi[ch1], stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs, ys = torch.tensor(xs), torch.tensor(ys)
num = xs.numel()
print('number of example:', num)

# Initialize the 'network': a single 27x27 weight matrix tracked by autograd,
# drawn from a seeded generator so every run starts from the same weights.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

number of example: 228146


In [54]:
# Gradient descent on the full dataset.
# The one-hot encoding of the inputs does not depend on W, so it is built
# once here instead of being rebuilt on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()
for k in range(10):
    # Forward pass: logits -> softmax probabilities
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    # Regularization term: mean squared weight, scaled by 0.01 in the loss
    regularized_loss = (W**2).mean()
    # loss = average negative log likelihood of the labels + regularization
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01 * regularized_loss
    print('{:.4f}'.format(loss.item()))

    # Backward pass: discard the previous gradient, then recompute it
    W.grad = None
    loss.backward()

    # Update (learning rate 50). Done under torch.no_grad() rather than via
    # W.data so the step is not recorded by autograd while autograd still
    # sees that W was modified in place.
    with torch.no_grad():
        W += -50 * W.grad

2.5764
2.5711
2.5663
2.5618
2.5577
2.5539
2.5504
2.5472
2.5442
2.5414
