In this notebook we take a neural-network approach to bigrams instead of the counting approach.\
The network takes a character as input and outputs a probability distribution over the next character.

We create a training dataset from the same bigrams that we used in the last notebook.


In [1]:
import torch
import torch.nn.functional as F

In [2]:
# read the dataset, one entry per line; the context manager closes the file handle
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [3]:
# build the character <-> index lookup tables
# '.' is the special start/end token and gets index 0;
# every other character gets an index from 1 to len(chars), in sorted order
chars = sorted(set(''.join(words)))
stoi = {c: i+1 for i, c in enumerate(chars)}
stoi['.'] = 0
# inverse mapping: index -> character
itos = {i: s for s, i in stoi.items()}

In [4]:
# build the training pairs (input index, target index) from the bigrams of the first word only
inputs, targets = [], []
for word in words[:1]:
    # wrap the word in the '.' start/end token
    padded = '.' + word + '.'
    for prev_ch, next_ch in zip(padded, padded[1:]):
        prev_ix, next_ix = stoi[prev_ch], stoi[next_ch]
        print(prev_ch, next_ch)
        inputs.append(prev_ix)
        targets.append(next_ix)

# convert the index lists to tensors
xs = torch.tensor(inputs)
ys = torch.tensor(targets)

. e
e m
m m
m a
a .


In [5]:
# inputs: index of the first character of every bigram
xs

tensor([ 0,  5, 13, 13,  1])

In [6]:
# targets: index of the second character of every bigram
ys

tensor([ 5, 13, 13,  1,  0])

In [7]:
# Raw integer indices cannot be fed to the network as they are; each index is
# converted into a vector of uniform length using one-hot encoding.
onehot_int = F.one_hot(xs, num_classes=27)
# F.one_hot has no dtype parameter and always returns an integer tensor,
# so the result is cast to float explicitly.
xenc = onehot_int.float()
xenc.shape

torch.Size([5, 27])

In [8]:
# a single neuron: one weight per input dimension (27), drawn from a standard normal distribution
W = torch.randn(27, 1)

# plain matrix multiplication: (5, 27) @ (27, 1) -> (5, 1), one activation per example
xenc @ W

tensor([[-1.2015],
        [-0.2023],
        [-0.0802],
        [-0.0802],
        [-0.0148]])

In [9]:
# a full layer of 27 neurons, each with 27 input weights (seeded for reproducibility)
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)
(xenc @ W).shape
# (5, 27) @ (27, 27) -> (5, 27): the firing rate of each of the 27 neurons on each of the five inputs

torch.Size([5, 27])

In [10]:
# firing rate of the neuron at index 20 on the example at index 2
print((xenc @ W)[2, 20])
# the same number computed by hand: dot product of that example's one-hot row
# with column 20 of W (tensor .sum() instead of Python's element-by-element sum)
print((xenc[2] * W[:, 20]).sum())

tensor(-0.2129)
tensor(-0.2129)


In [11]:
# The whole network is this single layer: 27 neurons, no non-linearity, just the linear output.
print((xenc @ W).shape)
# How do we interpret the outputs? We want something like probabilities, but the raw
# numbers are not all between 0 and 1. We interpret them as log-counts of bigrams,
# so exponentiating them gives something that behaves like counts.

logits = xenc @ W  # log-counts; logits are the unnormalized last-layer values
counts = logits.exp()  # positive values, playing the role of the count matrix N from the counting bigram model
probs = counts / counts.sum(1, keepdim=True)  # each row is normalized, so it can be interpreted as probabilities
# exponentiating and then normalizing is exactly the softmax function

# These values come from a neural network: logits are computed from W and x,
# exponentiated into count-like values, and normalized into probabilities.
# All of these operations are differentiable, so they can be back-propagated through.

# Loss: the average negative log-probability of the correct next character.
# ys holds the ground-truth labels, so for example i the relevant probability
# is probs[i, ys[i]].
# The row indices are derived from ys rather than hard-coded, so the expression
# works for any number of examples.
loss = -probs[torch.arange(ys.nelement()), ys].log().mean()
loss

torch.Size([5, 27])


tensor(3.7693)

In [12]:
# re-create the 27x27 weight matrix from the same seed, this time tracked by autograd
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [13]:
# forward pass
xenc = F.one_hot(xs, num_classes=27).float()  # one-hot encode the inputs
logits = xenc @ W  # log-counts
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)  # softmax: each row is normalized
# average negative log-probability of the correct next character;
# the row indices are derived from ys instead of a hard-coded example count
loss = -probs[torch.arange(ys.nelement()), ys].log().mean()

In [14]:
# backward pass
W.grad = None # clear the previous gradient; setting it to None is a cheaper way to reset it than filling it with zeros
loss.backward() # back-propagate from the loss, which fills in W.grad

In [15]:
# show the current loss as a plain Python number
print(loss.item())

3.7693049907684326


In [16]:
# update weights: one gradient-descent step with learning rate 0.1
# the in-place update runs under no_grad so autograd does not record it,
# which avoids mutating the tensor behind autograd's back through .data
with torch.no_grad():
    W += -0.1 * W.grad

In [17]:
# The whole notebook, summarized in code

# dataset: one (input index, target index) pair per bigram, over every word;
# pairing '.word.' with 'word.' yields each character together with its successor
pairs = [
    (stoi[prev_ch], stoi[next_ch])
    for word in words
    for prev_ch, next_ch in zip('.' + word + '.', word + '.')
]
xs = torch.tensor([prev_ix for prev_ix, _next_ix in pairs])
ys = torch.tensor([next_ix for _prev_ix, next_ix in pairs])
nums = xs.nelement()
print("Number of samples:", nums)

# network: a single 27x27 weight matrix, seeded for reproducibility and tracked by autograd
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

Number of samples: 228146


In [18]:
# training loop: full-batch gradient descent on the average negative log-likelihood
steps = 500
lr = 50

# the one-hot inputs and the row indices do not depend on W,
# so they are built once instead of on every step
xenc = F.one_hot(xs, num_classes=27).float()
rows = torch.arange(nums)

for k in range(steps):
    # forward pass
    logits = xenc @ W  # log-counts
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)  # softmax: each row is normalized
    loss = -probs[rows, ys].log().mean()

    # backward pass
    W.grad = None
    loss.backward()
    print(f"step {k} => Loss: {loss.item()}")

    # update weights; under no_grad so autograd does not record the in-place step
    with torch.no_grad():
        W += -lr * W.grad

step 0 => Loss: 3.758953809738159
step 1 => Loss: 3.371100902557373


step 2 => Loss: 3.154043197631836
step 3 => Loss: 3.020373821258545
step 4 => Loss: 2.927711248397827
step 5 => Loss: 2.8604023456573486
step 6 => Loss: 2.8097290992736816
step 7 => Loss: 2.7701022624969482
step 8 => Loss: 2.7380728721618652
step 9 => Loss: 2.711496353149414
step 10 => Loss: 2.6890032291412354
step 11 => Loss: 2.6696884632110596
step 12 => Loss: 2.6529300212860107
step 13 => Loss: 2.638277292251587
step 14 => Loss: 2.6253879070281982
step 15 => Loss: 2.613990545272827
step 16 => Loss: 2.603863477706909
step 17 => Loss: 2.5948219299316406
step 18 => Loss: 2.5867116451263428
step 19 => Loss: 2.579403877258301
step 20 => Loss: 2.572789192199707
step 21 => Loss: 2.5667762756347656
step 22 => Loss: 2.5612878799438477
step 23 => Loss: 2.5562586784362793
step 24 => Loss: 2.551633596420288
step 25 => Loss: 2.547365665435791
step 26 => Loss: 2.543415069580078
step 27 => Loss: 2.539748430252075
step 28 => Loss: 2.5363364219665527
step 29 => Loss: 2.5331544876098633
step 30 => Lo

In [19]:
# sampling from this neural network

g = torch.Generator().manual_seed(2147483647)

# sampling only needs forward passes, so autograd tracking is switched off
with torch.no_grad():
    for i in range(10):
        out = []
        ix = 0  # every word starts from index 0
        while True:
            xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            logits = xenc @ W  # log-counts for the next character
            counts = logits.exp()
            p = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:  # we have reached the end of the word
                break
        print(''.join(out))


# We got the exact same names as we got with the counting bigram model: both models represent exactly the same thing, hence we got the same loss.
# The bigram counting table is not scalable: storing longer n-grams would quickly become too difficult, so it is not flexible.
# The gradient-based method is much more scalable and flexible in that respect.

cexze.
momasurailezitynn.
konimittain.
llayn.
ka.
da.
staiyaubrtthrigotai.
moliellavo.
ke.
teda.
