In [307]:
print('Lets build the model :)')
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
#In this modeling approach we are taking 3 previous and trying to predict the 4th word in the sequance

Lets build the model :)


In [308]:
# reading the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [309]:
len(words)

32033

In [310]:
# building the mappings of characters to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [311]:
#building the dataset
block_size = 3 #context length how many characters do we take to predict the next one
X,Y = [],[]
for w in words:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix] #crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

        

In [312]:
g = torch.Generator().manual_seed(219384384)
#creating a lookup table for the character embeddings
C = torch.randn((27,2), generator = g)
# Constructiong the hidden layer
#The number of inputs to this layer is going to be 3 * 2  because we hae to dimensional embeddings and we have 3 of them
# and its up to us to decide how many neurons we want inside the layer here we are going with 100 of them
W1 = torch.randn(6,100, generator = g )
b1 = torch.randn(100, generator = g) 
W2 = torch.randn(100, 27, generator = g) #our second layer will take 100 inputs and 
b2 = torch.randn(27, generator = g)
parameters = [C,W1, b1, W2, b2]

In [313]:
sum(p.nelement() for p in parameters) #tells us number of parameters in total

3481

In [314]:
for p in parameters:
    p.requires_grad = True
#we do this because p.requires_grad is false by default but insted of treating these tensors as constant we want pytorch to treat them as variable, Variable which requires to have gradients

In [315]:
for i in range(1000):
    #creating a minibatch
    ix = torch.randint(0,X.shape[0], (32,))
    #in the above line of code we are generating 32 random numbers between 0 and the number of examples(X.shape[0])
    # forward pass
    emb = C[X[ix]] #the shape is (32,3,2)
    # in the code above we are only selecting 32 examples with the index ix randomly and calculating the loss based on those examples
    #and the implemeting gradient based on that loss this will speed up the training
    h = torch.tanh(emb.view(-1,6) @ W1 + b1) #we are using tanh activation fn so the numbers in h will be -1 and 1 
    # We pass -1 to emb.view(), becuase we want pytorch to guess what what will be the right number for rows given we have already told it the number of columns are 6
    # here you can also use emb.reshape(32,6) the difference is emb.view() will not use extra space it ensure that the emb tensor and and the new tensor that we create will use the same data so no memory wastag
    logits = h @ W2 + b2 #the shape is 32,27
    # implementing the loss function
    loss = F.cross_entropy(logits, Y[ix])  #implementing the categorical cross entropy using pytorch
    print(f'loss= {loss.item()} for iteration= {i}') 
    #we use pytorch here because large positive numbers when exponentiated causes the overflow which can be handled by subtracting the biggest number in the logits from each element row wise, and pytorch handles this operation internally

    # Implementing backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # param update
    for p in parameters:
        p.data -= 0.1 * p.grad

loss= 17.7404842376709 for iteration= 0
loss= 17.38859748840332 for iteration= 1
loss= 13.848173141479492 for iteration= 2
loss= 14.321552276611328 for iteration= 3
loss= 9.560193061828613 for iteration= 4
loss= 15.206975936889648 for iteration= 5
loss= 12.192161560058594 for iteration= 6
loss= 12.558395385742188 for iteration= 7
loss= 11.932211875915527 for iteration= 8
loss= 13.271073341369629 for iteration= 9
loss= 9.676918029785156 for iteration= 10
loss= 10.957734107971191 for iteration= 11
loss= 9.470869064331055 for iteration= 12
loss= 10.225584030151367 for iteration= 13
loss= 10.456774711608887 for iteration= 14
loss= 8.826881408691406 for iteration= 15
loss= 10.016298294067383 for iteration= 16
loss= 8.247517585754395 for iteration= 17
loss= 8.32795238494873 for iteration= 18
loss= 8.87733268737793 for iteration= 19
loss= 8.656319618225098 for iteration= 20
loss= 9.51900863647461 for iteration= 21
loss= 9.376317024230957 for iteration= 22
loss= 7.886662006378174 for iteration