In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


Loading the dataset

In [2]:
f = open('datasets/train.txt','r',encoding="utf8")
total_str = f.read()
f.close()

We now extract all the unique tokens (in this case characters) from the dataset

In [3]:
vocab = sorted(list(set(list(total_str))))
voc_sz = len(vocab)
print(''.join(vocab))  # printing all the unique character
print(voc_sz)               

	
 !"'(),-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
77


Creating the encoding and decoding dictionaries

In [4]:
itoa = {}  # index to token 
atoi = {}  # token to index
for i,ele in enumerate(vocab):
    atoi[ele] = i+1
    itoa[i] = ele

Converting the string to an index torch tensor

In [5]:
Xdata = []
for e in total_str:
    Xdata.append(atoi[e])
Xdata = torch.tensor(Xdata)

In [6]:
cntxt = 8   # context length

In [7]:
Xt = []
Yt = []
for i in range(len(Xdata)-cntxt):
    Xt.append(Xdata[i:i+cntxt])
    Yt.append(Xdata[i+cntxt])
Xt = torch.stack(Xt)
Yt = torch.stack(Yt)

In [8]:
len(Xt)

1597676

In [9]:
Xt[0],Yt[0]  # dataset fist look

(tensor([ 1,  1, 30, 39, 42, 29, 47, 39]), tensor(42))

Train Test Val split

In [10]:
xtrain = Xt[:int(0.8*len(Xt))]
ytrain = Yt[:int(0.8*len(Yt))]
xval = Xt[int(0.8*len(Xt)):int(0.9*len(Xt))]
yval = Yt[int(0.8*len(Yt)):int(0.9*len(Yt))]
xtest = Xt[int(0.9*len(Xt)):]
ytest = Yt[int(0.9*len(Yt)):]

Variable definition

In [11]:
cntxt = 8  # context length (rewritten)
n_fl = 100  # number of neurons in 1st layer
n_ml = 256  # number of neurons in 2nd layer
n_octms = voc_sz  # output layer 
l_r = 0.0001  # learning rate
nl_r = 0.00001 # new learning rate
bat_sz = 4  # batch size
epoch = 10  # number of iterations on the entire dataset

During the creation of the model, the number of layers / activation funstion use / neuron per layer is entirely dependent on the dataset and thus needs to be customised by dryrunning the dataset on equivalent possibilities

uncomment the activation function which you want to try out below 

In [12]:
# -------------------------------------------------
model = nn.Sequential(
    nn.Linear(cntxt,n_fl),  # layer one 
    #nn.ReLU(),
    #nn.Tanh(),   
    #nn.Sigmoid(),
    nn.Linear(n_fl,n_ml),   # layer two
    #nn.ReLU(),
    #nn.Tanh(),
    #nn.Sigmoid(),
    # nn.Dropout(0.0),
    nn.Linear(n_ml,n_octms),  # output layer
    #nn.ReLU(),
    #nn.Tanh(),
    #nn.Sigmoid(),
    # nn.Softmax(),
)
# -------------------------------------------------

initializing the optimiser

In [13]:
optimizer = torch.optim.SGD(model.parameters(), lr=l_r)

Randomised indices selection for the training iteration

In [14]:
ix = (torch.randperm(len(xtrain)-bat_sz))

Accuracy funtion 

In [15]:
def accuracy():
    k,d=len(xval),len(xval)
    for n in range(len(xval)):
        omt = ((model(xval[n]))).tolist()
        # print(omt.index(max(omt)),ytest[n].item())

        k -= (omt.index(max(omt))==yval[n].item())
    accu = (((d-k)/d) *100)
    return("Accuracy = " + str(accu))

A batch of samples at once are sent into the model to calculate the possible output of the token which is then matched with the real value to calculate loss

In [26]:
for e in range(epoch):
    for i in ix:
        xt = xtrain[i:i+bat_sz]    # selecting the batch 
        yt = ytrain[i:i+bat_sz]     
        X_train = xt.view([bat_sz,cntxt])   # creating the appropriate tensor size
        Y_train = F.one_hot(yt,voc_sz).float()   # creating a one hot vector representation of the possible output

        t = model(X_train.float())
        loss = F.cross_entropy(t,Y_train)    # Loss definition
        optimizer.zero_grad()    

        loss.backward()     # accumulating the gradient
        optimizer.step()    # rectifying the weights using the gradients
        
        # print(loss.item())
        
    print(str(e+1) + " / " + str(epoch)+", " + accuracy())
    if(e == int(0.8*epoch)):
        optimizer = torch.optim.SGD(model.parameters(), lr=nl_r)  # decreasing the learning rate 



1 / 10, Accuracy = 76.64565739200646
2 / 10, Accuracy = 79.02380952380952
3 / 10, Accuracy = 80.92323845957248
4 / 10, Accuracy = 81.82793295923897
5 / 10, Accuracy = 82.35965486465846
6 / 10, Accuracy = 83.06614656651612
7 / 10, Accuracy = 83.81238464879212
8 / 10, Accuracy = 84.19588455623352
9 / 10, Accuracy = 84.32623587899555
10 / 10, Accuracy = 84.45945224522255



The parameters (Weights and biases) of the model are rectified in the direction opposite to the slope at the rate of learning specified 
In other words the weights and biases are getting better each iteration

In the case of traditional neural net testing on neural net, we can expect the accuracy to reach about 84% due to the absence of atention (transformer architecture) 

In [27]:
inper = [0]*bat_sz
leng = 400
for i in range(leng):
    oot = (model(torch.tensor(inper[-bat_sz:])))
    inper.append(oot.index(max(oot)))
    print(itoa[oot.index(max(oot))])


The thing to take away from here is setting up a neural net is easier as compared to actually using it for the real world applications. The quality of this neural net is not great that being said it helps you all to understand hot a neural network is applied on text data

Go make the models you want and test out stuff
Happy learning