In [51]:
import sys, random, math
from collections import Counter
import numpy as np

In [52]:
# Read the bAbI task-1 training file into a list of lines (newlines kept).
# The `with` block closes the file even if reading raises part-way through.
with open("tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt", 'r') as f:
    raw = f.readlines()
         

In [53]:
# Peek at the first ten raw lines to see the file layout
# (line number, sentence text, and tab-separated answer fields on question lines).
raw[:10]

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n',
 '4 Daniel went back to the hallway.\n',
 '5 Sandra moved to the garden.\n',
 '6 Where is Daniel? \thallway\t4\n',
 '7 John moved to the office.\n',
 '8 Sandra journeyed to the bathroom.\n',
 '9 Where is Daniel? \thallway\t4\n',
 '10 Mary moved to the hallway.\n']

In [54]:
# Tokenise the first 1000 lines: lowercase, strip the newline, split on single
# spaces and drop the leading line number. Punctuation stays attached to words,
# and the tab-separated answer fields of question lines stay together as one token.
tokens = [
    line.lower().replace("\n", "").split(" ")[1:]
    for line in raw[0:1000]
]
print(tokens[0:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1']]


In [55]:
# Show the tokens of the second line as a quick sanity check.
print(tokens[1])

['john', 'went', 'to', 'the', 'hallway.']


In [56]:
# Build the vocabulary: every distinct token seen in the tokenised sentences.
# It is sorted so that each word receives the same index on every run. Iteration
# order of a set of strings changes between interpreter sessions (string hash
# randomisation), which would otherwise pair the seeded weights with different
# words each time the notebook is restarted.
vocab = sorted({word for sent in tokens for word in sent})

# Map each token to its position in `vocab`.
word2index = {word: ii for ii, word in enumerate(vocab)}

In [57]:
def word2indices(sentence):
    """Translate a sequence of tokens into their vocabulary indices.

    Each token is looked up in the module-level ``word2index`` mapping, so a
    token that is not in the mapping raises ``KeyError``.

    Returns a new list with one index per token, in the same order.
    """
    return [word2index[token] for token in sentence]

def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The overall maximum of ``x`` is subtracted before exponentiating so that
    large inputs do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum(axis = 0)

In [58]:
# Model parameters. The two random draws are kept in this order (embed first,
# decoder second) so the seeded generator hands out the same values.
np.random.seed(1)
embed_size = 10

# Random weights, uniform in [-0.05, 0.05).
embed = 0.1 * (np.random.rand(len(vocab), embed_size) - 0.5)    # one row per vocabulary word
decoder = 0.1 * (np.random.rand(embed_size, len(vocab)) - 0.5)  # hidden state -> vocabulary scores

# Deterministic pieces.
recurrent = np.eye(embed_size)   # hidden-to-hidden weights start as the identity
start = np.zeros(embed_size)     # initial hidden state
one_hot = np.eye(len(vocab))     # row i is the one-hot target vector for word index i

In [59]:
def predict(sent):
    """Run the RNN forward over a sequence of word indices.

    Uses the module-level parameters ``start``, ``decoder``, ``recurrent`` and
    ``embed``.

    Returns ``(layers, loss)``:
      * ``layers`` is a list of dicts. The first holds only ``"hidden"`` (the
        ``start`` array itself, not a copy). Each following entry holds
        ``"pred"``, the softmax over the vocabulary computed from the previous
        hidden state, and ``"hidden"``, the new hidden state after consuming
        the word at that position.
      * ``loss`` is the summed negative log-probability that each ``"pred"``
        assigned to the word actually at that position.
    """
    history = [{"hidden": start}]
    loss = 0

    for word_idx in sent:
        prev_hidden = history[-1]["hidden"]

        step = {}
        # Predict this word from the state built up so far, then score it.
        step["pred"] = softmax(prev_hidden.dot(decoder))
        loss += -np.log(step["pred"][word_idx])

        # Fold the word into the hidden state for the next prediction.
        step["hidden"] = prev_hidden.dot(recurrent) + embed[word_idx]
        history.append(step)

    return history, loss

    
    

In [60]:
# Forward pass over the second sentence with the freshly initialised weights
# (when the cells are run top to bottom).
# Note: `layer` receives the full list of per-timestep dicts returned by predict.
layer, loss = predict(word2indices(tokens[1]))
print(layer)

[{'hidden': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}, {'pred': array([0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.01219512,
       0.01219512, 0.01219512, 0.01219512, 0.01219512, 0.0

In [61]:
#Backprop and update of RNN
alpha = 0.001          # learning rate; loop-invariant, so it is set once
n_iterations = 30000   # total number of single-sentence updates
report_every = 1000    # print perplexity every this many iterations

# The loop variable is `iteration`, not `iter`: at notebook top level a variable
# called `iter` would shadow the builtin for every cell that runs afterwards.
for iteration in range(n_iterations):
    # Indices for the second through last word of the sentence (cycling through tokens).
    sent = word2indices(tokens[iteration % len(tokens)][1:])
    if not sent:
        # Nothing to predict. Without this guard the backward pass would fail on
        # sent[-1] and the updates would divide by zero.
        continue
    layers, loss = predict(sent)
    n_words = float(len(sent))

    # ---- Backward pass: walk the timesteps from last to first ----
    for layer_idx in reversed(range(len(layers))):

        layer = layers[layer_idx]  # the current timestep

        # Not the first timestep
        if layer_idx > 0:
            # layers[k] (k >= 1) holds the prediction for sent[k - 1]
            target = sent[layer_idx - 1]
            # Error of the decoder output: predicted distribution minus one-hot target
            layer["output_delta"] = layer["pred"] - one_hot[target]
            # Push that error back onto each hidden dimension
            new_hidden_delta = layer["output_delta"].dot(decoder.T)
            if layer_idx == len(layers) - 1:
                # Last timestep: no later timestep contributes
                layer["hidden_delta"] = new_hidden_delta
            else:
                # Also add the delta flowing back from the next timestep
                layer["hidden_delta"] = new_hidden_delta + layers[layer_idx + 1]["hidden_delta"].dot(recurrent.T)

        else:
            # Initial hidden state: it has no prediction of its own, so it only
            # receives the delta passed back from the next timestep
            layer["hidden_delta"] = layers[layer_idx + 1]["hidden_delta"].dot(recurrent.T)

    # ---- Weight updates, each scaled by alpha / sentence length ----
    # layers[0]["hidden"] is the same array object as `start` (predict stores it
    # without copying), so this in-place update is already visible when the loop
    # below reads layers[0]["hidden"]. Statement order is kept for that reason.
    start -= layers[0]["hidden_delta"] * alpha / n_words
    for layer_idx, layer in enumerate(layers[1:]):
        # Here `layer` is layers[layer_idx + 1]; layers[layer_idx] is the timestep before it.

        # Decoder: input is the previous hidden state, error is this step's output delta
        decoder -= np.outer(layers[layer_idx]["hidden"], layer["output_delta"]) * alpha / n_words

        embed_idx = sent[layer_idx]
        # NOTE(review): the embedding and recurrent updates both read the
        # hidden_delta stored at position layer_idx (one slot before `layer`) -
        # confirm this is the intended timestep.
        embed[embed_idx] -= layers[layer_idx]["hidden_delta"] * alpha / n_words
        recurrent -= np.outer(layers[layer_idx]["hidden"], layers[layer_idx]["hidden_delta"]) * alpha / n_words

    if iteration % report_every == 0:
        # Perplexity = exp(mean negative log-likelihood per word)
        print("Perplexity:" + str(np.exp(loss/len(sent))))
    
           
                
            

Perplexity:82.05801257706915
Perplexity:81.7990793441101
Perplexity:81.42272196378181
Perplexity:80.69903225946521
Perplexity:78.98693438589551
Perplexity:73.08526905400181
Perplexity:34.691344532264374
Perplexity:21.834262201639888
Perplexity:21.36056083292534
Perplexity:20.798918847005446
Perplexity:20.195492452086448
Perplexity:19.51138466202749
Perplexity:18.758623223580457
Perplexity:18.09651341403817
Perplexity:17.532642141254286
Perplexity:16.947618398556735
Perplexity:16.467411467075777
Perplexity:16.08121997874825
Perplexity:15.728543664270317
Perplexity:15.422030818687233
Perplexity:14.012238135413842
Perplexity:13.624185010349143
Perplexity:13.894423778320744
Perplexity:12.88620425377288
Perplexity:12.120166794194914
Perplexity:10.591269448030882
Perplexity:7.993765313467362
Perplexity:7.625368992575441
Perplexity:6.808815460716412
Perplexity:7.156277969956574


In [None]:
# Updating layers[state] during backprop
# Is this the first timestep (the initial hidden state)?
# If yes: there is no output layer, so its delta is just the next timestep's delta passed back to each hidden dimension
# If no:
    # If it is the last timestep:
        # Compute the output delta only, then pass it back to each hidden dimension