In [2]:
import sys, random, math
from collections import Counter
import numpy as np

f = open('tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n","").split(" ")[1:])
    
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

In [3]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

In [4]:
np.random.seed(1)
embed_size = 10

embed = (np.random.rand(len(vocab), embed_size) - 0.5) * 0.1

recurrent = np.eye(embed_size)

start = np.zeros(embed_size)

decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1 # output weights

one_hot = np.eye(len(vocab))

In [5]:
def predict(sent):
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)
    
    loss = 0
    
    preds = list()
    for target_i in range(len(sent)):
        layer = {}
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]
        layers.append(layer)
    
    return layers, loss

In [6]:
def net():
    global decoder, recurrent, start
    for iter in range(30000): # Forwardprop
        alpha = 0.001
        sent = words2indices(tokens[iter%len(tokens)][1:])
        layers, loss = predict(sent)

        for layer_idx in reversed(range(len(layers))): # Backprop
            layer = layers[layer_idx]
            target = sent[layer_idx-1]
            
            if (layer_idx > 0): # if its not the first layer
                layer['output_delta'] = layer['pred'] - one_hot[target]
                new_hidden_delta = layer['output_delta'].dot(decoder.T)
                
                if (layer_idx == len(layers) - 1):
                    layer['hidden_delta'] = new_hidden_delta
                else:
                    layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.T)
            else:
                layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.T)
                
        start -= layers[0]['hidden_delta'] * alpha/float(len(sent))
        for layer_idx,layer in enumerate(layers[1:]):
            decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / float(len(sent))
            
            embed_idx = sent[layer_idx]
            embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sent))
            
            recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sent))
            
        if (iter % 1000 == 0):
            print("Perplexity: ", np.exp(loss/len(sent)))
                

In [7]:
net()

Perplexity:  82.04853763160185
Perplexity:  81.97865097135555
Perplexity:  81.86092949447317
Perplexity:  81.62319186189592
Perplexity:  81.11862125139729
Perplexity:  79.99830202398905
Perplexity:  77.22750833796393
Perplexity:  68.0611757652277
Perplexity:  37.750523481948505
Perplexity:  22.653961024431805
Perplexity:  20.005867935205345
Perplexity:  18.91573063754801
Perplexity:  17.801768234726683
Perplexity:  16.282192885700297
Perplexity:  13.940173076229952
Perplexity:  10.846999160481582
Perplexity:  8.289444628471909
Perplexity:  6.932582994587655
Perplexity:  6.084344811808846
Perplexity:  5.490221485013707
Perplexity:  5.109966296676601
Perplexity:  4.88505560771513
Perplexity:  4.728635496436536
Perplexity:  4.622246242480398
Perplexity:  4.557733531920261
Perplexity:  4.515676030407581
Perplexity:  4.473599177027516
Perplexity:  4.421592771482916
Perplexity:  4.361734814415217
Perplexity:  4.2967153824719455


In [10]:
sent_index = 4

l, _ = predict(words2indices(tokens[sent_index]))
print(tokens[sent_index])

for i,each_layer in enumerate(l[1:-1]):
    inp = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:", inp, (' ' * (12 - len(inp))) +\
          "True:", true, (" " * (15 - len(true))), "Pred:", pred)

['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input: sandra       True: moved            Pred: is
Prev Input: moved        True: to               Pred: to
Prev Input: to           True: the              Pred: the
Prev Input: the          True: garden.          Pred: bedroom.
