In [1]:
!pip install mxnet



# New Section

In [0]:
from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd
import numpy as np
import string
import re
import os
mx.random.seed(1)
ctx = mx.cpu()

In [0]:
#Preprocessing the data

'''
Read reviews from a JSON-formatted file into an array.
'''
lines = [] 

with open('train.txt', 'r') as f:
    for line in f:
        line = line.strip()
        lines.append(line)

num_train = len(lines)

with open('test.txt', 'r') as f:
    for line in f:
        line = line.strip()
        lines.append(line)        

reviews = lines

'''
Clean each document by removing unnecesary characters and splitting by space.
'''
def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0];
    
    return doco_clean;

# Generate a cleaned reviews array from original review texts
review_cleans = [clean_document(doc) for doc in reviews];
sentences = [' '.join(r) for r in review_cleans]


In [0]:
lengths = [len(item.split(' ')) for item in sentences]
MAX_SEQUENCE_LENGTH = max(lengths)

word_list = []
for sent in sentences:
    words = sent.split(' ')
    for word in words:
        if word not in word_list:
            word_list.append(word)

In [0]:
#Embeddings matrix using one-hot encoding.
# EMBEDDING_DIM = 100
one_hot_emb_matrix = nd.zeros((len(word_list) + 1, len(word_list) + 1), ctx = ctx)
one_hot_emb_matrix[0,0] = 1
for i in range(len(word_list)):
    one_hot_emb_matrix[i+1,i+1] = 1

In [6]:
embeddings_index = {}
# rev_emb_index = {}
!wget -O glove.6B.100d.txt https://worksheets.codalab.org/rest/bundles/0xd16b6c21f7a44270908b95992812f39f/contents/blob/
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
#     rev_emb_index[list(coefs)] = word
f.close()

--2019-03-24 16:58:51--  https://worksheets.codalab.org/rest/bundles/0xd16b6c21f7a44270908b95992812f39f/contents/blob/
Resolving worksheets.codalab.org (worksheets.codalab.org)... 40.71.231.153
Connecting to worksheets.codalab.org (worksheets.codalab.org)|40.71.231.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘glove.6B.100d.txt’

glove.6B.100d.txt       [            <=>     ] 331.04M   137MB/s    in 2.4s    

2019-03-24 16:58:54 (137 MB/s) - ‘glove.6B.100d.txt’ saved [347116733]



In [0]:
#Embeddings matrix using pre-trained embeddings.
EMBEDDING_DIM = 100
pretr_emb_matrix = nd.zeros((len(word_list) + 1, EMBEDDING_DIM), ctx = ctx)
for i, word in enumerate(word_list):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        pretr_emb_matrix[i + 1] = embedding_vector

In [0]:
def train_and_test_data(num_train, word_list, emb_matrix, sentences, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
    data = nd.zeros((len(sentences), MAX_SEQUENCE_LENGTH-1, EMBEDDING_DIM), ctx = ctx)
    for idx, sent in enumerate(sentences):
        words = sent.split(' ')[:-1]
        for i in range(MAX_SEQUENCE_LENGTH - 1 - len(sent)):
            words.append('<\s>')
        for idx2, word in enumerate(words):
            if word not in word_list:
                data[idx, idx2, :] = emb_matrix[0]
            else:
                data[idx, idx2, :] = emb_matrix[word_list.index(word)]
    return(data[0:num_train], data[num_train:])
#returns X_train, y_train, X_test, y_test

In [0]:
#Using pre-trained embedding
train, test = train_and_test_data(num_train, word_list, pretr_emb_matrix, sentences, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

In [10]:
seq_length = MAX_SEQUENCE_LENGTH - 1
vocab_size = len(word_list) + 1
batch_size = 50
num_batches = len(train) // batch_size
print('# of batches: ', num_batches)
train_data = train[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, EMBEDDING_DIM))
# swap batch_size and seq_length axis to make later access easier
train_data = nd.swapaxes(train_data, 0, 1)
train_data = nd.swapaxes(train_data, 1, 2)
print(train_data.shape)

# of batches:  72
(72, 17, 50, 100)


In [0]:
def train_and_test_labels(num_train, word_list, emb_matrix, sentences, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
    data = nd.zeros((len(sentences), MAX_SEQUENCE_LENGTH-1, EMBEDDING_DIM), ctx = ctx)
    for idx, sent in enumerate(sentences):
        words = sent.split(' ')[1:]
        for i in range(MAX_SEQUENCE_LENGTH - 1 - len(sent)):
            words.append('<\s>')
        for idx2, word in enumerate(words):
            if word not in word_list:
                data[idx, idx2, :] = emb_matrix[0]
            else:
                data[idx, idx2, :] = emb_matrix[word_list.index(word)]
    return(data[0:num_train], data[num_train:])
#returns X_train, y_train, X_test, y_test

In [0]:
#Using pre-trained embedding
train_l, test_l = train_and_test_labels(num_train, word_list, pretr_emb_matrix, sentences, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

In [13]:
seq_length = MAX_SEQUENCE_LENGTH - 1
vocab_size = len(word_list) + 1
batch_size = 50
num_batches = len(train_l) // batch_size
print('# of batches: ', num_batches)
train_label = train_l[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, EMBEDDING_DIM))
# swap batch_size and seq_length axis to make later access easier
train_label = nd.swapaxes(train_label, 0, 1)
train_label = nd.swapaxes(train_label, 1, 2)
print(train_label.shape)

# of batches:  72
(72, 17, 50, 100)


In [0]:
def softmax(y_linear, temperature=1.0):
    lin = (y_linear-nd.max(y_linear, axis=1).reshape((-1,1))) / temperature # shift each row of y_linear by its max
    exp = nd.exp(lin)
    partition =nd.sum(exp, axis=1).reshape((-1,1))
    return exp / partition

In [0]:
def simple_rnn(inputs, state, temperature=1.0):
    outputs = []
    h = state
    for X in inputs:
        h_linear = nd.dot(X, Wxh) + nd.dot(h, Whh) + bh
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)

In [0]:
def cross_entropy(yhat, y):
    return - nd.mean(nd.sum(y * nd.log(yhat), axis=0, exclude=True))

In [0]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

In [0]:
def grad_clip(grad, max_grad):
    if len(grad.shape) == 1:
        for i in range(grad.shape[0]):
            if grad[i] > max_grad:
                grad[i] = max_grad
            elif grad[i] < -max_grad:
                grad[i] = -max_grad
    elif len(grad.shape) == 2:
        for i in range(grad.shape[0]):
            for j in range(grad.shape[1]):
                if grad[i][j] > max_grad:
                    grad[i][j] = max_grad
                elif grad[i][j] < -max_grad:
                    grad[i][j] = -max_grad
    return(grad)
def SGD(params, lr):
    for param in params:
        param.grad[0]=mx.ndarray.clip(param.grad[0],-10,10)

In [0]:
num_inputs = EMBEDDING_DIM
num_hidden = 64
num_outputs = EMBEDDING_DIM

########################
#  Weights connecting the inputs to the hidden layer
########################
Wxh = nd.random_normal(shape=(num_inputs,num_hidden), ctx=mx.cpu(0)) * .01

########################
#  Recurrent weights connecting the hidden layer across time steps
########################
Whh = nd.random_normal(shape=(num_hidden,num_hidden), ctx=mx.cpu(0)) * .01

########################
#  Bias vector for hidden layer
########################
bh = nd.random_normal(shape=num_hidden, ctx=mx.cpu(0)) * .01


########################
# Weights to the output nodes
########################
Why = nd.random_normal(shape=(num_hidden,num_outputs), ctx=mx.cpu(0)) * .01
by = nd.random_normal(shape=num_outputs, ctx=mx.cpu(0)) * .01

# NOTE: to keep notation consistent,
# we should really use capital letters
# for hidden layers and outputs,
# since we are doing batchwise computations

In [0]:
params = [Wxh, Whh, bh, Why, by]

for param in params:
    param.attach_grad()

In [21]:
epochs = 12
moving_loss = 0.

learning_rate = .1

# state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
for e in range(epochs):
    ############################
    # Attenuate the learning rate by a factor of 2 every 100 epochs.
    ############################
    if ((e+1) % 100 == 0):
        learning_rate = learning_rate / 2.0
    state = nd.zeros(shape=(batch_size, num_hidden), ctx=mx.cpu(0))
    for i in range(num_batches):
#         print(i)
        dat_one_hot = train_data[i]
        lab_one_hot = train_label[i]
        data_one_hot = nd.zeros(dat_one_hot.shape, ctx=mx.cpu(0))
        label_one_hot = nd.zeros(lab_one_hot.shape, ctx=mx.cpu(0))
        dat_one_hot.copyto(data_one_hot)
        lab_one_hot.copyto(label_one_hot)
        with autograd.record():
            outputs, state = simple_rnn(data_one_hot, state)
            loss = average_ce_loss(outputs, label_one_hot)
            loss.backward()
        SGD(params, learning_rate)
        print(i)
#         for param in params:
#             print(param.grad)

        ##########################
        #  Keep a moving average of the losses
        ##########################
        if (i == 0) and (e == 0):
            moving_loss = np.mean(loss.asnumpy()[0])
        else:
            moving_loss = .99 * moving_loss + .01 * np.mean(loss.asnumpy()[0])

    print("Epoch %s. Loss: %s" % (e, moving_loss))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
Epoch 0. Loss: -3.300260478674844
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
Epoch 1. Loss: -3.2284833421861565
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
Epoch 2. Loss: -3.193672050406152
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
Epoch 3. Loss: -3.176788874304754
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1

In [0]:
import pickle
with open('Pretrained_embeddings_parameters.pkl','wb') as fp:
    pickle.dump(params,fp)

In [23]:
#inference
seq_length = MAX_SEQUENCE_LENGTH - 1
vocab_size = len(word_list) + 1
batch_size = 50
num_batches = len(test) // batch_size
print('# of batches: ', num_batches)
test_data = test[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, EMBEDDING_DIM))
# swap batch_size and seq_length axis to make later access easier
test_data = nd.swapaxes(test_data, 0, 1)
test_data = nd.swapaxes(test_data, 1, 2)
print(test_data.shape)

# of batches:  13
(13, 17, 50, 100)


In [24]:
seq_length = MAX_SEQUENCE_LENGTH - 1
vocab_size = len(word_list) + 1
batch_size = 50
num_batches = len(test_l) // batch_size
print('# of batches: ', num_batches)
test_label = test_l[:num_batches*batch_size].reshape((batch_size, num_batches, seq_length, EMBEDDING_DIM))
# swap batch_size and seq_length axis to make later access easier
test_label = nd.swapaxes(test_label, 0, 1)
test_label = nd.swapaxes(test_label, 1, 2)
print(test_label.shape)

# of batches:  13
(13, 17, 50, 100)


In [25]:
#Last word only
state = nd.zeros(shape=(batch_size, num_hidden), ctx=mx.cpu(0))
count=0
# preds = []
for i in range(num_batches):
#         print(i)
    dat_one_hot = test_data[i]
    lab_one_hot = test_label[i]
    data_one_hot = nd.zeros(dat_one_hot.shape, ctx=mx.cpu(0))
    label_one_hot = nd.zeros(lab_one_hot.shape, ctx=mx.cpu(0))
    dat_one_hot.copyto(data_one_hot)
    lab_one_hot.copyto(label_one_hot)
#         with autograd.record():
    outputs, state = simple_rnn(data_one_hot, state)
    pred = outputs[-1]
    target = label_one_hot[-1,:,:]
    for i in range(batch_size):
        if sum(list(pred[i] - target[i])) == 1.0000:
            count+=1
print('Accuracy is ' + str(100*count/(batch_size*num_batches)) + str('%.'))

Accuracy is 0.46153846153846156%.
