# High-level RNN Gluon Example

In [11]:
import os
import sys
import numpy as np
import math
import mxnet as mx
from mxnet import gluon
from common.params_lstm import *
from common.utils import *

In [9]:
# Record the software stack this run was produced with.
for label, value in (
    ("OS: ", sys.platform),
    ("Python: ", sys.version),
    ("MXNet: ", mx.__version__),
    ("Numpy: ", np.__version__),
):
    print(label, value)
# Hardware / driver details come from the helpers imported from common.utils.
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19) 
[GCC 7.2.0]
MXNet:  1.3.0
Numpy:  1.13.3
GPU:  ['Tesla V100-SXM2-16GB', 'Tesla V100-SXM2-16GB', 'Tesla V100-SXM2-16GB', 'Tesla V100-SXM2-16GB']
CUDA Version 9.1.85
CuDNN Version  7.1.3


In [60]:
# Show the shared hyper-parameters, one per line.
print(MAXFEATURES, EMBEDSIZE, NUMHIDDEN, MAXLEN, sep="\n")

30000
125
100
150


## Create the model

In [136]:
class RNN(gluon.Block):
    """Embedding -> single-layer unidirectional GRU -> Dense(2) classifier.

    Parameters
    ----------
    maxf : int
        Vocabulary size, used as the embedding's ``input_dim``.
    edim : int
        Embedding width, used as the embedding's ``output_dim``.
    nhid : int
        Hidden size of the GRU layer.
    **kwargs
        Forwarded unchanged to ``gluon.Block.__init__``.
    """

    def __init__(self, 
                 maxf=MAXFEATURES, edim=EMBEDSIZE, nhid=NUMHIDDEN, **kwargs):
        super(RNN, self).__init__(**kwargs)
        self.nhid = nhid
        with self.name_scope():
            self.embedding = gluon.nn.Embedding(input_dim=maxf,
                                          output_dim=edim)
            # layout="NTC": the GRU consumes (batch, time, channel) input.
            self.gru = gluon.rnn.GRU(
                              hidden_size=nhid, 
                              num_layers=1,
                              layout="NTC",
                              bidirectional=False)   
            self.l_out = gluon.nn.Dense(units=2)

    def forward(self, x):
        """Return the two class scores computed from the last GRU time step.

        ``x`` holds token indices, one row per sample
        (assumed shape (batch, seq_len) -- TODO confirm against caller).
        """
        x = self.embedding(x) 
        x = self.gru(x) # default state will be all 0
        # Keep only the final time step. The integer index removes the time
        # axis by itself; the former axis-less .squeeze() was dropped because
        # it also collapses the batch axis when the batch holds one sample
        # (or the feature axis when nhid == 1).
        x = x[:, -1, :]
        x = self.l_out(x)
        return x

## Initialize parameters, create optimizer and loss

In [137]:
def init_model(net, ctx, lr=LR, b1=BETA_1, b2=BETA_2, eps=EPS):
    """Initialise ``net`` on ``ctx`` and build its Adam trainer and loss.

    Parameters
    ----------
    net : gluon.Block
        Network whose parameters are initialised (Xavier) and optimised.
    ctx : mx.Context
        Device on which the parameters are allocated.
    lr, b1, b2, eps : float
        Adam learning rate, beta1, beta2 and epsilon.

    Returns
    -------
    (trainer, criterion)
        The ``gluon.Trainer`` and a ``SoftmaxCrossEntropyLoss`` instance.
    """
    net.initialize(mx.init.Xavier(), ctx=ctx)
    trainer = gluon.Trainer(
        net.collect_params(), 
        'adam',
        # Use the function arguments; previously b1/b2/eps were ignored in
        # favour of the module-level BETA_1/BETA_2/EPS constants.
        {'learning_rate': lr, 'beta1': b1, 'beta2': b2, 'epsilon': eps}
    )
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    return trainer, criterion

## Get data

In [138]:
%%time
# Data into format for library
x_train, x_test, y_train, y_test = imdb_for_library(seq_len=MAXLEN, max_features=MAXFEATURES)
# Torch-specific
x_train = x_train.astype(np.int64)
x_test = x_test.astype(np.int64)
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(x_train.dtype, x_test.dtype, y_train.dtype, y_test.dtype)

Preparing train set...
Preparing test set...
Trimming to 30000 max-features
Padding to length 150
(25000, 150) (25000, 150) (25000,) (25000,)
int64 int64 int64 int64
CPU times: user 5.63 s, sys: 248 ms, total: 5.88 s
Wall time: 5.87 s


## Initialize Model

In [139]:
# Run on one GPU: GPU device 0 is the context used for the model
# parameters and for every mini-batch created below.
ctx = mx.gpu(0)

In [140]:
%%time
# Build the network with the constructor's default hyper-parameters.
net = RNN()
# Initialise the parameters on ctx and get the optimiser / loss pair.
trainer, loss_fn = init_model(net, ctx)

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 3.47 ms


## Train Model

In [141]:
%%time
for i in range(EPOCHS):
    loss_acc = mx.nd.zeros((1), ctx)
    for j, (data, target) in enumerate(yield_mb(x_train, y_train, BATCHSIZE, shuffle=True)):
        # Get samples
        data = mx.nd.array(data, ctx=ctx)
        target = mx.nd.array(target, ctx=ctx)
        # Forwards
        with mx.autograd.record():
            output = net(data)
            loss = loss_fn(output, target)
        # Back-prop
        loss.backward()
        loss_acc += loss.mean()
        trainer.step(data.shape[0])
    print("Epoch [{}], loss: {:.4f}".format(i, loss_acc.asscalar()/(j+1)))

Epoch [0], loss: 0.4858
Epoch [1], loss: 0.2264
Epoch [2], loss: 0.1178
CPU times: user 15.2 s, sys: 3.01 s, total: 18.2 s
Wall time: 11.4 s


## Evaluate

In [142]:
%%time
# Main evaluation loop: 1.52s
n_samples = (y_test.shape[0]//BATCHSIZE)*BATCHSIZE
y_guess = mx.nd.zeros((n_samples), dtype=np.int)
y_truth = y_test[:n_samples]
c = 0
for data, target in yield_mb(x_test, y_test, BATCHSIZE):
    # Get samples
    data = mx.nd.array(data, ctx=ctx)
    target = mx.nd.array(target, ctx=ctx)
    # Forwards
    output = net(data)
    pred = output.topk(k=1).squeeze()
    # Collect results
    y_guess[c*BATCHSIZE:(c+1)*BATCHSIZE] = pred
    c += 1
mx.nd.waitall()

CPU times: user 9.34 s, sys: 16.2 s, total: 25.5 s
Wall time: 1.78 s


In [143]:
# Fraction of the scored test samples whose predicted class equals the label.
print("Accuracy: ", np.mean(y_guess.asnumpy() == y_truth))

Accuracy:  0.857892628205
