In [2]:
import tarfile
import re
import urllib.request
import os
import random

class ImdbMovieReviews:
    """Iterable over the labelled training reviews in the Stanford IMDB archive.

    Iterating yields ``(tokens, label)`` pairs, where ``tokens`` is a list of
    lower-cased tokens matched by ``TOKEN_REGEX`` and ``label`` is ``True`` for
    members under ``aclImdb/train/pos/`` and ``False`` for members under
    ``aclImdb/train/neg/``. All other archive members are ignored.

    Args:
        cache_dir: Path of the local archive *file* (despite the historical
            name). Defaults to ``DEFAULT_CACHE_PATH``. If no file exists at
            that path the archive is downloaded there.
        url: Download location. Defaults to ``DEFAULT_URL``.
    """

    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    DEFAULT_CACHE_PATH = './imdb'
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')
    _POSITIVE_PREFIX = 'aclImdb/train/pos/'
    _NEGATIVE_PREFIX = 'aclImdb/train/neg/'

    def __init__(self, cache_dir=None, url=None):
        self._cache_dir = cache_dir or type(self).DEFAULT_CACHE_PATH
        self._url = url or type(self).DEFAULT_URL

        if not os.path.isfile(self._cache_dir):
            # Download to a temporary name and rename on success, so that an
            # interrupted download never leaves a truncated archive that would
            # pass the isfile() check on the next run.
            partial_path = self._cache_dir + '.part'
            urllib.request.urlretrieve(self._url, partial_path)
            os.replace(partial_path, self._cache_dir)
        self.filepath = self._cache_dir

    def __iter__(self):
        with tarfile.open(self.filepath) as archive:
            for member in archive.getmembers():
                # extractfile() returns None for directories and other
                # non-regular members, so skip them up front.
                if not member.isfile():
                    continue
                if member.name.startswith(self._POSITIVE_PREFIX):
                    yield self._read(archive, member.name), True
                elif member.name.startswith(self._NEGATIVE_PREFIX):
                    yield self._read(archive, member.name), False

    def _read(self, archive, filename):
        """Return the lower-cased tokens of archive member ``filename``."""
        with archive.extractfile(filename) as file_:
            text = file_.read().decode('utf-8')
        return [token.lower() for token in type(self).TOKEN_REGEX.findall(text)]

In [3]:
import numpy as np
# spaCy is my favourite NLP framework; it has built-in word embeddings trained on Wikipedia.
from spacy.en import English

class Embedding:
    """Map a token sequence to a fixed-size matrix of word vectors.

    Vectors are looked up through ``self.parser.vocab[token].vector``. The
    result always has shape ``(length, dimensions)``: shorter sequences are
    zero-padded at the end, longer sequences are truncated to ``length``.

    Args:
        length: Number of rows (time steps) in every returned matrix.
        dimensions: Width of one word vector. Defaults to 300, the value this
            class previously hard-coded.
    """

    def __init__(self, length, dimensions=300):
        self.parser = English()
        self._length = length
        self.dimensions = dimensions

    def __call__(self, sequence):
        """Return the zero-padded ``(length, dimensions)`` embedding of ``sequence``."""
        data = np.zeros((self._length, self.dimensions))
        # Truncate so an over-long input cannot overflow the fixed-size buffer.
        tokens = list(sequence)[:self._length]
        # Assigning an empty list into a (0, dimensions) slice raises a
        # broadcasting error, so only write when there is something to write.
        if tokens:
            data[:len(tokens)] = [self.parser.vocab[w].vector for w in tokens]
        return data

In [18]:
from lazy import lazy

class SequenceClassificationModel:
    """Recurrent sequence classifier with an attention readout (TensorFlow 1.x graph API).

    The graph is assembled from the ``@lazy`` properties ``length``,
    ``prediction``, ``cost``, ``error`` and ``optimize``; ``__init__`` touches
    each of them once so that every variable exists before the session
    initialises them.

    ``params`` attributes read by this class: ``seq_length``, ``embed_length``,
    ``rnn_cell``, ``rnn_hidden``, ``optimizer`` and ``gradient_clipping``.

    Args:
        data: Unused; kept so existing call sites keep working. Inputs are fed
            through the ``self.data`` placeholder instead.
        params: Attribute-style hyper-parameter container (see above).
    """

    def __init__(self, data, params):
        self.params = params
        self.cpnt_path = 'att_checkpoints'
        self.attention_size = 500
        self._create_placeholders()
        # Access the lazy properties to build the graph before initialisation.
        self.prediction
        self.cost
        self.error
        self.optimize
        self.global_step = 0
        self._create_summaries()
        # One Saver for the model's lifetime: building a new Saver on every
        # train() call would keep adding save/restore ops to the graph.
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def _create_placeholders(self):
        """Create the input (embedded sequences) and target (one-hot) placeholders."""
        with tf.name_scope("data"):
            self.data = tf.placeholder(tf.float32, [None, self.params.seq_length, self.params.embed_length])
            self.target = tf.placeholder(tf.float32, [None, 2])

    def _create_summaries(self):
        """Register scalar summaries for loss and error and merge them."""
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss', self.cost)
            tf.summary.scalar('error', self.error)
            self.summary = tf.summary.merge_all()

    @lazy
    def length(self):
        """Per-example count of time steps whose embedding is not all zeros (int32)."""
        with tf.name_scope("seq_length"):
            used = tf.sign(tf.reduce_max(tf.abs(self.data), axis=2))
            length = tf.reduce_sum(used, axis=1)
            length = tf.cast(length, tf.int32)
        return length

    @lazy
    def prediction(self):
        """Softmax class probabilities computed from the attention-pooled RNN output."""
        with tf.name_scope("recurrent_layer"):
            rnn_output, _ = tf.nn.dynamic_rnn(
                self.params.rnn_cell(self.params.rnn_hidden),
                self.data,
                dtype=tf.float32,
                sequence_length=self.length
            )

        with tf.name_scope("attention"):
            # D: width of the RNN output, taken from the tensor itself so the
            # attention and softmax weights always match the cell's output.
            hidden_size = rnn_output.shape[2].value

            # Trainable parameters
            W_omega = tf.Variable(tf.random_normal([hidden_size, self.attention_size], stddev=0.1))
            b_omega = tf.Variable(tf.random_normal([self.attention_size], stddev=0.1))
            u_omega = tf.Variable(tf.random_normal([self.attention_size], stddev=0.1))

            # Fully connected layer with tanh applied to each of the B*T time steps:
            # (B,T,D) x (D,A) -> (B,T,A), where A = attention_size.
            v = tf.tanh(tf.tensordot(rnn_output, W_omega, axes=1) + b_omega)
            # Each time step's A-vector is reduced to a scalar score with u_omega.
            vu = tf.tensordot(v, u_omega, axes=1)   # (B,T)
            # NOTE(review): the softmax runs over all T steps, including padded
            # ones, so padding receives attention mass — consider masking with
            # self.length. Left unchanged to keep existing checkpoints comparable.
            alphas = tf.nn.softmax(vu)              # (B,T)

            # Attention-weighted sum over time: (B,T,D) -> (B,D).
            last = tf.reduce_sum(rnn_output * tf.expand_dims(alphas, -1), 1)

        with tf.name_scope("softmax_layer"):
            num_classes = int(self.target.get_shape()[1])
            weight = tf.Variable(tf.truncated_normal(
                [hidden_size, num_classes], stddev=0.01))
            bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
            prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy
    def cost(self):
        """Summed cross-entropy between the one-hot target and the prediction."""
        # Clip before the log so a probability that underflows to exactly 0
        # yields a large finite loss instead of NaN.
        safe_prediction = tf.clip_by_value(self.prediction, 1e-10, 1.0)
        cross_entropy = -tf.reduce_sum(self.target * tf.log(safe_prediction))
        return cross_entropy

    @lazy
    def error(self):
        """Fraction of examples in the batch whose arg-max class is wrong."""
        self.mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(self.mistakes, tf.float32))

    @lazy
    def optimize(self):
        """Training op: optimizer step on the cost with optional value clipping of gradients."""
        with tf.name_scope("optimization"):
            gradient = self.params.optimizer.compute_gradients(self.cost)
            if self.params.gradient_clipping:
                limit = self.params.gradient_clipping
                gradient = [
                    (tf.clip_by_value(g, -limit, limit), v)
                    if g is not None else (None, v)
                    for g, v in gradient]
            optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize

    def train(self, batches, save_prefix, save_every=10):
        """Run one optimisation step per batch, logging and checkpointing periodically.

        Args:
            batches: Iterable of ``(data, target)`` pairs fed to the placeholders.
            save_prefix: File-name prefix for checkpoints inside ``self.cpnt_path``.
            save_every: Write a summary and a checkpoint every this many batches.
        """
        # Resume only when a checkpoint actually exists: latest_checkpoint()
        # returns None for an empty directory and restore() would fail on it.
        latest = None
        if os.path.isdir(self.cpnt_path):
            latest = tf.train.latest_checkpoint(self.cpnt_path)
        else:
            os.makedirs(self.cpnt_path)
        if latest is not None:
            self.saver.restore(self.sess, latest)

        summary_path = os.path.join('att_graphs', 'run{}'.format(self.global_step))
        summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph)
        self.global_step += 1
        for index, batch in enumerate(batches):
            # Use this instance's tensors, not a notebook-level `model` global.
            feed = {self.data: batch[0], self.target: batch[1]}
            error, _, summary_str = self.sess.run([self.error, self.optimize, self.summary], feed)
            print('{}: {:3.1f}%'.format(index + 1, 100 * error))
            if index % save_every == 0:
                summary_writer.add_summary(summary_str, index)
                summary_writer.flush()
                save_path = os.path.join(self.cpnt_path, save_prefix)
                print('saving...', save_path)
                self.saver.save(self.sess, save_path, global_step=index)

        self.saver.save(self.sess, os.path.join(self.cpnt_path, save_prefix + '_final'))

    def predict_proba(self, data):
        """Return ``[probabilities]`` for ``data``.

        The result is a one-element list (the shape ``Session.run`` returns for
        a one-element fetch list); this is kept for backward compatibility.
        """
        feed = {self.data: data, }
        prediction = self.sess.run([self.prediction], feed)
        return prediction

    def close(self):
        """Close the session, then clear the default graph."""
        # Close first: resetting the default graph while a session is still
        # active is documented as undefined behaviour.
        self.sess.close()
        tf.reset_default_graph()

In [5]:
def preprocess_batched(iterator, length, embedding, batch_size):
    """Yield ``(data, target)`` batches of embedded reviews until the input runs out.

    Args:
        iterator: Iterable of ``(tokens, label)`` pairs.
        length: Number of time steps per example (second axis of ``data``).
        embedding: Callable mapping ``tokens`` to a ``(length, embedding.dimensions)``
            array; must expose a ``dimensions`` attribute.
        batch_size: Number of examples per batch.

    Yields:
        ``data`` of shape ``(batch_size, length, embedding.dimensions)`` and
        ``target`` of shape ``(batch_size, 2)``, one-hot encoded as ``[1, 0]``
        for a truthy label and ``[0, 1]`` otherwise. A trailing incomplete
        batch is dropped.
    """
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            try:
                text, label = next(iterator)
            except StopIteration:
                # Under PEP 479 (Python 3.7+) a StopIteration escaping a
                # generator body becomes RuntimeError, so end explicitly.
                return
            data[index] = embedding(text)
            target[index] = [1, 0] if label else [0, 1]
        yield data, target

In [6]:
# Materialise every (tokens, label) pair so the data can be shuffled and measured.
reviews = [sample for sample in ImdbMovieReviews()]

In [7]:
# Shuffle with a fixed seed so the batch order (and therefore the training log)
# is reproducible across kernel restarts.
random.Random(0).shuffle(reviews)

In [8]:
# Longest review, in tokens; every embedded review is padded to this many steps.
length = max(len(tokens) for tokens, _label in reviews)
embedding = Embedding(length)

In [9]:
from attrdict import AttrDict

# Hyper-parameters read by SequenceClassificationModel and by the batching cell.
params = AttrDict(
    # Cell class; the model instantiates it as rnn_cell(rnn_hidden).
    rnn_cell=tf.contrib.rnn.GRUCell,
    # Argument passed to rnn_cell; also sizes the attention and softmax weights.
    rnn_hidden=300,
    # Optimizer instance used for compute_gradients / apply_gradients.
    optimizer=tf.train.RMSPropOptimizer(0.002),
    # Number of reviews per batch produced by preprocess_batched.
    batch_size=20,
    # Gradients are clipped element-wise to [-limit, limit]; a falsy value disables clipping.
    gradient_clipping=100,
    # Second and third dimensions of the model's input placeholder.
    seq_length=length,
    embed_length=embedding.dimensions
)

In [19]:
# Lazy generator of (data, target) batches; it is consumed once by training.
batches = preprocess_batched(
    iterator=reviews,
    length=length,
    embedding=embedding,
    batch_size=params.batch_size,
)

In [20]:
# Start from an empty default graph so re-running this cell does not pile up ops.
tf.reset_default_graph()

# The constructor never reads its first argument (inputs are fed through
# placeholders), so pass None instead of a `data` name that no visible cell defines.
model = SequenceClassificationModel(None, params)

In [21]:
# Train on every batch; checkpoints are written under the model's checkpoint directory.
model.train(batches=batches, save_prefix='simple-rnn-attention')

1: 50.0%
saving... att_checkpoints/simple-rnn-attention
2: 60.0%
3: 50.0%
4: 50.0%
5: 50.0%
6: 50.0%
7: 60.0%
8: 55.0%
9: 55.0%
10: 35.0%
11: 65.0%
saving... att_checkpoints/simple-rnn-attention
12: 55.0%
13: 35.0%
14: 25.0%
15: 55.0%
16: 45.0%
17: 55.0%
18: 35.0%
19: 60.0%
20: 40.0%
21: 40.0%
saving... att_checkpoints/simple-rnn-attention
22: 60.0%
23: 50.0%
24: 40.0%
25: 70.0%
26: 40.0%
27: 45.0%
28: 55.0%
29: 55.0%
30: 50.0%
31: 50.0%
saving... att_checkpoints/simple-rnn-attention
32: 45.0%
33: 50.0%
34: 40.0%
35: 55.0%
36: 45.0%
37: 45.0%
38: 60.0%
39: 35.0%
40: 55.0%
41: 50.0%
saving... att_checkpoints/simple-rnn-attention
42: 30.0%
43: 50.0%
44: 50.0%
45: 50.0%
46: 35.0%
47: 60.0%
48: 55.0%
49: 55.0%
50: 65.0%
51: 35.0%
saving... att_checkpoints/simple-rnn-attention
52: 50.0%
53: 40.0%
54: 50.0%
55: 60.0%
56: 60.0%
57: 50.0%
58: 55.0%
59: 50.0%
60: 65.0%
61: 65.0%
saving... att_checkpoints/simple-rnn-attention
62: 65.0%
63: 60.0%
64: 55.0%
65: 55.0%
66: 40.0%
67: 60.0%
68: 60.0%


530: 60.0%
531: 40.0%
saving... att_checkpoints/simple-rnn-attention
532: 30.0%
533: 35.0%
534: 55.0%
535: 25.0%
536: 50.0%
537: 50.0%
538: 25.0%
539: 45.0%
540: 45.0%
541: 25.0%
saving... att_checkpoints/simple-rnn-attention
542: 15.0%
543: 55.0%
544: 50.0%
545: 55.0%
546: 20.0%
547: 35.0%
548: 35.0%
549: 45.0%
550: 50.0%
551: 25.0%
saving... att_checkpoints/simple-rnn-attention
552: 40.0%
553: 30.0%
554: 10.0%
555: 25.0%
556: 40.0%
557: 35.0%
558: 20.0%
559: 30.0%
560: 55.0%
561: 25.0%
saving... att_checkpoints/simple-rnn-attention
562: 35.0%
563: 40.0%
564: 20.0%
565: 35.0%
566: 35.0%
567: 55.0%
568: 50.0%
569: 45.0%
570: 40.0%
571: 40.0%
saving... att_checkpoints/simple-rnn-attention
572: 50.0%
573: 25.0%
574: 25.0%
575: 15.0%
576: 45.0%
577: 50.0%
578: 20.0%
579: 35.0%
580: 40.0%
581: 30.0%
saving... att_checkpoints/simple-rnn-attention
582: 45.0%
583: 55.0%
584: 15.0%
585: 35.0%
586: 30.0%
587: 30.0%
588: 45.0%
589: 60.0%
590: 35.0%
591: 15.0%
saving... att_checkpoints/simple-rnn

1052: 20.0%
1053: 15.0%
1054: 5.0%
1055: 5.0%
1056: 25.0%
1057: 10.0%
1058: 15.0%
1059: 15.0%
1060: 30.0%
1061: 10.0%
saving... att_checkpoints/simple-rnn-attention
1062: 15.0%
1063: 30.0%
1064: 35.0%
1065: 20.0%
1066: 20.0%
1067: 5.0%
1068: 10.0%
1069: 10.0%
1070: 15.0%
1071: 5.0%
saving... att_checkpoints/simple-rnn-attention
1072: 20.0%
1073: 15.0%
1074: 20.0%
1075: 20.0%
1076: 30.0%
1077: 20.0%
1078: 30.0%
1079: 30.0%
1080: 15.0%
1081: 5.0%
saving... att_checkpoints/simple-rnn-attention
1082: 15.0%
1083: 20.0%
1084: 15.0%
1085: 15.0%
1086: 15.0%
1087: 0.0%
1088: 10.0%
1089: 5.0%
1090: 25.0%
1091: 15.0%
saving... att_checkpoints/simple-rnn-attention
1092: 15.0%
1093: 15.0%
1094: 5.0%
1095: 5.0%
1096: 10.0%
1097: 25.0%
1098: 25.0%
1099: 10.0%
1100: 20.0%
1101: 20.0%
saving... att_checkpoints/simple-rnn-attention
1102: 20.0%
1103: 35.0%
1104: 10.0%
1105: 10.0%
1106: 15.0%
1107: 10.0%
1108: 15.0%
1109: 25.0%
1110: 10.0%
1111: 10.0%
saving... att_checkpoints/simple-rnn-attention
1112: 1

