In [1]:
import tensorflow as tf
import time

# GPU options requesting a per-process GPU memory fraction of 1.
# NOTE(review): this object is not passed to the tf.ConfigProto created in the
# training cell of this notebook, so it appears to have no effect -- confirm.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)

In [2]:
import tarfile
import re
import urllib.request
import os
import random

class ImdbMovieReviews:
    """Iterable over the labelled training reviews of the IMDB sentiment archive.

    On construction the archive is downloaded to a local cache file unless that
    file already exists. Iterating yields ``(tokens, label)`` pairs: ``tokens``
    is a list of lower-cased tokens matched by ``TOKEN_REGEX`` and ``label`` is
    ``True`` for archive members under ``aclImdb/train/pos/`` and ``False`` for
    members under ``aclImdb/train/neg/``. Every other member is skipped.
    """

    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    # Local path the archive is cached at (a file path, not a directory).
    DEFAULT_CACHE_PATH = './imdb'
    # A token is a run of ASCII letters or one of the listed punctuation marks.
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    def __init__(self, cache_path=None, url=None):
        """Make sure the archive is available locally.

        Args:
            cache_path: Path of the cached archive file. Defaults to
                ``DEFAULT_CACHE_PATH``.
            url: Location to download the archive from when the cache file is
                missing. Defaults to ``DEFAULT_URL``.
        """
        cls = type(self)
        self._cache_dir = cls.DEFAULT_CACHE_PATH if cache_path is None else cache_path
        self._url = cls.DEFAULT_URL if url is None else url

        if not os.path.isfile(self._cache_dir):
            self._download(self._url, self._cache_dir)
        self.filepath = self._cache_dir

    @staticmethod
    def _download(url, destination):
        """Download ``url`` to ``destination`` via a temporary ``.part`` file.

        The cache check in ``__init__`` only tests that the file exists, so an
        interrupted download must never leave a truncated file at
        ``destination``; the finished file is moved into place at the end.
        """
        partial_path = destination + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, destination)
        finally:
            # Only present here when the download or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def __iter__(self):
        """Yield ``(tokens, is_positive)`` for every training review in the archive."""
        with tarfile.open(self.filepath) as archive:
            for filename in archive.getnames():
                if filename.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, filename), True
                elif filename.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, filename), False

    def _read(self, archive, filename):
        """Return the lower-cased tokens of archive member ``filename``."""
        with archive.extractfile(filename) as file_:
            text = file_.read().decode('utf-8')
        return [token.lower() for token in type(self).TOKEN_REGEX.findall(text)]

In [3]:
import numpy as np
# spaCy is my favourite NLP framework; it has built-in word embeddings trained on Wikipedia
from spacy.en import English

class Embedding:
    """Turns a sequence of tokens into a zero-padded matrix of word vectors."""

    def __init__(self):
        # spaCy makes using word vectors very easy: the Lexeme, Token, Span and
        # Doc classes all have a .vector property, which is a 1-dimensional
        # numpy array of 32-bit floats.
        self.parser = English()
        # Width of one word vector.
        # NOTE(review): assumed to match the vectors of the loaded model -- confirm.
        self.dimensions = 300

    def __call__(self, sequence, length=None):
        """Embed ``sequence`` into an array of shape ``(length, self.dimensions)``.

        Args:
            sequence: Tokens to look up in the parser's vocabulary.
            length: Number of rows in the result; rows past ``len(sequence)``
                stay zero. Defaults to ``len(sequence)`` (no padding).

        Returns:
            A numpy array whose first ``len(sequence)`` rows hold the word
            vectors, in order, and whose remaining rows are zero.

        Raises:
            ValueError: If ``sequence`` has more than ``length`` tokens.
        """
        sequence = list(sequence)
        if length is None:
            length = len(sequence)
        if len(sequence) > length:
            raise ValueError(
                'sequence of %d tokens does not fit into length %d'
                % (len(sequence), length))

        data = np.zeros((length, self.dimensions))
        # An empty list cannot be broadcast into a (0, dimensions) slice, so
        # the assignment is skipped when there is nothing to embed.
        if sequence:
            # Known words are looked up in the parser's vocabulary.
            data[:len(sequence)] = [self.parser.vocab[w].vector for w in sequence]
        return data

In [4]:
import itertools

def preprocess_batched_split(iterator, embedding, batch_size):
    """Yield batches of per-sentence embeddings without padding.

    Each document's token list is split into sentences at ``'.'`` tokens (the
    periods themselves are dropped) and every sentence is embedded at its own
    length via ``embedding(sentence, len(sentence))``.

    Args:
        iterator: Iterable of ``(tokens, label)`` pairs.
        embedding: Callable ``embedding(tokens, length)`` returning an array
            with ``length`` rows.
        batch_size: Maximum number of documents per batch.

    Yields:
        ``(batch, labels, words_per_sent_per_doc, sent_per_doc)`` where
        ``batch`` is a 1-D object array holding, per document, the list of
        sentence embeddings; ``labels`` is an int32 array;
        ``words_per_sent_per_doc`` is a 1-D object array holding, per document,
        the list of sentence lengths; ``sent_per_doc`` is an array with the
        number of sentences of each document. The last batch may contain fewer
        than ``batch_size`` documents.
    """
    iterator = iter(iterator)
    while True:
        # islice never raises StopIteration inside the generator body, which
        # Python 3.7+ would turn into a RuntimeError (PEP 479).
        chunk = list(itertools.islice(iterator, batch_size))
        if not chunk:
            return

        batch = []
        labels = []
        sentence_sizes_batch = []
        for text, label in chunk:
            sents = [list(group)
                     for is_period, group in itertools.groupby(text, lambda tok: tok == '.')
                     if not is_period]
            batch.append([embedding(sent, len(sent)) for sent in sents])
            labels.append(label)
            sentence_sizes_batch.append([len(sent) for sent in sents])

        labels_batch = np.array(labels, dtype=np.int32)
        sent_per_doc = np.array([len(sizes) for sizes in sentence_sizes_batch])
        yield (_as_object_array(batch), labels_batch,
               _as_object_array(sentence_sizes_batch), sent_per_doc)


def _as_object_array(items):
    """Wrap each element of ``items`` in a 1-D object array without broadcasting."""
    # np.array() on ragged nested lists is rejected by current numpy, and with
    # dtype=object it may still try to broadcast, so the slots are filled one by one.
    wrapped = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        wrapped[position] = item
    return wrapped

In [5]:
import itertools

def preprocess_batched_split2(iterator, embedding, batch_size):
    """Yield zero-padded 4-D batches of sentence-split, embedded documents.

    Each document's token list is split into sentences at ``'.'`` tokens (the
    periods themselves are dropped). Within a batch every document is padded
    to the largest sentence count and every sentence to the largest sentence
    length found in that batch.

    Args:
        iterator: Iterable of ``(tokens, label)`` pairs.
        embedding: Callable ``embedding(tokens, length)`` returning an array of
            shape ``(length, embedding_size)``. ``embedding_size`` is read from
            ``embedding.dimensions`` when that attribute exists, else 300.
        batch_size: Maximum number of documents per batch.

    Yields:
        ``(text_embed, labels, document_sizes, sentence_sizes)`` with shapes
        ``(batch, max_sentences, max_words, embedding_size)``, ``(batch,)``,
        ``(batch,)`` and ``(batch, max_sentences)``; the last three are int32.
        ``batch`` equals ``batch_size`` except possibly for the final batch.
    """
    iterator = iter(iterator)
    embedding_size = getattr(embedding, 'dimensions', 300)
    while True:
        chunk = list(itertools.islice(iterator, batch_size))
        if not chunk:
            # Input exhausted: stop instead of failing on zip(*[]).
            return
        batch, labels_b = zip(*chunk)
        # The final chunk may be short; size every array by what was read.
        actual_batch_size = len(batch)

        sents_b = [[list(group)
                    for is_period, group in itertools.groupby(doc, lambda tok: tok == '.')
                    if not is_period]
                   for doc in batch]

        sentence_sizes_b = [[len(sent) for sent in doc] for doc in sents_b]
        # default=0 covers batches where no document contains a sentence.
        sentence_size = max(
            (size for doc_sizes in sentence_sizes_b for size in doc_sizes),
            default=0)

        document_sizes = np.array([len(doc) for doc in sentence_sizes_b], dtype=np.int32)
        document_size = int(document_sizes.max())

        sentence_sizes_np = np.zeros(
            shape=[actual_batch_size, document_size], dtype=np.int32)
        text_embed_b = np.zeros(
            (actual_batch_size, document_size, sentence_size, embedding_size))
        # Filling sentence by sentence also handles documents with zero sentences.
        for doc_index, doc_sents in enumerate(sents_b):
            for sent_index, sent in enumerate(doc_sents):
                sentence_sizes_np[doc_index, sent_index] = len(sent)
                text_embed_b[doc_index, sent_index] = embedding(sent, sentence_size)

        yield (text_embed_b, np.array(labels_b, dtype=np.int32),
               document_sizes, sentence_sizes_np)

In [6]:
# Read every (tokens, label) pair into memory so the reviews can be shuffled.
reviews = list(ImdbMovieReviews())

In [7]:
# Shuffle the reviews in place.
# NOTE(review): no random seed is set in the visible cells, so the order (and
# therefore the batches) differs from run to run.
random.shuffle(reviews)

In [8]:
#################################################

In [9]:
# Enable IPython's autoreload extension in mode 1, which reloads only the
# modules registered through %aimport before code is executed.
%load_ext autoreload
%autoreload 1
%aimport HanSequenceLabellingModel, model_components
# A bare %aimport prints which modules are currently set to be auto-reloaded.
%aimport

In [10]:
# Generator of padded training batches of 10 documents each; it is consumed
# once by the training loop.
batches_split = preprocess_batched_split2(reviews, Embedding(), batch_size=10)

In [11]:
from HanSequenceLabellingModel import HanSequenceLabellingModel

In [12]:
def HAN_model_1(session, restore_only=False, checkpoint_dir='checkpoints'):
    """Build the Hierarchical Attention Network and load or initialise its variables.

    Args:
        session: TensorFlow session used to restore or initialise variables.
        restore_only: When true, raise instead of starting from fresh
            parameters if no checkpoint is found.
        checkpoint_dir: Directory searched for the latest checkpoint state.

    Returns:
        A ``(model, saver)`` tuple: the ``HanSequenceLabellingModel`` instance
        and a ``tf.train.Saver`` over all global variables.

    Raises:
        FileNotFoundError: If ``restore_only`` is true and ``checkpoint_dir``
            holds no checkpoint state.
    """
    import tensorflow as tf
    try:
        from tensorflow.contrib.rnn import MultiRNNCell
    except ImportError:
        MultiRNNCell = tf.nn.rnn_cell.MultiRNNCell
    from bn_lstm import BNLSTMCell

    is_training = tf.placeholder(dtype=tf.bool, name='is_training')

    # NOTE(review): the same cell object is repeated five times and then passed
    # as both word_cell and sentence_cell. Whether the layers end up sharing
    # weights depends on BNLSTMCell and the model implementation -- confirm.
    # Left as is because changing it may alter variable names and break
    # restoring existing checkpoints.
    cell = BNLSTMCell(80, is_training) # h-h batchnorm LSTMCell
    cell = MultiRNNCell([cell]*5)

    model = HanSequenceLabellingModel(
            embedding_size=300,
            classes=2,
            word_cell=cell,
            sentence_cell=cell,
            word_output_size=300,
            sentence_output_size=300,
            learning_rate=0.001,
            max_grad_norm=5.0,
            dropout_keep_proba=0.5,
            is_training=is_training,
    )

    saver = tf.train.Saver(tf.global_variables())
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint:
        print("Reading model parameters from %s" % checkpoint.model_checkpoint_path)
        saver.restore(session, checkpoint.model_checkpoint_path)
    elif restore_only:
        raise FileNotFoundError("Cannot restore model")
    else:
        print("Created model with fresh parameters")
        session.run(tf.global_variables_initializer())

    return model, saver

In [None]:
# Train the model on the prepared batches, logging every `eval_frequency`
# steps and saving a checkpoint every `checkpoint_frequency` steps.
tf.reset_default_graph()

config = tf.ConfigProto(allow_soft_placement=True)

tflog_dir = 'tf_logs'
checkpoint_path = 'checkpoints/checkpoint'
checkpoint_frequency = 100
eval_frequency = 1

with tf.Session(config=config) as s:
    model, saver = HAN_model_1(s)
    summary_writer = tf.summary.FileWriter(tflog_dir, graph=tf.get_default_graph())

    try:
        for data, labels_batch, sent_per_doc, words_per_sent_per_doc in batches_split:

            fd = {
                model.is_training: True,
                model.inputs_embedded: data,
                model.word_lengths: words_per_sent_per_doc,
                model.sentence_lengths: sent_per_doc,
                model.labels: labels_batch,
                # One weight per document actually present in this batch.
                model.sample_weights: np.ones(shape=(len(labels_batch),))
            }

            # time.clock() was removed in Python 3.8; perf_counter() measures
            # the elapsed wall-clock time of the step.
            t0 = time.perf_counter()
            step, summaries, loss, accuracy, _ = s.run([
                    model.global_step,
                    model.summary_op,
                    model.loss,
                    model.accuracy,
                    model.train_op,
            ], feed_dict=fd)
            td = time.perf_counter() - t0

            summary_writer.add_summary(summaries, global_step=step)

            if step % eval_frequency == 0:
                print('step %s, loss=%s, accuracy=%s, t=%s, inputs=%s' % (step, loss, accuracy, round(td, 2), fd[model.inputs_embedded].shape))
            if step != 0 and step % checkpoint_frequency == 0:
                print('checkpoint & graph meta')
                saver.save(s, checkpoint_path, global_step=step)
                print('checkpoint done')
    finally:
        # Flush pending summaries and release the event file, even when
        # training is interrupted.
        summary_writer.close()

Reading model parameters from checkpoints/checkpoint-2
INFO:tensorflow:Restoring parameters from checkpoints/checkpoint-2
step 3, loss=0.66333, accuracy=0.6, t=8.39, inputs=(10, 14, 51, 300)
step 4, loss=0.71693, accuracy=0.5, t=13.51, inputs=(10, 14, 137, 300)
step 5, loss=0.727271, accuracy=0.6, t=6.77, inputs=(10, 17, 57, 300)
step 6, loss=0.694925, accuracy=0.6, t=8.74, inputs=(10, 40, 51, 300)
step 7, loss=0.722953, accuracy=0.4, t=9.9, inputs=(10, 26, 83, 300)
step 8, loss=0.731163, accuracy=0.4, t=9.52, inputs=(10, 26, 79, 300)
step 9, loss=0.724944, accuracy=0.5, t=11.06, inputs=(10, 18, 105, 300)
step 10, loss=0.697851, accuracy=0.6, t=10.48, inputs=(10, 26, 89, 300)
step 11, loss=0.695504, accuracy=0.4, t=9.58, inputs=(10, 21, 86, 300)
step 12, loss=0.686652, accuracy=0.5, t=75.78, inputs=(10, 24, 665, 300)
step 13, loss=0.707082, accuracy=0.5, t=8.24, inputs=(10, 23, 67, 300)
step 14, loss=0.733422, accuracy=0.3, t=11.65, inputs=(10, 34, 89, 300)
step 15, loss=0.712962, accu

step 115, loss=0.652888, accuracy=0.5, t=11.89, inputs=(10, 27, 105, 300)
step 116, loss=0.611489, accuracy=0.7, t=10.29, inputs=(10, 28, 85, 300)
step 117, loss=0.878327, accuracy=0.3, t=10.85, inputs=(10, 29, 95, 300)
step 118, loss=0.61925, accuracy=0.6, t=8.68, inputs=(10, 36, 56, 300)
step 119, loss=0.665193, accuracy=0.9, t=8.6, inputs=(10, 28, 66, 300)
step 120, loss=0.714118, accuracy=0.7, t=9.6, inputs=(10, 20, 86, 300)
step 121, loss=0.53429, accuracy=0.9, t=5.39, inputs=(10, 13, 49, 300)
step 122, loss=0.565999, accuracy=0.8, t=12.78, inputs=(10, 13, 131, 300)
step 123, loss=0.658034, accuracy=0.5, t=6.07, inputs=(10, 15, 52, 300)
step 124, loss=0.625734, accuracy=0.6, t=14.25, inputs=(10, 28, 128, 300)
step 125, loss=0.551441, accuracy=0.9, t=9.7, inputs=(10, 30, 76, 300)
step 126, loss=0.623766, accuracy=0.7, t=11.69, inputs=(10, 40, 83, 300)
step 127, loss=0.609236, accuracy=0.7, t=16.38, inputs=(10, 38, 134, 300)
step 128, loss=0.812811, accuracy=0.5, t=9.78, inputs=(10,

step 228, loss=0.519465, accuracy=0.7, t=18.22, inputs=(10, 22, 183, 300)
step 229, loss=0.417252, accuracy=0.9, t=13.42, inputs=(10, 35, 112, 300)
step 230, loss=0.402421, accuracy=0.9, t=32.91, inputs=(10, 33, 295, 300)
step 231, loss=0.579417, accuracy=0.7, t=8.3, inputs=(10, 27, 63, 300)
step 232, loss=0.493153, accuracy=0.8, t=13.61, inputs=(10, 30, 121, 300)
step 233, loss=0.493715, accuracy=0.7, t=10.43, inputs=(10, 36, 76, 300)
step 234, loss=0.769414, accuracy=0.4, t=10.27, inputs=(10, 52, 53, 300)
step 235, loss=0.549779, accuracy=0.7, t=12.01, inputs=(10, 26, 108, 300)
step 236, loss=0.436335, accuracy=0.8, t=10.11, inputs=(10, 27, 85, 300)
step 237, loss=0.407858, accuracy=0.7, t=9.88, inputs=(10, 39, 67, 300)
step 238, loss=0.352741, accuracy=0.9, t=11.84, inputs=(10, 21, 112, 300)
step 239, loss=0.725988, accuracy=0.7, t=7.71, inputs=(10, 19, 67, 300)
step 240, loss=0.42545, accuracy=0.9, t=14.15, inputs=(10, 39, 114, 300)
step 241, loss=0.509172, accuracy=0.8, t=9.72, in

step 341, loss=0.296343, accuracy=0.9, t=7.62, inputs=(10, 23, 63, 300)
step 342, loss=0.539911, accuracy=0.8, t=7.54, inputs=(10, 17, 69, 300)
step 343, loss=1.02237, accuracy=0.6, t=11.35, inputs=(10, 30, 100, 300)
step 344, loss=0.78774, accuracy=0.6, t=14.26, inputs=(10, 24, 142, 300)
step 345, loss=0.702211, accuracy=0.5, t=17.65, inputs=(10, 53, 140, 300)
step 346, loss=0.664335, accuracy=0.5, t=39.76, inputs=(10, 37, 357, 300)
step 347, loss=0.261919, accuracy=0.9, t=14.55, inputs=(10, 28, 139, 300)
step 348, loss=0.64516, accuracy=0.6, t=8.55, inputs=(10, 30, 67, 300)
step 349, loss=0.522688, accuracy=0.8, t=8.96, inputs=(10, 37, 59, 300)
step 350, loss=0.439989, accuracy=0.8, t=6.48, inputs=(10, 10, 66, 300)
step 351, loss=0.352842, accuracy=0.8, t=17.48, inputs=(10, 28, 170, 300)
step 352, loss=0.506712, accuracy=0.8, t=7.92, inputs=(10, 14, 78, 300)
step 353, loss=0.546981, accuracy=0.7, t=11.59, inputs=(10, 30, 100, 300)
step 354, loss=0.639411, accuracy=0.9, t=7.06, inputs

step 453, loss=0.454296, accuracy=0.7, t=6.52, inputs=(10, 19, 57, 300)
step 454, loss=0.44785, accuracy=0.8, t=11.28, inputs=(10, 30, 99, 300)
step 455, loss=0.482839, accuracy=0.8, t=9.97, inputs=(10, 21, 96, 300)
step 456, loss=0.244192, accuracy=1.0, t=7.35, inputs=(10, 33, 55, 300)
step 457, loss=0.400492, accuracy=0.8, t=12.43, inputs=(10, 35, 113, 300)
step 458, loss=0.424927, accuracy=0.8, t=23.19, inputs=(10, 19, 252, 300)
step 459, loss=0.944612, accuracy=0.7, t=13.24, inputs=(10, 22, 132, 300)
step 460, loss=0.502452, accuracy=0.8, t=9.47, inputs=(10, 19, 91, 300)
step 461, loss=0.576609, accuracy=0.8, t=6.48, inputs=(10, 14, 62, 300)
step 462, loss=0.438964, accuracy=0.7, t=11.11, inputs=(10, 33, 92, 300)
step 463, loss=0.759471, accuracy=0.5, t=7.42, inputs=(10, 11, 77, 300)
step 464, loss=0.49237, accuracy=0.7, t=11.24, inputs=(10, 49, 71, 300)
step 465, loss=0.321426, accuracy=0.9, t=14.19, inputs=(10, 46, 106, 300)
step 466, loss=0.341114, accuracy=0.8, t=7.56, inputs=(

step 566, loss=0.46548, accuracy=0.8, t=10.18, inputs=(10, 24, 94, 300)
step 567, loss=0.401274, accuracy=0.9, t=8.17, inputs=(10, 30, 60, 300)
step 568, loss=0.443411, accuracy=0.7, t=7.68, inputs=(10, 29, 58, 300)
step 569, loss=0.309873, accuracy=0.8, t=12.7, inputs=(10, 30, 116, 300)
step 570, loss=0.489408, accuracy=0.8, t=12.41, inputs=(10, 31, 111, 300)
step 571, loss=0.201368, accuracy=1.0, t=8.44, inputs=(10, 24, 72, 300)
step 572, loss=0.440461, accuracy=0.7, t=8.47, inputs=(10, 32, 63, 300)
step 573, loss=0.348998, accuracy=0.8, t=14.63, inputs=(10, 45, 117, 300)
step 574, loss=0.572072, accuracy=0.7, t=12.1, inputs=(10, 38, 98, 300)
step 575, loss=0.389083, accuracy=0.9, t=12.94, inputs=(10, 41, 110, 300)
step 576, loss=0.777422, accuracy=0.6, t=9.56, inputs=(10, 43, 61, 300)
step 577, loss=0.654523, accuracy=0.6, t=12.34, inputs=(10, 55, 75, 300)
step 578, loss=0.650274, accuracy=0.7, t=7.6, inputs=(10, 21, 67, 300)
step 579, loss=0.35448, accuracy=0.7, t=11.46, inputs=(10

step 679, loss=0.389403, accuracy=0.8, t=6.67, inputs=(10, 14, 63, 300)
step 680, loss=0.151695, accuracy=1.0, t=5.98, inputs=(10, 21, 46, 300)
step 681, loss=0.147311, accuracy=1.0, t=9.3, inputs=(10, 29, 77, 300)
step 682, loss=0.289293, accuracy=0.9, t=11.46, inputs=(10, 37, 91, 300)
step 683, loss=0.414005, accuracy=0.8, t=11.85, inputs=(10, 44, 84, 300)
step 684, loss=0.622377, accuracy=0.7, t=10.85, inputs=(10, 22, 100, 300)
step 685, loss=0.269805, accuracy=0.9, t=8.27, inputs=(10, 18, 76, 300)
step 686, loss=0.249357, accuracy=0.9, t=9.19, inputs=(10, 28, 77, 300)
step 687, loss=0.528613, accuracy=0.6, t=12.0, inputs=(10, 38, 93, 300)
step 688, loss=0.385148, accuracy=0.8, t=16.07, inputs=(10, 31, 149, 300)
step 689, loss=0.538464, accuracy=0.6, t=13.73, inputs=(10, 37, 114, 300)
step 690, loss=0.601713, accuracy=0.8, t=5.37, inputs=(10, 16, 46, 300)
step 691, loss=0.257373, accuracy=0.9, t=13.93, inputs=(10, 29, 129, 300)
step 692, loss=0.241262, accuracy=0.9, t=8.84, inputs=(

step 792, loss=0.351238, accuracy=0.8, t=9.76, inputs=(10, 24, 86, 300)
step 793, loss=0.172818, accuracy=1.0, t=11.52, inputs=(10, 14, 122, 300)
step 794, loss=0.231786, accuracy=0.9, t=11.56, inputs=(10, 26, 111, 300)
step 795, loss=0.333404, accuracy=0.7, t=7.03, inputs=(10, 18, 63, 300)
step 796, loss=0.294379, accuracy=0.9, t=10.57, inputs=(10, 27, 91, 300)
step 797, loss=0.374888, accuracy=0.9, t=10.1, inputs=(10, 18, 98, 300)
step 798, loss=0.573553, accuracy=0.7, t=7.28, inputs=(10, 14, 71, 300)
step 799, loss=0.327221, accuracy=0.9, t=8.81, inputs=(10, 25, 75, 300)
step 800, loss=0.248521, accuracy=1.0, t=15.36, inputs=(10, 35, 136, 300)
checkpoint & graph meta
checkpoint done
step 801, loss=0.484035, accuracy=0.8, t=14.42, inputs=(10, 17, 149, 300)
step 802, loss=0.130657, accuracy=1.0, t=8.98, inputs=(10, 26, 77, 300)
step 803, loss=0.39171, accuracy=0.8, t=8.1, inputs=(10, 13, 81, 300)
step 804, loss=0.32757, accuracy=0.8, t=14.52, inputs=(10, 16, 152, 300)
step 805, loss=0

step 904, loss=0.318745, accuracy=0.9, t=10.53, inputs=(10, 18, 104, 300)
step 905, loss=0.346245, accuracy=0.8, t=8.62, inputs=(10, 41, 51, 300)
step 906, loss=0.260983, accuracy=0.9, t=9.73, inputs=(10, 32, 80, 300)
step 907, loss=0.159693, accuracy=0.9, t=11.18, inputs=(10, 14, 116, 300)
step 908, loss=0.326266, accuracy=0.9, t=18.56, inputs=(10, 19, 198, 300)
step 909, loss=1.26264, accuracy=0.5, t=10.21, inputs=(10, 28, 89, 300)
step 910, loss=0.345577, accuracy=0.9, t=10.18, inputs=(10, 16, 103, 300)
step 911, loss=0.284656, accuracy=0.9, t=7.38, inputs=(10, 20, 65, 300)
step 912, loss=0.277451, accuracy=0.9, t=8.91, inputs=(10, 18, 86, 300)
step 913, loss=0.216745, accuracy=0.9, t=9.37, inputs=(10, 17, 90, 300)
step 914, loss=0.250555, accuracy=1.0, t=11.01, inputs=(10, 49, 68, 300)
step 915, loss=0.255328, accuracy=0.9, t=10.65, inputs=(10, 16, 107, 300)
step 916, loss=0.333381, accuracy=1.0, t=7.26, inputs=(10, 21, 62, 300)
step 917, loss=0.321022, accuracy=0.8, t=7.02, inputs

step 1017, loss=0.469026, accuracy=0.8, t=10.05, inputs=(10, 21, 98, 300)
step 1018, loss=0.296069, accuracy=0.8, t=20.56, inputs=(10, 29, 206, 300)
step 1019, loss=0.392758, accuracy=0.9, t=11.94, inputs=(10, 34, 105, 300)
step 1020, loss=1.00181, accuracy=0.4, t=12.57, inputs=(10, 24, 123, 300)
step 1021, loss=0.298795, accuracy=0.9, t=9.13, inputs=(10, 20, 86, 300)
step 1022, loss=0.255965, accuracy=0.9, t=6.82, inputs=(10, 23, 55, 300)
step 1023, loss=0.706529, accuracy=0.7, t=9.29, inputs=(10, 31, 74, 300)
step 1024, loss=0.340729, accuracy=0.8, t=12.92, inputs=(10, 44, 99, 300)
step 1025, loss=0.192278, accuracy=0.9, t=10.1, inputs=(10, 17, 101, 300)
step 1026, loss=0.274636, accuracy=0.9, t=17.16, inputs=(10, 30, 167, 300)
step 1027, loss=0.381084, accuracy=0.9, t=11.72, inputs=(10, 34, 97, 300)
step 1028, loss=0.464091, accuracy=0.7, t=16.21, inputs=(10, 29, 160, 300)
step 1029, loss=0.408968, accuracy=0.8, t=12.31, inputs=(10, 42, 94, 300)
step 1030, loss=0.719035, accuracy=0.