In [1]:
import tensorflow as tf


def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """
    Attention mechanism layer which reduces RNN/Bi-RNN outputs with Attention vector.

    The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
     for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
    Variables notation is also inherited from the article
    
    Args:
        inputs: The Attention inputs.
            Matches outputs of RNN/Bi-RNN layer (not final state):
                In case of RNN, this must be RNN outputs `Tensor`:
                    If time_major == False (default), this must be a tensor of shape:
                        `[batch_size, max_time, cell.output_size]`.
                    If time_major == True, this must be a tensor of shape:
                        `[max_time, batch_size, cell.output_size]`.
                In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing the forward and
                the backward RNN outputs `Tensor`.
                    If time_major == False (default),
                        outputs_fw is a `Tensor` shaped:
                        `[batch_size, max_time, cell_fw.output_size]`
                        and outputs_bw is a `Tensor` shaped:
                        `[batch_size, max_time, cell_bw.output_size]`.
                    If time_major == True,
                        outputs_fw is a `Tensor` shaped:
                        `[max_time, batch_size, cell_fw.output_size]`
                        and outputs_bw is a `Tensor` shaped:
                        `[max_time, batch_size, cell_bw.output_size]`.
        attention_size: Linear size of the Attention weights.
        time_major: The shape format of the `inputs` Tensors.
            If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
            If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
            Using `time_major = True` is a bit more efficient because it avoids
            transposes at the beginning and end of the RNN calculation.  However,
            most TensorFlow data is batch-major, so by default this function
            accepts input and emits output in batch-major form.
        return_alphas: Whether to return attention coefficients variable along with layer's output.
            Used for visualization purpose.
    Returns:
        The Attention output `Tensor`.
        In case of RNN, this will be a `Tensor` shaped:
            `[batch_size, cell.output_size]`.
        In case of Bidirectional RNN, this will be a `Tensor` shaped:
            `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
    """

    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.array_ops.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')         # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas


import numpy as np


def zero_pad(X, seq_len):
    return np.array([x[:seq_len - 1] + [0] * max(seq_len - len(x), 1) for x in X])


def get_vocabulary_size(X):
    return max([max(x) for x in X]) + 1  # plus the 0th word


def fit_in_vocabulary(X, voc_size):
    return [[w for w in x if w < voc_size] for x in X]


def batch_generator(X, y, batch_size):
    """Primitive batch generator 
    """
    size = X.shape[0]
    X_copy = X.copy()
    y_copy = y.copy()
    indices = np.arange(size)
    np.random.shuffle(indices)
    X_copy = X_copy[indices]
    y_copy = y_copy[indices]
    i = 0
    while True:
        if i + batch_size <= size:
            yield X_copy[i:i + batch_size], y_copy[i:i + batch_size]
            i += batch_size
        else:
            i = 0
            indices = np.arange(size)
            np.random.shuffle(indices)
            X_copy = X_copy[indices]
            y_copy = y_copy[indices]
            continue



from keras.datasets import imdb
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
NUM_WORDS = 10000
INDEX_FROM = 3
SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
HIDDEN_SIZE = 150
ATTENTION_SIZE = 50
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # Model easily overfits without pre-trained words embeddings, that's why train for a few epochs
DELTA = 0.5
MODEL_PATH = r'.\deep_learn\tf-rnn-attention\model'

# Load the data set
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
# Sequences pre-processing
vocabulary_size = get_vocabulary_size(X_train)
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

# Different placeholders
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    print('embeddings_var',embeddings_var) # (10000, 100)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)
    print('batch_embedded', batch_embedded) # (?, 250, 100)

# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(150), GRUCell(HIDDEN_SIZE),
# rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(GRUCell(150), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs) # (?, 250, 150)
print('rnn_outputs--------------',rnn_outputs)
# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    print('attention_output-----------',attention_output)# (?, 300)
    tf.summary.histogram('alphas', alphas)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)
print('drop-------------------', drop)
# Fully connected layer
with tf.name_scope('Fully_connected_layer'):
    W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE * 2, 1], stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
    b = tf.Variable(tf.constant(0., shape=[1]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    y_hat = tf.squeeze(y_hat)
    tf.summary.histogram('W', W)

with tf.name_scope('Metrics'):
    # Cross-entropy loss and optimizer initialization
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

    # Accuracy metric
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
embeddings_var <tf.Variable 'Embedding_layer/Variable:0' shape=(10000, 100) dtype=float32_ref>
batch_embedded Tensor("Embedding_layer/embedding_lookup/Identity:0", shape=(?, 250, 100), dtype=float32)
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
rnn_outputs-------------- (<tf.Tensor 'bidirectional_rnn/fw/fw/transpose_1:0' shape=(?, 250, 150) dtype=float32>, <tf.Tensor 'ReverseSequence:0' shape=(?, 250, 150) dtype=float32>)
attention_output----------- Tensor("Attention_layer/Sum:0", shape=(?, 300), dtype=float32)
Instructions for updating:
Please use `rate` instead of `kee

In [2]:
merged = tf.summary.merge_all()

# Batch generators
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)

train_writer = tf.summary.FileWriter(r'.\deep_learn\tf-rnn-attention\train', accuracy.graph)
test_writer = tf.summary.FileWriter(r'.\deep_learn\tf-rnn-attention\test', accuracy.graph)

session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

saver = tf.train.Saver()


with tf.Session(config=session_conf) as sess:
    sess.run(tf.global_variables_initializer())
    print("Start learning...")
    for epoch in range(NUM_EPOCHS):
        loss_train = 0
        loss_test = 0
        accuracy_train = 0
        accuracy_test = 0

        print("epoch: {}\t".format(epoch), end="")

        # Training
        num_batches = X_train.shape[0] // BATCH_SIZE
        for b in range(num_batches):
            x_batch, y_batch = next(train_batch_generator)
            seq_len = np.array([list(x).index(0) + 1 for x in x_batch])# actual lengths of sequences
            print(seq_len)
            
            # print(seq_len)
            loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                feed_dict={batch_ph: x_batch,
                                                           target_ph: y_batch,
                                                           seq_len_ph: seq_len,
                                                           keep_prob_ph: KEEP_PROB})
            accuracy_train += acc
            loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
            train_writer.add_summary(summary, b + num_batches * epoch)
        accuracy_train /= num_batches

        # Testing
        num_batches = X_test.shape[0] // BATCH_SIZE
        for b in range(num_batches):
            x_batch, y_batch = next(test_batch_generator)
            seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
            loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                     feed_dict={batch_ph: x_batch,
                                                                target_ph: y_batch,
                                                                seq_len_ph: seq_len,
                                                                keep_prob_ph: 1.0})
            accuracy_test += acc
            loss_test += loss_test_batch
            test_writer.add_summary(summary, b + num_batches * epoch)
        accuracy_test /= num_batches
        loss_test /= num_batches

        print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
            loss_train, loss_test, accuracy_train, accuracy_test
        ))
    train_writer.close()
    test_writer.close()
    saver.save(sess, MODEL_PATH)
    print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")



Start learning...
epoch: 0	[170 142 233 250 101 178 250 154  75  84  62 191 222 250 148  53 121 107
 250 209 250 160 242 139 122 128  86 131  50 250 125 150  74 250 211 250
 183  66 146 177 250  97 250 151 142 178 124 250 244 147  66 215 162 250
 129 250 132 169 145 129 250 115 125 250 101 210 250 127 250 154 169 123
 119 250 185 250 152 250 242 169 178 140  39 119 157 136  74 130 188 133
 250 250 133 233 250 108 187 218 145 144 250 164 250 203 173 250 115 250
 142 188 194 250 250 174 154 149 169 125 180 113 156 222 250 250 250 250
 250 191 220 157 250 250 189 250 250 128 186 176  46 121 183 113 216 250
  51  40 250 250 250 186 250 206 250 158 250  63  39 193 199 183 119 145
 157 156 139 149  65 135 172  76 250 122 140 250 201 250 250 227 210 218
 116 121 250 114 123 250 123 250 150 250 250 250 162 168 248 250 250  50
 159 125 183 250 250 250 164 155 250 220 181  50 243 250 250 203 250 105
 153 140 189 183 174 114 246 141 250 151 107 143 167 250 214 120 130 185
 246 144 250 127 138  58

KeyboardInterrupt: 