# Introduction: Hierarchical Attention Network

In this notebook, we develop a hierarchical attention network for text classification. This architecture is one of many approaches competing to be the most accurate method for natural language processing. 

In [53]:
# Restrict this process to a single GPU. The environment variables are set
# before TensorFlow is imported below.
import os
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # so the IDs match nvidia-smi
    "CUDA_VISIBLE_DEVICES": "0",        # "0, 1" for multiple
})

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

print(get_available_gpus())

from utils import load_data

# Load the sequences, labels, vocabulary lookups and embedding matrix through
# the project helper `load_data`.
# NOTE(review): the arguments presumably select word-level tokens and GloVe
# vectors -- confirm against utils.load_data.
seq_arr, test_seq_arr, labels, word_index, index_word, vs, embedding_matrix = load_data('word', 'glove')
# Show the shapes of both sequence arrays and of the embedding matrix
seq_arr.shape, test_seq_arr.shape, embedding_matrix.shape

['/device:GPU:0']


((1099063, 30), (56370, 30), (59728, 300))

In [54]:
from utils import f1

from timeit import default_timer as timer

from keras.callbacks import *
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, constraints
from keras.utils import multi_gpu_model
from keras import regularizers as reg
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
import re


from collections import defaultdict
import re
import sys


import numpy as np
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from timeit import default_timer as timer

The cell below defines the model parameters. `MAX_SEN` is the maximum number of sentences (clauses) for each question while `MAX_SEN_LEN` is the maximum number of words in a sentence.

In [None]:
# Maximum number of words kept in one sentence (clause)
MAX_SEN_LEN = 20
# Maximum number of sentences (clauses) kept for each question
MAX_SEN = 5

## Format Data

Hierarchical attention networks require input of shape `[batch_size, sentences, words]`. We are dealing with questions, which often don't have more than one sentence. Therefore, we can split the questions into clauses based on punctuation. 

Each question will be broken into a maximum of 5 clauses, each of which has a maximum length of 20 words. The function below accomplishes this for all of the sequences.

In [55]:
def format_clause_data(sequences,
                max_sen, max_sen_len,
                punc = ('.', ',', '?', '!', ';', ':')):
    """Break token sequences into punctuation-delimited clauses.

    Args:
        sequences: collection of token-index sequences, one per question.
            Must support ``len`` and iteration.
        max_sen: maximum number of clauses recorded for each sequence.
        max_sen_len: number of token slots in each clause.
        punc: punctuation tokens that terminate a clause. Each one is looked
            up in the notebook-level ``word_index`` mapping, so a token that
            is missing from it raises ``KeyError``.

    Returns:
        numpy array of shape ``(len(sequences), max_sen, max_sen_len)``.
        Slots that receive no clause stay 0.

    Notes:
        * A clause includes its terminating punctuation token.
        * Only completed clauses are recorded: tokens that follow the last
          punctuation mark of a sequence are dropped, and a sequence without
          any punctuation produces an all-zero entry.
        * Padding/truncation to ``max_sen_len`` is delegated to
          ``pad_sequences`` with its default settings.
    """

    # Get indexes of punctuation (a set gives constant-time membership tests)
    punc_idx = {word_index[p] for p in punc}

    n_sequences = len(sequences)

    # Data is initially all 0s
    data = np.zeros((n_sequences, max_sen, max_sen_len))

    start = timer()

    # Iterate through the sequences that were passed in. The previous version
    # looped over the global `seq_arr`, which ignored the `sequences` argument.
    for i, s in enumerate(sequences):
        # Track progress
        if (i + 1) % 10000 == 0:
            print(f'{100 * i / n_sequences:.2f}% complete.', end = '\r')

        # Clauses is a list of lists
        clauses = []
        # Track is a single list holding the clause currently being built
        track = []

        # Iterate through the sequence
        for idx in s:
            # If we have already found enough clauses
            if len(clauses) == max_sen:
                break

            # Record the index
            track.append(idx)

            # Punctuation closes the current clause
            if idx in punc_idx:
                clauses.append(track)
                # Reset the tracker
                track = []

        # Record the found clauses padded to the maximum length
        if clauses:
            data[i, 0:len(clauses), :] = pad_sequences(clauses, max_sen_len)

    print(f'Formatted in {timer() - start:.2f} seconds.')
    print('Final data shape: ', data.shape)
    return data

# Build the clause array for the training sequences (the recorded run took ~304 s)
data = format_clause_data(seq_arr, max_sen = MAX_SEN, max_sen_len = MAX_SEN_LEN)
# Alternative: reload the array written by the np.save cell further down
# data = np.load('word_clause_data.npy')
data.shape

Formatted in 304.08 seconds.
Final data shape:  (1099063, 5, 20)


(1099063, 5, 20)

In [56]:
# Decode every clause of the question at index 1 back into words
example = data[1]

for clause in example:
    print([index_word[token] for token in clause])

['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'Do', 'you', 'have', 'an', 'adopted', 'dog', ',']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'how', 'would', 'you', 'encourage', 'people', 'to', 'adopt', 'and', 'not', 'shop', '?']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [57]:
# Decode every clause of the question at index 100 back into words
example = data[100]

for clause in example:
    print([index_word[token] for token in clause])

['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'What', 'advice', 'do', 'you', 'have', 'for', 'anyone', 'who', 'wishes', 'to', 'accomplish', 'what', 'you', 'have', '?']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


# Define Model

The next few cells define the model. We start off with an embedding layer for the words. We use pre-trained embeddings, but set the embeddings to be trainable.

In [58]:
# Word embedding layer initialised from the pre-trained matrix. The vectors
# stay trainable, and index 0 is not masked.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            mask_zero=False,
                            trainable=True)

In [59]:
def dot_product(x, kernel):
    """
    Backend-agnostic dot product of an input with a weight vector/matrix.

    On the TensorFlow backend the kernel is given a trailing axis before the
    dot and that axis is squeezed away from the result; on any other backend
    ``K.dot`` is applied directly.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The result of the backend dot product.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    column_kernel = K.expand_dims(kernel)
    projected = K.dot(x, column_kernel)
    return K.squeeze(projected, axis=-1)

## Attention Layer

This is the main layer of the network. It uses attention with context to process sequences.

In [60]:
class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Masking
        `call` applies a mask when one is passed in, but this version sets
        `supports_masking = False` and `compute_mask` always returns None,
        so no mask is forwarded to the following layers.
    # Arguments
        W_regularizer, u_regularizer, b_regularizer: regularizers for the
            projection matrix `W`, the context vector `u` and the bias `b`.
        W_constraint, u_constraint, b_constraint: constraints for the same
            three weights.
        bias: whether the bias `b` is created and added before the tanh.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = False
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x features), optional bias b and context vector u."""
        assert len(input_shape) == 3

        # `name` and `shape` are passed by keyword: the first positional
        # parameter of `add_weight` is the name, not the shape.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[-1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(name='{}_u'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the steps axis."""
        # Hidden representation of every step: tanh(x . W + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # Unnormalised score of every step against the context vector u
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis and sum over steps
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


## Sentence Model

In [61]:
# Processes the sentences
sentence_input = Input(shape=(MAX_SEN_LEN,), dtype='int32')
# Embed sentences
embedded_sequences = embedding_layer(sentence_input)
# Apply a bi-directional lstm
l_lstm = Bidirectional(CuDNNLSTM(10, return_sequences=True, 
                                 kernel_regularizer=reg.l2()))(embedded_sequences)
# Apply the attention layer to the entire sequence
l_att = AttentionWithContext()(l_lstm)
# Apply a dense layer
dense = Dense(32, activation = 'relu')(l_att)
# Dropout
dense = Dropout(0.5)(dense)

# Create the model
sentEncoder = Model(sentence_input, dense)
sentEncoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        (None, 20)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 20, 300)           17918400  
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 20, 20)            24960     
_________________________________________________________________
attention_with_context_14 (A (None, 20)                440       
_________________________________________________________________
dense_18 (Dense)             (None, 32)                672       
_________________________________________________________________
dropout_11 (Dropout)         (None, 32)                0         
Total params: 17,944,472
Trainable params: 17,944,472
Non-trainable params: 0
________________________________________________________________

## Review Model

This model takes the clauses as input and applies the sentence encoder model to them. The sentence encoder is applied at each time step, i.e. once per clause.

In [62]:
# Input is the clauses
review_input = Input(shape=(MAX_SEN, MAX_SEN_LEN), dtype='int32')
# Encode the clauses with the sentence encoded applied for each clause
review_encoder = TimeDistributed(sentEncoder)(review_input)

# Apply a bidirectional lstm
l_lstm_sent = Bidirectional(CuDNNLSTM(10, return_sequences=True, 
                                     kernel_regularizer=reg.l2()))(review_encoder)
# Apply the attention layer with context
l_att_sent = AttentionWithContext()(l_lstm_sent)

# Apply a fully connected layer
dense = Dense(32, activation = 'relu')(l_att_sent)
# Apply dropout
dense = Dropout(0.5)(dense)
# Make predictions
preds = Dense(1, activation='sigmoid')(dense)
model = Model(review_input, preds)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 5, 20)             0         
_________________________________________________________________
time_distributed_8 (TimeDist (None, 5, 32)             17944472  
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 5, 20)             3520      
_________________________________________________________________
attention_with_context_15 (A (None, 20)                440       
_________________________________________________________________
dense_19 (Dense)             (None, 32)                672       
_________________________________________________________________
dropout_12 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 33        
Total para

In [66]:
# Binary classification set-up: cross-entropy loss with the Adam optimizer.
# NOTE(review): `optimizers` is not imported by name in the visible import
# cell -- confirm it is in scope (e.g. `from keras import optimizers`).
model.compile(optimizer=optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=['binary_crossentropy', 'acc', f1])

In [67]:
# Inspect the model's input tensor(s)
model.inputs

[<tf.Tensor 'input_17:0' shape=(?, 5, 20) dtype=int32>]

The inputs to the model are `[batch_size, num_sentences, num_words_per_sentence]`. The main model applies the sentence model to each sentence in the question.

In [68]:
model_name = 'word_han'

# Callbacks: stop when validation loss has not improved for 4 epochs and
# keep only the best checkpoint on disk
callback_list = [
    EarlyStopping(monitor='val_loss', patience=4),
    ModelCheckpoint(f'models/{model_name}.h5', monitor='val_loss',
                    save_best_only=True),
]

# Fit with 40% of the samples held out for validation
print('Starting Training')
history = model.fit(data, labels,
                    batch_size=1024, epochs=10,
                    validation_split=0.4,
                    callbacks=callback_list)

Starting Training
Train on 659437 samples, validate on 439626 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [69]:
# Write the clause array to disk so it can be reloaded with np.load
# instead of being rebuilt by format_clause_data
np.save('word_clause_data.npy', data)

## No Pre-Trained Embeddings

In [70]:
# Same vocabulary size and embedding width as before, but no pre-trained
# weights are passed in, so the embeddings are learned from scratch.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            mask_zero=False,
                            trainable=True)

In [71]:
# Processes the sentences
sentence_input = Input(shape=(MAX_SEN_LEN,), dtype='int32')
# Embed sentences
embedded_sequences = embedding_layer(sentence_input)
# Apply a bi-directional lstm
l_lstm = Bidirectional(CuDNNLSTM(10, return_sequences=True, 
                                 kernel_regularizer=reg.l2()))(embedded_sequences)
# Apply the attention layer to the entire sequence
l_att = AttentionWithContext()(l_lstm)
# Apply a dense layer
dense = Dense(32, activation = 'relu')(l_att)
# Dropout
dense = Dropout(0.5)(dense)

# Create the model
sentEncoder = Model(sentence_input, dense)

# Input is the clauses
review_input = Input(shape=(MAX_SEN, MAX_SEN_LEN), dtype='int32')
# Encode the clauses with the sentence encoded applied for each clause
review_encoder = TimeDistributed(sentEncoder)(review_input)

# Apply a bidirectional lstm
l_lstm_sent = Bidirectional(CuDNNLSTM(10, return_sequences=True, 
                                     kernel_regularizer=reg.l2()))(review_encoder)
# Apply the attention layer with context
l_att_sent = AttentionWithContext()(l_lstm_sent)

# Apply a fully connected layer
dense = Dense(32, activation = 'relu')(l_att_sent)
# Apply dropout
dense = Dropout(0.5)(dense)
# Make predictions
preds = Dense(1, activation='sigmoid')(dense)
model = Model(review_input, preds)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 5, 20)             0         
_________________________________________________________________
time_distributed_9 (TimeDist (None, 5, 32)             17944472  
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 5, 20)             3520      
_________________________________________________________________
attention_with_context_17 (A (None, 20)                440       
_________________________________________________________________
dense_22 (Dense)             (None, 32)                672       
_________________________________________________________________
dropout_14 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 33        
Total para

In [72]:
# Binary classification set-up: cross-entropy loss with the Adam optimizer.
# NOTE(review): `optimizers` is not imported by name in the visible import
# cell -- confirm it is in scope (e.g. `from keras import optimizers`).
model.compile(optimizer=optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=['binary_crossentropy', 'acc', f1])

model_name = 'word_han_no_pretrained'

# Callbacks: stop when validation loss has not improved for 4 epochs and
# keep only the best checkpoint on disk
callback_list = [
    EarlyStopping(monitor='val_loss', patience=4),
    ModelCheckpoint(f'models/{model_name}.h5', monitor='val_loss',
                    save_best_only=True),
]

# Fit with 40% of the samples held out for validation
print('Starting Training')
history = model.fit(data, labels,
                    batch_size=1024, epochs=10,
                    validation_split=0.4,
                    callbacks=callback_list)

Starting Training
Train on 659437 samples, validate on 439626 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
