In [2]:
# Restrict this process to a single GPU (set ahead of the TensorFlow import below)
import os

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # number devices the same way nvidia-smi does
    "CUDA_VISIBLE_DEVICES": "0",        # use "0, 1" to expose several GPUs
})

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the device names of every local device whose type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        # Skip CPUs and any other non-GPU device entries
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Show which GPUs are visible after the CUDA_VISIBLE_DEVICES restriction
print(get_available_gpus())

from utils import load_data

# load_data is a project helper returning seven objects that the rest of the
# notebook treats as globals (sequences, test sequences, labels, the
# word<->index lookups, `vs`, and the embedding matrix).
# NOTE(review): the meaning of the 'word' / 'glove' arguments and of `vs` is not
# visible here - presumably word-level tokens with GloVe vectors; confirm in utils.
seq_arr, test_seq_arr, labels, word_index, index_word, vs, embedding_matrix = load_data('word', 'glove')
# Recorded run: (1099063, 30), (56370, 30), (59728, 300)
seq_arr.shape, test_seq_arr.shape, embedding_matrix.shape

['/device:GPU:0']


Using TensorFlow backend.


((1099063, 30), (56370, 30), (59728, 300))

In [44]:
from utils import f1
from keras import callbacks
from timeit import default_timer as timer
from keras import models, losses, metrics, layers, optimizers
from keras.callbacks import *
from keras.utils import multi_gpu_model
import tensorflow as tf
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import keras

import numpy as np
import pandas as pd

# Number of token slots kept per clause (last axis of the model input)
MAX_SEN_LEN = 15
# Number of clauses kept per sequence (middle axis of the model input)
MAX_SEN = 5


from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

from timeit import default_timer as timer

In [None]:
def format_clause_data(sequences,
                max_sen, max_sen_len,
                punc = ['.', ',', '?', '!', ';', ':']):
    """Break index sequences into punctuation-terminated clauses.

    Each sequence is scanned left to right; a clause ends at (and includes)
    the first token whose index belongs to one of the `punc` marks. At most
    `max_sen` clauses are kept per sequence, and every clause is padded or
    truncated to `max_sen_len` entries by `pad_sequences` (library defaults).
    Tokens that follow the last punctuation mark of a sequence never close a
    clause and are therefore not stored.

    Relies on the notebook-level `word_index` mapping to translate the
    punctuation marks into token indices.

    Args:
        sequences: iterable of index sequences (e.g. a 2D array, one row per text).
        max_sen: maximum number of clauses stored per sequence.
        max_sen_len: number of token slots per clause.
        punc: punctuation marks that terminate a clause; each must be a key
            of `word_index`.

    Returns:
        Array of shape (len(sequences), max_sen, max_sen_len); unused slots are 0.

    Raises:
        KeyError: if a mark in `punc` is missing from `word_index`.
    """

    # A set keeps the per-token punctuation test constant-time
    punc_idx = {word_index[mark] for mark in punc}

    # Data is initially all 0s
    data = np.zeros((len(sequences), max_sen, max_sen_len))

    start = timer()

    # Iterate over the argument (not the notebook-level seq_arr) so that slices
    # and other datasets are formatted correctly
    for i, s in enumerate(sequences):
        # Track progress
        if (i + 1) % 10000 == 0:
            print(f'{100 * i / len(sequences):.2f}% complete.', end = '\r')

        # Clauses is a list of lists
        clauses = []
        # Track is the clause currently being accumulated
        track = []

        # Iterate through the sequence
        for idx in s:
            # Stop as soon as enough clauses have been collected
            if len(clauses) == max_sen:
                break

            # Record the index
            track.append(idx)

            # Punctuation closes the current clause
            if idx in punc_idx:
                clauses.append(track)
                # Reset the tracker
                track = []

        # Record the found clauses padded to the maximum length; rows beyond
        # the number of clauses found stay zero
        if clauses:
            data[i, :len(clauses), :] = pad_sequences(clauses, max_sen_len)

    print(f'Formatted in {timer() - start:.2f} seconds.')
    print('Final data shape: ', data.shape)
    return data

# Build the (sequences, MAX_SEN, MAX_SEN_LEN) clause array used as model input
data = format_clause_data(seq_arr, max_sen = MAX_SEN, max_sen_len = MAX_SEN_LEN)
# Alternative: reload a previously saved array instead of recomputing
# data = np.load('word_clause_data.npy')
data.shape

2.73% complete.

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import re

import sys
import os


from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import *
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, constraints


# Embedding layer initialised from the pre-computed matrix: one row per
# vocabulary entry, one column per embedding dimension. The weights remain
# trainable and index 0 is not masked.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=True,
    mask_zero=False,
)


def dot_product(x, kernel):
    """
    Backend-agnostic dot product of an input tensor with a weight tensor.
    Args:
        x (): input
        kernel (): weights
    Returns:
        On non-TensorFlow backends, `K.dot(x, kernel)`. On TensorFlow, the
        kernel receives a trailing axis before the dot and that axis is
        squeezed out of the result.
    """
    # Every backend other than TensorFlow takes the plain dot directly
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    expanded_kernel = K.expand_dims(kernel)
    projected = K.dot(x, expanded_kernel)
    return K.squeeze(projected, axis=-1)


class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention.

    Masking: if a mask is handed to `call`, masked steps get zero attention
    weight before re-normalisation. The mask is never propagated to following
    layers (`compute_mask` returns None) and `supports_masking` is set to False.

    # Arguments
        W_regularizer, u_regularizer, b_regularizer: regularizers for the
            projection matrix, the context vector and the bias.
        W_constraint, u_constraint, b_constraint: constraints for the same weights.
        bias: whether to add a bias before the tanh.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = False
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x features), optional bias b and context vector u."""
        assert len(input_shape) == 3

        # Shape and name are passed by keyword so the call does not depend on
        # the positional order of add_weight's first arguments.
        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the steps axis."""
        # Hidden representation of every step: tanh(x W + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # Similarity of every step with the context vector u
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis and sum over steps
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Serialise the constructor options so a saved model can rebuild the layer."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [None]:
# Clause-level encoder: embedding -> bidirectional GRU -> attention -> dense
sentence_input = Input(shape=(MAX_SEN_LEN,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
# `regularizers` is imported together with the layer definitions above, so this
# cell runs on a fresh kernel without needing an alias created in a lower cell.
l_lstm = Bidirectional(CuDNNGRU(5, return_sequences=True, 
                                kernel_regularizer=regularizers.l2()))(embedded_sequences)
# Collapse the per-token GRU outputs into one vector per clause
l_att = AttentionWithContext()(l_lstm)
dense = Dense(32, activation = 'relu')(l_att)
dense = Dropout(0.5)(dense)
sentEncoder = Model(sentence_input, dense)
sentEncoder.summary()

In [None]:
# Sequence-level encoder: apply the clause encoder to each of the MAX_SEN
# clauses, then run a bidirectional GRU with attention over the clause vectors
review_input = Input(shape=(MAX_SEN, MAX_SEN_LEN), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
# `regularizers` is imported together with the layer definitions above, so this
# cell runs on a fresh kernel without needing an alias created in a lower cell.
l_lstm_sent = Bidirectional(CuDNNGRU(5, return_sequences=True, 
                                     kernel_regularizer=regularizers.l2()))(review_encoder)
l_att_sent = AttentionWithContext()(l_lstm_sent)

In [None]:
# Classification head on top of the attended clause representation
dense = Dense(32, activation = 'relu')(l_att_sent)
dense = Dropout(0.5)(dense)
# Single sigmoid unit: binary target
preds = Dense(1, activation='sigmoid')(dense)
model = Model(review_input, preds)

# Adam with default settings; the loss is also reported as a metric next to
# accuracy and the project's f1 helper (imported from utils)
model.compile(loss='binary_crossentropy',
              optimizer=optimizers.Adam(),
              metrics=['binary_crossentropy',
                       'acc', f1])
model.summary()

In [None]:
# Inspect the model's input tensors
model.inputs

In [None]:
model_name = 'word_han'

# The checkpoint file is written under models/, so make sure the folder exists
# before training starts rather than failing at the first save
os.makedirs('models', exist_ok=True)

# Stop after 4 epochs without val_loss improvement; keep only the best weights
callback_list = [EarlyStopping(monitor = 'val_loss', patience = 4),
                 ModelCheckpoint(f'models/{model_name}.h5', monitor = 'val_loss',
                             save_best_only = True)]

print("model fitting - Hierachical attention network")
# 40% of the data is held out for validation
model.fit(data, labels, validation_split = 0.4,
          epochs=10, batch_size=1024, callbacks = callback_list)

In [43]:
from keras import regularizers as reg

In [None]:
# Scratch check: build an L1 regularizer with default arguments
# (uses the `reg` alias imported in the cell above)
reg.l1()

In [None]:
# Scratch check of the clause splitting on a single sequence.
# Punctuation marks are defined here so the cell does not rely on a later cell.
punc = ['.', ',', '?', '!', ';', ':']

s = seq_arr[100]

max_sen = 3
max_sen_len = 10

data = np.zeros(shape = (1, max_sen, max_sen_len))

clauses = []
track = []
j = 0

for idx in s:
    # Stop once the output array is full; otherwise the assignment below
    # would receive more clause rows than `data` has room for
    if j == max_sen:
        break

    w = index_word[idx]
    track.append(idx)
    # A punctuation word closes the current clause
    if w in punc:
        j += 1
        clauses.append(track)
        track = []

data[0, 0:j, :] = pad_sequences(clauses, max_sen_len)

In [None]:
# View the clause array as integers (np.zeros creates floats) for easier reading
data.astype(int)

In [None]:
# Punctuation marks treated as clause terminators and their token indices
punc = ['.', ',', '?', '!', ';', ':']
# Raises KeyError if a mark is missing from word_index
punc_idx = [word_index[i] for i in punc]
punc_idx

In [None]:
# Trial run of the clause splitting on the first `trial_num` sequences.
# NOTE(review): this repeats the loop body of format_clause_data inline; it
# looks like the prototype that function was derived from.
punc = ['.', ',', '?', '!', ';', ':']
punc_idx = [word_index[i] for i in punc]

# Number of sequences to try
trial_num = 1000

# Maximum number of sentences
max_sen = 3
# Maximum words per sentence
max_sen_len = 10

# Data is initially all 0s
data = np.zeros((trial_num, max_sen, max_sen_len))

# Iterate through the sequences
for i, s in enumerate(seq_arr[:trial_num]):    
    
    # Clauses is a list of lists
    clauses = []
    # Track is a single list
    track = []
    
    # Number of clauses
    j = 0

    # Iterate through the sequence
    for idx in s:
        # Stop once max_sen clauses have been collected
        if j == max_sen:
            break
        
        # Record the index
        track.append(idx)
        
        # If we find punctuation
        if idx in punc_idx:
            j += 1
            clauses.append(track)
            # Reset the tracker
            track = []

    # Record the found clauses padded to the maximum length
    # (tokens after the last punctuation mark never close a clause and are dropped)
    data[i, 0:j, :] = pad_sequences(clauses, max_sen_len)
    
data.shape

In [None]:
# Rebuild the clause array with 3 clauses of 10 tokens per sequence.
# NOTE(review): this overwrites `data` with a shape that differs from the
# (MAX_SEN, MAX_SEN_LEN) = (5, 15) input the model is built for - confirm
# this is intended before feeding it to the model.
data = format_clause_data(seq_arr, max_sen = 3,
                          max_sen_len = 10)

In [None]:
# Persist the current clause array so it can be reloaded with np.load
np.save('word_clause_data.npy', data)