## Implementing sentence averaging for each sentence in a review in Keras

In [1]:
# Import all necessary libraries
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
from collections import Counter
import re
import csv

from bs4 import BeautifulSoup

import sys
import os

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations


import tensorflow as tf
# Ask TensorFlow to log device placement and to grow GPU memory on demand
# instead of reserving it all when the session starts.
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# Register the session with Keras. Without this call Keras lazily creates its
# own default session and the options configured above never reach the model.
K.set_session(sess)

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


In [2]:
# Hyper-parameters shared by the preprocessing and model cells below.
VALIDATION_SPLIT = 0.2    # fraction of the training reviews held out for validation
EMBEDDING_DIM = 300       # width of each word vector
MAX_NB_WORDS = 100000     # vocabulary cap handed to the tokenizer
MAX_SENT_LENGTH = 100     # word slots kept per sentence
MAX_SENTS = 50            # sentence slots kept per review (an alternative run used 15)

In [3]:
# Function to clean reviews
def clean_str(string):
    """
    Normalise one raw review into lower-cased, space-separated text.

    Steps, in order:
      1. ``<br />`` tags become a space.
      2. Every character other than ASCII letters, digits, ``!``, ``.``,
         the apostrophe and the backtick becomes a space.
      3. The clitics ``'s``, ``'ve``, ``'re``, ``'d`` and ``'ll`` are split
         off from the preceding word; a detached `` 't`` is glued back on.
      4. ``!`` is padded with spaces so that it forms its own token.
      5. Runs of whitespace collapse to a single space.
      6. Remaining apostrophes are removed, then the text is stripped and
         lower-cased.

    :param string: raw review text.
    :return: the cleaned text.
    """
    string = re.sub(r"<br />", " ", string)

    # After this step only letters, digits, "!", ".", "'", "`" and spaces
    # remain. The earlier substitutions for ",", "(", ")", "?", "\" and '"'
    # could therefore never match and have been dropped; three of them also
    # used replacement templates ("\(", "\)", "\?") that Python 3.7+ rejects
    # with re.error before any matching happens.
    string = re.sub(r"[^A-Za-z0-9!.'`]", " ", string)

    # (pattern, replacement) pairs, applied in the original order.
    clitic_rules = (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r" 't", "'t"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
    )
    for pattern, replacement in clitic_rules:
        string = re.sub(pattern, replacement, string)

    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\s{2,}", " ", string)

    # Apostrophes are removed last, so "it's" ends up as "it s".
    string = re.sub(r"'", "", string)
    return string.strip().lower()

In [4]:
# Root of the extracted IMDB dataset; the loading cells read
# <root>/<split>/<polarity>/ beneath it.
# The IMDB_DATA_DIR environment variable overrides the author's local default,
# so the notebook can run on another machine without editing this cell.
# os.path.join(..., '') guarantees the trailing separator that the loading
# cells rely on when they build paths by string concatenation.
new_dir = os.path.join(
    os.environ.get('IMDB_DATA_DIR',
                   '/home/siddharth/TensorCode/My_Code/IMDB_Dataset/'),
    '')
# Dataset splits and, inside each split, the polarity sub-folders.
folders = ['train','test']
sub_folders = ['pos','neg']

In [5]:
# Load the training reviews as [id, rating, cleaned_text] records.
# File names follow the pattern "<id>_<rating>.txt", so the rating is parsed
# from the name before the review body is read.
id_count = 1
Data = []
for i in sub_folders:
    folder_name = new_dir + folders[0] + '/' + i + '/'
    # Sort the listing so record ids are reproducible between runs (os.listdir
    # returns names in arbitrary order) and skip anything that is not a review
    # file, e.g. hidden files, which would otherwise break the rating parse.
    all_files = sorted(name for name in os.listdir(folder_name)
                       if name.endswith('.txt'))
    for j in all_files:
        # "<id>_<rating>.txt" -> "<rating>"
        rating = os.path.splitext(j)[0].rsplit('_', 1)[-1]
        with open(folder_name + j, 'r') as infile:
            review = infile.read()
        # Lower-case first so that upper-case "<BR />" tags are also matched by
        # clean_str, which already strips and lower-cases its own result.
        fin_review = clean_str(review.lower())
        # Storing the data in record form
        Data.append([str(id_count), int(rating), fin_review])
        id_count += 1
print("The total training examples are : %d" % (id_count - 1))
print("All done!")

The total training examples are : 25000
All done!


In [6]:
# Load the test reviews as [id, rating, cleaned_text] records.
# File names follow the pattern "<id>_<rating>.txt", so the rating is parsed
# from the name before the review body is read.
id_count = 1
Test_Data = []
for i in sub_folders:
    folder_name = new_dir + folders[1] + '/' + i + '/'
    # Sort the listing so record ids are reproducible between runs (os.listdir
    # returns names in arbitrary order) and skip anything that is not a review
    # file, e.g. hidden files, which would otherwise break the rating parse.
    all_files = sorted(name for name in os.listdir(folder_name)
                       if name.endswith('.txt'))
    for j in all_files:
        # "<id>_<rating>.txt" -> "<rating>"
        rating = os.path.splitext(j)[0].rsplit('_', 1)[-1]
        with open(folder_name + j, 'r') as infile:
            review = infile.read()
        # Lower-case first so that upper-case "<BR />" tags are also matched by
        # clean_str, which already strips and lower-cases its own result.
        fin_review = clean_str(review.lower())
        # Storing the data in record form
        Test_Data.append([str(id_count), int(rating), fin_review])
        id_count += 1
print("The total test examples are : %d" % (id_count - 1))
print("All done!")

The total test examples are : 25000
All done!


In [7]:
# Collect the vocabulary of the training reviews.
all_reviews = [record[2] for record in Data]
# Feed the tokens straight into a set instead of first storing every token of
# the corpus in a list. Splitting on a single space is kept as before.
train_vocab = set()
for review_text in all_reviews:
    train_vocab.update(review_text.split(' '))
# Creating a unique list of words
all_un_words = list(train_vocab)
print("Done all the words present in the data corpus!")
print("There are a total of  %d words in the data corpus." % len(all_un_words))

Done all the words present in the data corpus!
There are a total of  108520 words in the data corpus.


In [8]:
# Collect the vocabulary of the test reviews.
all_test_reviews = [record[2] for record in Test_Data]
# Feed the tokens straight into a set instead of first storing every token of
# the corpus in a list. Splitting on a single space is kept as before.
test_vocab = set()
for review_text in all_test_reviews:
    test_vocab.update(review_text.split(' '))
# Creating a unique list of words
all_test_un_words = list(test_vocab)
print("Done for all the words present in the data corpus!")
print("There are a total of  %d words in the data corpus." % len(all_test_un_words))

Done for all the words present in the data corpus!
There are a total of  107367 words in the data corpus.


In [9]:
# Pull the integer rating (second field) out of every training record.
all_labels = [record[1] for record in Data]

# Sanity check: how many labels there are and how the ratings are distributed.
print(len(all_labels))
print(Counter(all_labels))

25000
Counter({1: 5100, 10: 4732, 8: 3009, 4: 2696, 7: 2496, 3: 2420, 2: 2284, 9: 2263})


In [10]:
# Pull the integer rating (second field) out of every test record.
all_test_labels = [record[1] for record in Test_Data]

# Sanity check: how many labels there are and how the ratings are distributed.
print(len(all_test_labels))
print(Counter(all_test_labels))

25000
Counter({1: 5022, 10: 4999, 8: 2850, 4: 2635, 3: 2541, 9: 2344, 7: 2307, 2: 2302})


In [11]:
from nltk import tokenize

# Start every accumulator as its own, independent empty list.
reviews = []
labels = []
texts = []
test_reviews = []
test_labels = []
test_texts = []

In [12]:
# For training examples
# Clean and tokenize each review. Get each review as a list of sentences.
for raw_review in all_reviews:
    # Name the parser explicitly: without it bs4 picks whichever parser happens
    # to be installed and emits a warning. The reviews went through clean_str
    # when they were loaded, so no markup characters remain and the built-in
    # "html.parser" extracts the same text as any other parser would.
    text = BeautifulSoup(raw_review, "html.parser")
    text = clean_str(text.get_text().encode('ascii','ignore'))
    texts.append(text)
    # One list of sentence strings per review.
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


In [13]:
# For test examples
# Clean and tokenize each review. Get each review as a list of sentences.
for raw_review in all_test_reviews:
    # Name the parser explicitly: without it bs4 picks whichever parser happens
    # to be installed and emits a warning. The reviews went through clean_str
    # when they were loaded, so no markup characters remain and the built-in
    # "html.parser" extracts the same text as any other parser would.
    text = BeautifulSoup(raw_review, "html.parser")
    text = clean_str(text.get_text().encode('ascii','ignore'))
    test_texts.append(text)
    # One list of sentence strings per review.
    sentences = tokenize.sent_tokenize(text)
    test_reviews.append(sentences)

In [14]:
# Build the one-hot label matrix: one row per training review, 8 class columns.
# A rating below 5 lands in column rating-1; any other rating lands in column
# rating-3 (so 1-4 -> 0-3 and 7-10 -> 4-7).
labels = np.zeros((len(texts), 8), dtype='float32')

for row, rating in enumerate(all_labels):
    column = rating - 1 if rating < 5 else rating - 3
    one_hot_row = np.zeros(8)
    one_hot_row[column] = 1
    labels[row] = one_hot_row

print("Done getting the labels in the right format!")

Done getting the labels in the right format!


In [15]:
# Build the one-hot label matrix: one row per test review, 8 class columns.
# A rating below 5 lands in column rating-1; any other rating lands in column
# rating-3 (so 1-4 -> 0-3 and 7-10 -> 4-7).
test_labels = np.zeros((len(test_texts), 8), dtype='float32')

for row, rating in enumerate(all_test_labels):
    column = rating - 1 if rating < 5 else rating - 3
    one_hot_row = np.zeros(8)
    one_hot_row[column] = 1
    test_labels[row] = one_hot_row

print("Done getting the labels in the right format!")

Done getting the labels in the right format!


###### Basic checks

In [16]:
# Sanity check: number of cleaned training reviews.
len(texts)

25000

In [17]:
# Peek at one cleaned training review.
texts[1]

'the first two sequences of this movie set up the two conflicts the thematic conflict between the soldier todd and his suppressed humanity and the physical conflict between todd and his bio engineered replacement. both sequences are quite gripping in different ways. peoples screenplay falters somewhat by resolving the first of these arcs half way through the movie which means the second half is little more than a straightforward action romp. nonetheless kudos to the makers for creating an genre action piece with heart and even a bit of soul and especially to kurt russell who conveys much with very little. not a great film but one worth seeing.'

In [18]:
# Peek at one cleaned test review.
test_texts[1]

'i guess if they are not brother this film will became very common. how long were they can keep this if we were part what should they do so natural feelings so plain and barren words.but i almost cried last night blood relationship brotherhood love knot film.in another word the elder brother is very cute.if they are not brothers they wont have so many forbidden factors from the family society friends even hearts of their own at the very beginning.the elder brother is doubtful of whether he is coming out or not at the beginning .maybe the little brother being so long time with his brother and even cant got any praise from his father this made him very upset and even sad maybe this is a key blasting fuse let him feel there were no one in the world loving him except his beloved brother. and i want to say this is a so human natural feeling there is nothing to be shamed you may fell in love your mother brother sister.just a frail heart looking for backbone to rely on'

In [19]:
# Calculating the largest and smallest number of sentences found in a review.
# min()/max() over the real lengths replace the hand-written scan whose minimum
# started at a hard-coded 100 and would have been reported unchanged had every
# review contained more than 100 sentences. With no reviews both values are 0.
sentence_counts = [len(review) for review in reviews]
MAX_SENTS_1 = max(sentence_counts) if sentence_counts else 0
MIN_SENTS = min(sentence_counts) if sentence_counts else 0
print("The maximum number of sentences in a review are : %d" % MAX_SENTS_1)
print("The minimum number of sentences in a review are : %d" % MIN_SENTS)

The maximum number of sentences in a review are : 282
The minimum number of sentences in a review are : 1


In [20]:
# Fit a single tokenizer on the training and test texts together, so that the
# encoding cells find every token of either split in its word index.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
corpus = texts + test_texts
tokenizer.fit_on_texts(corpus)

# Zero-filled id tensors, one per split: (reviews, sentence slots, word slots).
tensor_tail = (MAX_SENTS, MAX_SENT_LENGTH)
data = np.zeros((len(texts),) + tensor_tail, dtype='int32')
test_data = np.zeros((len(test_texts),) + tensor_tail, dtype='int32')

In [21]:
# Spot-check the id the tokenizer assigned to one word.
tokenizer.word_index['fawn']

29845

In [22]:
# For training examples
# Create our input tensors - Replace zeros with the IDs of each word as created by the tokenizer
word_index = tokenizer.word_index
for i, sentences in enumerate(reviews):
    # Sentences beyond the first MAX_SENTS are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                # The sentence row is full; remaining words are truncated.
                break
            # .get() skips a token the tokenizer never saw instead of raising
            # KeyError, and the id is looked up once rather than twice.
            word_id = word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i,j,k] = word_id
                k += 1

print('Total %s unique tokens.' % len(word_index))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Total 102390 unique tokens.
('Shape of data tensor:', (25000, 50, 100))
('Shape of label tensor:', (25000, 8))


In [23]:
# For test data
# Create our input tensors - Replace zeros with the IDs of each word as created by the tokenizer
test_word_index = tokenizer.word_index
for i, sentences in enumerate(test_reviews):
    # Sentences beyond the first MAX_SENTS are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                # The sentence row is full; remaining words are truncated.
                break
            # .get() skips a token the tokenizer never saw instead of raising
            # KeyError, and the id is looked up once rather than twice.
            word_id = test_word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                test_data[i,j,k] = word_id
                k += 1

print('Total %s unique tokens.' % len(test_word_index))

print('Shape of data tensor:', test_data.shape)
print('Shape of label tensor:', test_labels.shape)

Total 102390 unique tokens.
('Shape of data tensor:', (25000, 50, 100))
('Shape of label tensor:', (25000, 8))


In [24]:
# Shuffle the data and split the data into train and validation as 0.8 and 0.2
# A fixed seed makes the shuffle, and therefore the split, repeatable.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice at a non-negative boundary: the former data[:-n] form returns an empty
# training set when n == 0, because -0 is simply 0.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# Each printed vector holds the per-class example counts (column sums of the
# one-hot label matrix).
print('Number of positive and negative reviews in training and validation set')
print (y_train.sum(axis=0))
print (y_val.sum(axis=0))

# Store the test data

x_test = test_data[:]
y_test = test_labels[:]

print('Number of positive and negative reviews in test set')
print (y_test.sum(axis=0))

Number of positive and negative reviews in training and validation set
[4076. 1854. 1928. 2143. 1991. 2405. 1821. 3782.]
[1024.  430.  492.  553.  505.  604.  442.  950.]
Number of positive and negative reviews in test set
[5022. 2302. 2541. 2635. 2307. 2850. 2344. 4999.]


In [25]:
# Get the word representations from the file
# The GLOVE_DIR environment variable overrides the author's local default.
GLOVE_DIR = os.environ.get("GLOVE_DIR", "/home/siddharth/TensorCode/word2vecs/")
embeddings_index = {}
# The with-block closes the file even when a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


#### With Attention Layer

In [26]:
# Keras layer which performs sentence averaging
class SentAvg(Layer):
    """Collapse axis 1 of the input by taking the mean along it.

    The layer creates no weights of its own, so the constructor and ``build``
    inherited from ``Layer`` are used unchanged.
    """

    def call(self, x, mask=None):
        """Return the mean of ``x`` over axis 1; ``mask`` is accepted but unused."""
        averaged = K.mean(x, axis=1)
        return averaged

    def get_output_shape_for(self, input_shape):
        """Return a 2-tuple of the first and last entries of ``input_shape``."""
        leading_dim, trailing_dim = input_shape[0], input_shape[-1]
        return (leading_dim, trailing_dim)

In [27]:
# Keras layer which builds the attention layer
class AttLayer(Layer):
    """Attention pooling over axis 1 of a rank-3 input.

    One trainable vector scores every position along axis 1; the scores are
    normalised to sum to one along that axis and used to form a weighted sum
    of the input.
    """

    def __init__(self, **kwargs):
        # Initialiser used in build() to create the scoring vector.
        self.init = initializations.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector with one entry per element of the last axis."""
        assert len(input_shape)==3
        last_dim = input_shape[-1]
        self.W = self.init((last_dim,))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1 (``mask`` is unused)."""
        # Project each position onto W, drop the trailing size-1 axis, squash with tanh.
        projection = K.dot(x, K.expand_dims(self.W))
        scores = K.tanh(K.squeeze(projection, axis=-1))

        # Exponentiate, then divide by the per-sample total along axis 1.
        exp_scores = K.exp(scores)
        totals = K.expand_dims(K.sum(exp_scores, axis=1), 1)
        attention = exp_scores / totals

        # Scale every position by its weight and sum the positions away.
        scaled = x * K.expand_dims(attention, 2)
        return K.sum(scaled, axis=1)

    def get_output_shape_for(self, input_shape):
        """Return a 2-tuple of the first and last entries of ``input_shape``."""
        leading_dim, trailing_dim = input_shape[0], input_shape[-1]
        return (leading_dim, trailing_dim)

In [28]:
# Build the embedding matrix: one row per tokenizer id, plus row 0, which is
# the value the id tensors are padded with.
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row keeps the random
        # initial values (it is NOT zeroed).
        continue
    embedding_matrix[i] = embedding_vector

# Embedding layer in keras, initialised from the matrix above and left trainable
embedding_layer = Embedding(
    vocab_rows,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    trainable=True,
)

In [29]:
# Two-level model.
# Sentence level: embed the word ids of one sentence and average the word vectors.
# Review level: apply the sentence encoder to every sentence slot of a review,
# pool the sentence vectors with the attention layer, then classify.

# Input: the word ids of a single sentence
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
# Look up the embedding of every word id
embedded_sequences = embedding_layer(sentence_input)
# Average the word embeddings into one vector per sentence
sent_avg = SentAvg()(embedded_sequences)
# Wrap the sentence-level graph as a model so it can be reused per sentence slot
sentEncoder = Model(sentence_input, sent_avg)

# Input: a whole review as MAX_SENTS sentences of MAX_SENT_LENGTH word ids
review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
# Run the sentence encoder over every sentence slot of the review
review_encoder = TimeDistributed(sentEncoder)(review_input)
# Pool the sentence vectors into one review vector with attention
l_att_sent = AttLayer()(review_encoder)
# Softmax classifier over the 8 rating classes
preds = Dense(8, activation='softmax')(l_att_sent)
# Full model: review word ids in, class probabilities out
model = Model(review_input, preds)
# Define the loss function, optimization technique and metric
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [30]:
# Inspect the symbolic output tensor of the attention layer.
l_att_sent

<tf.Tensor 'Sum_1:0' shape=(?, 300) dtype=float32>

In [31]:
# Inspect the symbolic output tensor of the softmax classifier.
preds

<tf.Tensor 'Softmax:0' shape=(?, 8) dtype=float32>

#### Fitting the model with Attention Mechanism

In [32]:
print("model fitting - Hierachical attention network")
# summary() prints the table itself and returns None, so wrapping it in print
# only appended a stray "None" line to the output.
model.summary()
# Train on the training split and report loss/accuracy on the validation split
# after every epoch.
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=50, batch_size=50)

model fitting - Hierachical attention network
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 50, 100)       0                                            
____________________________________________________________________________________________________
timedistributed_1 (TimeDistribut (None, 50, 300)       30717300    input_2[0][0]                    
____________________________________________________________________________________________________
attlayer_1 (AttLayer)            (None, 300)           300         timedistributed_1[0][0]          
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 8)             2408        attlayer_1[0][0]                 
Total params: 30,720,008
Trainable params: 30

<keras.callbacks.History at 0x7f11e8de5690>

#### Testing on test dataset with MAX_SENTS = 50

In [33]:
# Evaluate on the test set; entry 1 of both lists belongs to the compiled metric.
scores = model.evaluate(x_test,y_test)
metric_name, metric_value = model.metrics_names[1], scores[1]
print("\n%s: %.2f%%" % (metric_name, metric_value*100))

acc: 44.30%


In [34]:
# Predicted and true class ids: the arg-max over the class columns.
y_pred = model.predict(x_test)
Y_pred = np.argmax(y_pred,axis=1)
Y_test = np.argmax(y_test,axis=1)
# Accuracy by hand: share of positions where the two id vectors agree.
n_wrong = np.count_nonzero(Y_pred != Y_test)
accuracy = (len(Y_test) - n_wrong + 0.0)/len(Y_test)
# The same figure as reported by Keras, for comparison.
score = model.evaluate(x_test, y_test, batch_size=50, verbose=1)

print (accuracy)
print (score)

[1.4363648743629456, 0.44295999947190284]


#### Testing on test dataset with MAX_SENTS = 15

In [45]:
# Evaluate on the test set; entry 1 of both lists belongs to the compiled metric.
scores = model.evaluate(x_test,y_test)
metric_name, metric_value = model.metrics_names[1], scores[1]
print("\n%s: %.2f%%" % (metric_name, metric_value*100))

acc: 41.12%


In [46]:
# Predicted and true class ids: the arg-max over the class columns.
y_pred = model.predict(x_test)
Y_pred = np.argmax(y_pred,axis=1)
Y_test = np.argmax(y_test,axis=1)
# Accuracy by hand: share of positions where the two id vectors agree.
n_wrong = np.count_nonzero(Y_pred != Y_test)
accuracy = (len(Y_test) - n_wrong + 0.0)/len(Y_test)
# The same figure as reported by Keras, for comparison.
score = model.evaluate(x_test, y_test, batch_size=50, verbose=1)

print (accuracy)
print (score)

[1.6185614216327666, 0.41115999895334243]
