# Text Classification - sentence level Attentional RNN
    In this post, the model is a recurrent neural network with an attention-based LSTM/GRU encoder.
    The attention network is implemented on top of the LSTM/GRU encoder for the classification task.

In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Concatenate, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model, Sequential

from keras.engine.topology import Layer, InputSpec
from keras import initializers

from keras import backend as K

Using TensorFlow backend.


## (1) Parameter setting

In [3]:
# Length every review is padded to (passed as `maxlen` to pad_sequences).
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
MAX_NB_WORDS = 20000
# Size of each word-embedding vector (output dimension of the Embedding layers).
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2

## (2) Data input
Data are from [IMDB Dataset](https://www.kaggle.com/c/word2vec-nlp-tutorial/data)
All movie reviews are classified as positive (sentiment = 1) or negative (sentiment = 0).

In [4]:
# Load the labeled IMDB reviews (tab-separated file), then report the table
# size and preview the first three rows.
data_train = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
print(data_train.shape)
data_train.head(3)

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


## (3) Data preprocess -- remove some characters

In [5]:
def clean_str(string):
    """
    Normalize one raw review string before tokenization.

    Removes every backslash, single quote and double quote, trims leading and
    trailing whitespace, and lower-cases the result.
    """
    # A single character class covers the three characters that are dropped.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    return without_quotes.strip().lower()

# Strip the HTML markup from every review, normalize the remaining text, and
# collect the cleaned reviews together with their sentiment labels
# (both lists keep the row order of data_train).
texts = []
labels = []

for raw_review, sentiment in zip(data_train.review.values, data_train.sentiment.values):
    soup = BeautifulSoup(raw_review, 'lxml')
    texts.append(clean_str(soup.get_text()))
    labels.append(sentiment)

## (3) Data preprocess -- data and label
Use Keras function to process the data

In [6]:
# Fit a tokenizer on the cleaned reviews and convert each review into a list
# of integer word ids; num_words is set to the MAX_NB_WORDS vocabulary cap.
tokenizer=Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word_index maps tokens seen during fitting to their integer ids; the count
# printed below is the full vocabulary size and can exceed MAX_NB_WORDS.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 81501 unique tokens.


`sequences` is a list of lists containing 25000 reviews; each review is a list of the integer ids of its words. Every sequence is then padded (or truncated) to the same length, `MAX_SEQUENCE_LENGTH`.

In [7]:
# Pad the integer sequences so every review has MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# One-hot encode the 0/1 sentiment labels (two columns, one per class).
# NOTE(review): `labels` is rebound from the raw label list here, so re-running
# this cell without first re-running the cell that builds `labels` would
# one-hot encode the already encoded array again.
labels = to_categorical(np.asarray(labels))
labels = np.asarray(labels, dtype = np.float32)
print('Shape of data tensor: ', data.shape)
print('Shape of label tensor: ',labels.shape)

Shape of data tensor:  (25000, 1000)
Shape of label tensor:  (25000, 2)


## (3) Data preprocess -- train data and validation data


In [8]:
# Shuffle samples and labels with one shared permutation, then hold out the
# last VALIDATION_SPLIT fraction of the shuffled data for validation.
# A dedicated, seeded generator makes the split identical on every
# "Restart & Run All"; with the unseeded global generator the membership of the
# training and validation sets changed on each run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

indices = np.arange(data.shape[0])
split_rng.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# `x[:-0]` is an empty slice, so a split size of 0 would silently yield an
# empty training set; fail loudly if either side of the split would be empty.
assert 0 < num_validation_samples < data.shape[0], \
    'VALIDATION_SPLIT must leave both the training and validation sets non-empty'

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

# Column sums of the one-hot labels give the per-class counts in each split.
print ('Number of negative and positive reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of negative and positive reviews in training and validation set
[ 10029.   9971.]
[ 2471.  2529.]


## (4) Create the BiLSTM model
![BiLSTM](https://raw.githubusercontent.com/WistariaDing/TextClassification/master/picture/bi_lstm1.jpg)
A unidirectional LSTM only preserves information from the past, because the only inputs it has seen are from the past.

Using bidirectional LSTM will run your inputs in two ways, one from past to future and one from future to past. 

    The difference between LSTM and BiLSTM is that:
    LSTM runs forwards while BiLSTM runs both forwards and backwards which is able to preserve information from both past and future.

In Keras, there are several merge modes, i.e. ways in which the forward and backward outputs are combined before being passed on to the next layer.
The default mode is to concatenate, and this is the method often used in studies of bidirectional LSTMs.

In [9]:
# Baseline classifier: trainable word embeddings -> bidirectional LSTM
# (100 units per direction) -> 2-way softmax over the negative/positive classes.
# NOTE(review): the embedding table has len(word_index) + 1 rows while the
# tokenizer was created with num_words=MAX_NB_WORDS — confirm whether the
# smaller vocabulary size would be sufficient here.
model = Sequential([
    Embedding(len(word_index) + 1,
              EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(100)),
    Dense(2, activation='softmax'),
])

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         8150200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 402       
Total params: 8,311,402
Trainable params: 8,311,402
Non-trainable params: 0
_________________________________________________________________


## (5) Training

In [10]:
print("model fitting - Bidirectional LSTM")
print("----------The fitting process is ignored here.")
# Train on the training split and evaluate on the validation split.
# NOTE(review): despite the message printed above, this call does train the
# model when the cell is executed; presumably only the training log was left
# out of the saved notebook.
model.fit(x_train, y_train, 
          validation_data=(x_val,y_val),
          epochs=10,batch_size=50)

model fitting - Bidirectional LSTM
----------The fitting process is ignored here.


## (6) Attention model
    CNTK has no reverse function when return_sequences = True in LSTM, so TensorFlow is chosen as the backend of Keras.

In [11]:
class AttLayer(Layer):
    """Attention pooling over the time axis of a 3-D sequence tensor.

    For an input of shape (batch, timesteps, features) the layer scores every
    timestep with ``tanh(x . W)``, normalizes the exponentiated scores over the
    timesteps, and returns the attention-weighted sum of the timesteps, i.e. a
    tensor of shape (batch, features).

    The ``mask`` argument of ``call`` is accepted but not used, so every
    timestep of the input takes part in the weighting.
    """

    def __init__(self, **kwargs):
        # Initializer used for the scoring vector W created in build().
        self.init = initializers.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the (features, 1) scoring weight for a 3-D input shape."""
        assert len(input_shape) == 3
        # Register W through add_weight so it is a real variable that Keras
        # tracks, trains and saves. Calling the initializer directly only
        # yields its output tensor, and assigning `trainable_weights` by hand
        # bypasses the layer's own weight bookkeeping.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over its timestep axis."""
        # Unnormalized score per timestep: (batch, timesteps, 1).
        eij = K.tanh(K.dot(x, self.W))
        ai = K.exp(eij)
        # keepdims=True keeps the normalizer at (batch, 1, 1) so it broadcasts
        # against (batch, timesteps, 1). Without it the sum has shape
        # (batch, 1), which only broadcasts when batch == timesteps.
        weights = ai / K.sum(ai, axis=1, keepdims=True)
        # Scale every timestep by its weight and sum the timesteps away.
        weighted_input = x * weights
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The timestep axis is reduced: (batch, timesteps, features) -> (batch, features)."""
        return (input_shape[0], input_shape[-1])

In [12]:
# Trainable embedding table for the attention model.
# NOTE(review): sized len(word_index) + 1 although the tokenizer was created
# with num_words=MAX_NB_WORDS — confirm whether the smaller size would suffice.
embedding_layer = Embedding(len(word_index) +1,
                    EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH)
# One padded review of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# return_sequences=True keeps one output per timestep so AttLayer can weight them.
l_gru = Bidirectional(GRU(100,return_sequences=True))(embedded_sequences)
# Collapse the timestep axis into one attention-weighted vector per review.
l_att = AttLayer()(l_gru)
preds = Dense(2, activation='softmax')(l_att)
# Rebinds `model`: from here on it refers to the attention network rather than
# the BiLSTM baseline built earlier in the notebook.
model = Model(sequence_input,preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1000, 100)         8150200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000, 200)         120600    
_________________________________________________________________
att_layer_1 (AttLayer)       (None, 200)               200       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
Total params: 8,271,402
Trainable params: 8,271,402
Non-trainable params: 0
_________________________________________________________________


In [13]:
# NOTE(review): the banner says "LSTM", but the recurrent layer of the
# attention model built in the previous cell is a bidirectional GRU.
print("model fitting - Attentional LSTM")
print("----------The fitting process is ignored here.------------------------")
# NOTE(review): despite the message printed above, this call does train the
# model (for one epoch) when the cell is executed; presumably only the training
# log was left out of the saved notebook.
model.fit(x_train, y_train, 
          validation_data=(x_val,y_val),
          epochs=1,batch_size=50)

model fitting - Attentional LSTM
----------The fitting process is ignored here.------------------------
