In [1]:
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Flatten, Dense, Dropout, BatchNormalization
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
df = pd.read_csv("data/movieReviews_trainLabel.tsv", sep="\t", index_col="id")
print(df.shape)
df.head(1)

(25000, 2)


Unnamed: 0_level_0,sentiment,review
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5814_8,1,With all this stuff going down at the moment w...


# Tokenize text

In [3]:
vocab_size = 10000
max_lenght = 300

t = Tokenizer(num_words=vocab_size)
t.fit_on_texts(df.review)

In [4]:
# create sequences to feed into Neural network model
sequences = t.texts_to_sequences(df.review)

# As the average length of all reviews is around 250, 
# lets the keep the input dim to 250 and pad the sequences if it is less that 250 words
sequences = pad_sequences(sequences, maxlen=max_lenght)

In [5]:
print(sequences.shape)
sequences

(25000, 300)


array([[ 261,   11,    6, ...,   21,    1, 1559],
       [   0,    0,    0, ...,   27,   91, 5674],
       [3348,   28,    4, ..., 1321,    4, 5514],
       ...,
       [   0,    0,    0, ...,    7,  339,  155],
       [   0,    0,    0, ...,   16,   82,   81],
       [   0,    0,    0, ...,   14,    3,  504]], dtype=int32)

# Model: Recurrent Neural Net (Embedding + LTSM)

In [6]:
# Network architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_lenght, name='embed'))
model.add(Bidirectional(LSTM(32, return_sequences=True), name='lstm'))
model.add(GlobalMaxPool1D(name='pooling'))
model.add(Dense(20, name='dense'))
# model.add(Flatten(name='flatten'))
model.add(Dropout(0.05, name='droput'))
model.add(Dense(1, activation='sigmoid', name='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed (Embedding)            (None, 150, 128)          1280000   
_________________________________________________________________
lstm (Bidirectional)         (None, 150, 64)           41216     
_________________________________________________________________
pooling (GlobalMaxPooling1D) (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 20)                1300      
_________________________________________________________________
droput (Dropout)             (None, 20)                0         
_________________________________________________________________
softmax (Dense)              (None, 1)                 21        
Total params: 1,322,537
Trainable params: 1,322,537
Non-trainable params: 0
____________________________________________

In [7]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(sequences, df.sentiment.values, validation_split=0.2, epochs=5, batch_size=128, verbose=2)

ValueError: Error when checking input: expected embed_input to have shape (150,) but got array with shape (300,)