# Text classification using RNN

**Dataset**: IMDB movie reviews, bundled with Keras (`from keras.datasets import imdb`)

In [8]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [2]:
# Vocabulary cap handed to the loader as num_words: only the top 5000 words are considered.
top_words = 5000

# Fetch both IMDB splits, then unpack features and labels for each one.
train_split, test_split = imdb.load_data(num_words=top_words)
X_train, y_train = train_split
X_test, y_test = test_split

In [4]:
# Fixed length that every review sequence is brought to before it reaches the model.
max_review_length = 500

# Apply the same padding call to the train and test features in one statement.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (X_train, X_test)
)

## Implementing LSTM

In [7]:
# Baseline sentiment classifier: Embedding -> LSTM -> single sigmoid output unit.
embedding_vector_length = 32  # size of each learned word vector
model = Sequential()
model.add(Embedding(top_words,
                    embedding_vector_length,
                    input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# `epochs` replaces the deprecated `nb_epoch` keyword of fit() (renamed in Keras 2).
history = model.fit(X_train, y_train, epochs=4, batch_size=64, verbose=2)

# With metrics=['accuracy'], evaluate() returns [loss, accuracy]; index 1 is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=2)
# A parenthesised single-argument print is valid on both Python 2 and Python 3.
print("Accuracy: " + str(scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 213,301.0
Trainable params: 213,301
Non-trainable params: 0.0
_________________________________________________________________




Epoch 1/4
909s - loss: 0.5065 - acc: 0.7398
Epoch 2/4
516s - loss: 0.2829 - acc: 0.8848
Epoch 3/4
505s - loss: 0.2309 - acc: 0.9107
Epoch 4/4
498s - loss: 0.2105 - acc: 0.9174
Accuracy: 85.936


## LSTM with Dropout

In [9]:
# Regularised variant: Embedding -> LSTM -> Dropout -> single sigmoid output unit.
embedding_vector_length = 32  # size of each learned word vector
model = Sequential()
model.add(Embedding(top_words,
                    embedding_vector_length,
                    input_length=max_review_length))
model.add(LSTM(100))
# Dropout sits between the LSTM output and the classification layer.
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# `epochs` replaces the deprecated `nb_epoch` keyword of fit() (renamed in Keras 2).
history = model.fit(X_train, y_train, epochs=2, batch_size=64, verbose=2)

# With metrics=['accuracy'], evaluate() returns [loss, accuracy]; index 1 is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=2)
# A parenthesised single-argument print is valid on both Python 2 and Python 3.
print("Accuracy: " + str(scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 213,301.0
Trainable params: 213,301.0
Non-trainable params: 0.0
_________________________________________________________________
Epoch 1/2
669s - loss: 0.4753 - acc: 0.7702
Epoch 2/2
576s - loss: 0.3096 - acc: 0.8731
Accuracy: 86.948
