In [42]:
# CNN for the IMDB problem
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
# Fix the random seed for reproducibility. Only NumPy's global generator is
# seeded here.
# NOTE(review): the Keras backend may draw from its own RNG, in which case
# runs can still differ -- confirm and seed it separately if needed.
seed = 7
numpy.random.seed(seed)

In [33]:
# Vocabulary cap and fixed review length (in words) used by the cells below.
top_words = 8000
max_words = 300
# Load the IMDB reviews restricted to the `top_words` most frequent words.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Bring every review in both splits to exactly `max_words` entries.
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_words)
    for reviews in (X_train, X_test)
)

In [38]:
# Create the model: word embedding -> 1D convolution -> max pooling ->
# flatten -> dense hidden layer -> single sigmoid output for the binary label.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 32)           256000    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 300, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 150, 32)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 4800)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               1200250   
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 251       
Total params: 1,459,605
Trainable params: 1,459,605
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train for two epochs, reporting loss/accuracy on the test split after each.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=128,
    verbose=2,
)
# Score the trained model on the test split; index 1 of the result is the
# accuracy metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
27s - loss: 0.4811 - acc: 0.7296 - val_loss: 0.2931 - val_acc: 0.8772
Epoch 2/2
28s - loss: 0.2116 - acc: 0.9188 - val_loss: 0.2759 - val_acc: 0.8866
Accuracy: 88.66%


In [64]:
# Convert text to IMDB format.
# word_dict is the dataset's word lookup table from imdb.get_word_index();
# encode_word tests membership in it and reads each word's integer from it.
word_dict = imdb.get_word_index()
def encode_sentence(text, start_char=1, oov_char=2):
    """Encode raw text as a list of integers in the IMDB training format.

    The training data comes from ``imdb.load_data`` called with its default
    ``start_char=1`` and ``oov_char=2``: every review begins with the start
    marker, and words outside the kept vocabulary are replaced by the
    out-of-vocabulary marker rather than removed. This function reproduces
    both conventions so the model sees input shaped like its training data.

    Args:
        text: Raw review text.
        start_char: Integer prepended to the sequence, or ``None`` to omit it.
        oov_char: Integer substituted for words that ``encode_word`` rejects,
            or ``None`` to drop such words instead.

    Returns:
        list of int: The encoded sequence (not padded).
    """
    words = text_to_word_sequence(text, lower=True, split=" ")
    encoded = [] if start_char is None else [start_char]
    for word in words:
        index = encode_word(word)
        if index is not None:
            encoded.append(index)
        elif oov_char is not None:
            # Keep the word's position so neighbouring words stay the same
            # distance apart as they would in a training review.
            encoded.append(oov_char)
    return encoded
def encode_word(word, index_from=3, vocabulary=None, num_words=None):
    """Map a single word to the integer the IMDB training data uses for it.

    ``imdb.get_word_index()`` returns raw frequency ranks, but
    ``imdb.load_data`` shifts every rank by ``index_from`` (3 by default) to
    reserve the low integers for padding, start and out-of-vocabulary markers,
    and then keeps only shifted indices strictly below ``num_words``. The same
    shift and the same strict bound are applied here; an index equal to the
    vocabulary size would also fall outside the embedding table.

    Args:
        word: Lower-cased token to look up.
        index_from: Offset added to the raw rank; matches load_data's default.
        vocabulary: Word -> rank mapping. Defaults to the notebook-level
            ``word_dict``.
        num_words: Vocabulary size limit. Defaults to the notebook-level
            ``top_words``.

    Returns:
        int or None: The shifted index, or ``None`` when the word is unknown
        or its shifted index is not below ``num_words``.
    """
    if vocabulary is None:
        vocabulary = word_dict
    if num_words is None:
        num_words = top_words
    if word not in vocabulary:
        return None
    index = vocabulary[word] + index_from
    if index < num_words:
        return index
    return None

In [93]:
def SmartPredict(text):
    """Return the model's output for a single piece of raw text.

    The text is encoded with ``encode_sentence``, padded to ``max_words`` as a
    batch of one, and passed to ``model.predict``; the first value of the
    first row of the prediction is returned.
    """
    encoded = encode_sentence(text)
    batch = sequence.pad_sequences([encoded], maxlen=max_words)
    predictions = model.predict(numpy.array(batch))
    first_row = predictions[0]
    return first_row[0]
    