In [1]:
from os import cpu_count

import numpy as np
from gensim.models import Word2Vec
import gensim.downloader as api
from keras import Sequential
from keras.backend import clear_session
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import Activation, LSTM, Dropout, Embedding, Dense, MaxPooling1D, GlobalMaxPooling1D, Conv1D, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.constraints import unit_norm, max_norm
# Load the Dataset
from tensorflow.keras.datasets import imdb
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [8]:
# Pretrained Google News word2vec vectors, fetched through gensim's downloader API.
# NOTE(review): the name `model` is re-bound to a Keras `Sequential` in the later model cells, while the
# data-loading cell reads `model.wv` from this gensim object - this cell must run before that one.
model = api.load("word2vec-google-news-300")  # download the model and return as object ready for use


[('superb', 0.7657862901687622),
 ('marvelous', 0.7389472723007202),
 ('splendid', 0.7077070474624634),
 ('terrific', 0.6837816834449768),
 ('masterful', 0.6830281615257263),
 ('magnificent', 0.6709308624267578),
 ('dazzling', 0.6706756353378296),
 ('brilliantly', 0.6550824046134949),
 ('brilliance', 0.6550251245498657),
 ('scintillating', 0.6493905782699585)]

In [3]:
# --- Vocabulary / embedding settings ---
max_features = 1000       # vocabulary cap passed to imdb.load_data(num_words=...)
INDEX_FROM = 3            # offset applied to the raw IMDB word ranks
embedding_size = 300      # dimensionality of every word vector used below
initializer = 'he_normal'

# --- Training settings ---
batch_size = 32
epochs = 50
maxlen = 400              # every review is padded / truncated to this many tokens

# --- Convolutional classifier settings ---
hidden_dims = 500
filters = 250
kernel_size = 3

# Stop training once validation accuracy has not improved by at least
# `min_delta` for 10 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_acc',
    patience=10,
    min_delta=0.0005,
)

# Halve the learning rate after 3 epochs without validation-accuracy
# improvement, never going below `min_lr`.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

# Shared by both `fit` calls further down.
callbacks = [learning_rate_reduction, early_stopping]

In [4]:
# Load the IMDB reviews as lists of integer word ids, restricted to the
# `max_features` most frequent words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features, index_from=INDEX_FROM)

# Word -> id mapping, shifted by INDEX_FROM so it lines up with the ids that
# load_data() produced. Ids 0-2 are reserved; each reserved id gets its own
# distinct key so a later assignment cannot overwrite an earlier one.
word_to_id = {word: rank + INDEX_FROM for word, rank in imdb.get_word_index().items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["UNK"] = 2

id_to_word = {word_id: word for word, word_id in word_to_id.items()}

# Decode every review back to words, dropping the reserved ids (0, 1, 2).
# These token lists feed the word2vec training and the index encoding below.
x_train_words = [[id_to_word[word_id] for word_id in review if word_id > 2] for review in x_train]
print(x_train_words[0])
x_test_words = [[id_to_word[word_id] for word_id in review if word_id > 2] for review in x_test]

# One-hot encode the labels and bring every review to a fixed length of `maxlen` ids.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

# NOTE(review): `model` must still be the gensim object returned by api.load() here;
# the Keras cells below re-bind the same name.
wv = model.wv
wv.most_similar(positive='brilliant')

['this', 'film', 'was', 'just', 'brilliant', 'casting', 'story', 'direction', 'really', 'the', 'part', 'they', 'played', 'and', 'you', 'could', 'just', 'imagine', 'being', 'there', 'robert', 'is', 'an', 'amazing', 'actor', 'and', 'now', 'the', 'same', 'being', 'director', 'father', 'came', 'from', 'the', 'same', 'as', 'myself', 'so', 'i', 'loved', 'the', 'fact', 'there', 'was', 'a', 'real', 'with', 'this', 'film', 'the', 'throughout', 'the', 'film', 'were', 'great', 'it', 'was', 'just', 'brilliant', 'so', 'much', 'that', 'i', 'the', 'film', 'as', 'soon', 'as', 'it', 'was', 'released', 'for', 'and', 'would', 'recommend', 'it', 'to', 'everyone', 'to', 'watch', 'and', 'the', 'was', 'amazing', 'really', 'at', 'the', 'end', 'it', 'was', 'so', 'sad', 'and', 'you', 'know', 'what', 'they', 'say', 'if', 'you', 'at', 'a', 'film', 'it', 'must', 'have', 'been', 'good', 'and', 'this', 'definitely', 'was', 'also', 'to', 'the', 'two', 'little', 'that', 'played', 'the', 'of', 'and', 'paul', 'they', 'w



[('superb', 0.7657862901687622),
 ('marvelous', 0.7389472723007202),
 ('splendid', 0.7077070474624634),
 ('terrific', 0.6837816834449768),
 ('masterful', 0.6830281615257263),
 ('magnificent', 0.6709308624267578),
 ('dazzling', 0.6706756353378296),
 ('brilliantly', 0.6550824046134949),
 ('brilliance', 0.6550251245498657),
 ('scintillating', 0.6493905782699585)]

In [5]:
# Train a word2vec model on the decoded training reviews only, using one
# worker per CPU reported by os.cpu_count().
# NOTE(review): `size=` / `iter=` look like the gensim 3.x keyword names - confirm the installed version.
w2v = Word2Vec(
    x_train_words,
    size=embedding_size,
    window=15,
    min_count=5,
    workers=cpu_count(),
    iter=100,
)

# Keep a handle on the trained vectors and sanity-check them against the
# same query word used for the pretrained vectors.
wv2 = w2v.wv
wv2.most_similar(positive='brilliant')

[('superb', 0.5741499662399292),
 ('fantastic', 0.5574560165405273),
 ('wonderful', 0.5456534028053284),
 ('excellent', 0.51289963722229),
 ('amazing', 0.48810333013534546),
 ('great', 0.4362568259239197),
 ('perfect', 0.399600088596344),
 ('fine', 0.3871333599090576),
 ('hilarious', 0.3818921446800232),
 ('perfectly', 0.3642035126686096)]

In [16]:
def _encode_sentences(sentences, vocab, offset=1, unknown_index=0):
    """Map tokenised sentences to integer index sequences.

    Args:
        sentences: iterable of token lists.
        vocab: mapping ``word -> entry`` where ``entry.index`` is the word's
            position in the embedding vocabulary (here ``wv2.vocab``).
        offset: added to every known word's index so that ``unknown_index``
            stays reserved.
        unknown_index: index emitted for words missing from ``vocab``.

    Returns:
        A list with one list of ints per input sentence (empty sentences give
        empty lists).
    """
    return [
        [vocab[word].index + offset if word in vocab else unknown_index for word in sentence]
        for sentence in sentences
    ]


# Encode against the *local* word2vec vocabulary (wv2): the embedding matrix
# below is sized and ordered by wv2, so the indices must come from the same
# vocabulary. Row 0 is reserved for padding / unknown words, hence the +1 shift.
x_train_idx = _encode_sentences(x_train_words, wv2.vocab)
x_test_idx = _encode_sentences(x_test_words, wv2.vocab)

x_train_idx = pad_sequences(x_train_idx, maxlen=maxlen)
x_test_idx = pad_sequences(x_test_idx, maxlen=maxlen)

# create embedding vector
# Row i + 1 holds the pretrained (Google News) vector of the i-th word in the
# local vocabulary; words the pretrained model does not know, and the
# reserved row 0, stay all-zero.
embedding_matrix = np.zeros((len(wv2.vocab) + 1, embedding_size))
for i, word in enumerate(wv2.index2word):
    if word in wv:
        embedding_matrix[i + 1] = wv[word]

In [12]:
# CNN text classifier on top of the frozen, pretrained embedding matrix.
# NOTE(review): this re-binds `model`, which earlier held the gensim vectors.
clear_session()
model = Sequential()
model.add(Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix], input_length=maxlen, trainable=False))
model.add(Dropout(0.2))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(hidden_dims))
model.add(Dropout(0.3))
model.add(Activation('relu'))
# The labels are one-hot encoded over two classes, so the head is a 2-way
# softmax trained with categorical cross-entropy. Two independent sigmoids
# with binary cross-entropy would make Keras score accuracy per output unit,
# which overstates the per-review accuracy.
model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data, so early stopping and
# the learning-rate schedule are tuned on the same data that is reported.
history_w2v = model.fit(x_train_idx, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(x_test_idx, y_test), callbacks=callbacks)

Train on 25000 samples, validate on 25000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 

In [13]:
# Baseline CNN text classifier whose embedding layer is learned from scratch,
# with norm constraints on the convolution and hidden dense kernels.
# NOTE(review): this re-binds `model`, which earlier held the gensim vectors.
clear_session()
model = Sequential()
model.add(Embedding(max_features,
                    embedding_size,
                    input_length=maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 kernel_constraint=max_norm(3)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(hidden_dims, kernel_constraint=unit_norm()))
model.add(Dropout(0.3))
model.add(Activation('relu'))
# The labels are one-hot encoded over two classes, so the head is a 2-way
# softmax trained with categorical cross-entropy. Two independent sigmoids
# with binary cross-entropy would make Keras score accuracy per output unit,
# which overstates the per-review accuracy.
model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data, so early stopping and
# the learning-rate schedule are tuned on the same data that is reported.
history_norm = model.fit(x_train, y_train, batch_size=batch_size,
                         epochs=epochs, validation_data=(x_test, y_test), callbacks=callbacks)

Train on 25000 samples, validate on 25000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
 3456/25000 [===>..........................] - ETA: 8s - loss: 0.2003 - acc: 0.9185

KeyboardInterrupt: 

In [None]:
def _plot_metric(logs, metric, title, ylabel):
    """Show one figure with the training curve of `metric` and, when logged, its validation curve."""
    plt.plot(logs[metric])
    legend = ['train']
    val_key = 'val_' + metric
    if val_key in logs:  # absent when fit() was called without validation data
        plt.plot(logs[val_key])
        legend.append('test')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(legend, loc='upper left')
    plt.show()


def plot_history(history):
    """Plot accuracy and loss per epoch for a finished training run.

    Args:
        history: object with a ``history`` dict mapping metric names to
            per-epoch value lists (what ``model.fit`` returns). The accuracy
            series may be logged as either ``'acc'`` or ``'accuracy'``;
            ``'loss'`` is required. Validation series (``'val_*'``) are drawn
            when present.

    Raises:
        KeyError: if neither accuracy key, or no ``'loss'`` key, is present.
    """
    logs = history.history
    # The accuracy key is spelled differently across Keras versions.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    # summarize history for accuracy
    _plot_metric(logs, acc_key, 'model accuracy', 'accuracy')
    # summarize history for loss
    _plot_metric(logs, 'loss', 'model loss', 'loss')

In [None]:
# Accuracy / loss curves for the CNN whose embedding layer is learned from scratch.
# NOTE(review): `history_norm` is only bound if the corresponding `model.fit` call ran to completion.
plot_history(history_norm)

In [None]:
# Accuracy / loss curves for the CNN that uses the frozen pretrained word2vec embedding.
# NOTE(review): `history_w2v` is only bound if the corresponding `model.fit` call ran to completion.
plot_history(history_w2v)