In [42]:
'''
    Importing requisite modules
'''
from keras.preprocessing.text import Tokenizer
from keras.datasets import imdb
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Embedding

In [15]:
# Tokenizing a couple of sample sentences.
# num_words=1000 caps how many of the most frequent words the tokenizer keeps.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]
tokenizer = Tokenizer(num_words=1000)

# Fit the tokenizer on the samples, then turn each sentence into a sequence of
# integer word indices. Nothing has been one-hot encoded at this point.
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)

# One-hot encode the index sequences (binary mode).
one_hot_result = tokenizer.sequences_to_matrix(sequences, mode='binary')

# Lookup of which word corresponds to which index.
token_indices = tokenizer.word_index

In [35]:
# Word embeddings do a better job than one-hot encoding when the vocabulary is
# large: they vectorize text far more compactly, without blowing up the feature
# space. One-hot encoding gives sparse representations; word embeddings give
# dense ones. Here we learn an embedding with a Keras Embedding layer.
# Note: different tasks may need different embeddings, because each task can
# have its own semantic relationships (pg. 185, Francois Chollet).
max_features = 10000  # restrict to the 10,000 most common words; bounds the feature count
max_len = 20          # number of word indices kept per review

# load_data yields a list of lists: one list of integers per movie review, as
# long as the review itself. Each integer stands for a word, limited to the
# max_features most common words chosen above.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Bring every review to a uniform length of max_len entries, producing 2D
# tensors of shape (samples, max_len) for both the training and test splits.
x_train, x_test = (
    preprocessing.sequence.pad_sequences(reviews, maxlen=max_len)
    for reviews in (x_train, x_test)
)


In [46]:
model = Sequential()

# Embedding layer arguments:
#   * vocabulary size -> max_features, the same cap passed to imdb.load_data,
#     so the layer always has a row for every word index the data can contain
#     (previously a hard-coded 10000 that could drift out of sync);
#   * embedding dimension -> 8;
#   * input_length -> max_len, the padded length of each review (20).
model.add(Embedding(max_features, 8, input_length=max_len))  # output: (samples, max_len, 8)

model.add(Flatten())  # flattens the 3D tensor above to 2D: (samples, max_len * 8)

model.add(Dense(1, activation='sigmoid'))  # sigmoid because this is binary classification

model.compile(optimizer='rmsprop', metrics=['acc'], loss='binary_crossentropy')

# Train the model, holding out 20% of the training data for validation.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Report the best validation accuracy reached across the epochs.
result = history.history
print(max(result['val_acc']))

# We get about 75% accuracy on the validation set. To better capture
# inter-word and sentence-level relationships, a 1D convnet or an RNN should
# be added after the embedding layer.
# (Kept as a comment rather than a trailing string literal, which the notebook
# would echo as the cell's output value.)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7534000277519226


In [None]:
'''
    I am leaving the implementation of pre-trained word embeddings like word2vec or GloVe for later. Note that such
    techniques are useful when training data is scarce or insufficient.
'''
