In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the contents of each Kaggle input dataset directory used below.
for input_dir in ("../input/keras-imdb/",
                  "../input/glove-global-vectors-for-word-representation/"):
    print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

['imdb.npz', 'imdb_word_index.json', 'aclImdb_v1']
['glove.6B.200d.txt', 'glove.6B.100d.txt', 'glove.6B.50d.txt']


In [2]:
# Root of the raw IMDB corpus; the training reviews are read from the
# train/neg and train/pos sub-folders, one review per .txt file.
imdb_dir = '../input/keras-imdb/aclImdb_v1/aclImdb/'
training_dir = os.path.join(imdb_dir, 'train')
print(training_dir)

labels = []
texts = []

# Folder name -> class id: 0 for 'neg', 1 for 'pos'.
for label_type, label_id in (('neg', 0), ('pos', 1)):
    dir_name = os.path.join(training_dir, label_type)
    # os.listdir order is filesystem-dependent; sorting makes the load order
    # (and everything derived from it) reproducible.
    for fname in sorted(os.listdir(dir_name)):
        # Skip non-review files *before* recording a label, so `texts` and
        # `labels` always stay aligned one-to-one.
        if not fname.endswith('.txt'):
            continue
        # Explicit encoding instead of the platform default (the aclImdb
        # reviews are UTF-8 text); `with` closes the handle even if the
        # read raises.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as review_file:
            texts.append(review_file.read())
        labels.append(label_id)

# Every review must have exactly one label.
assert len(labels) == len(texts), "texts and labels are out of sync"

# Lengths of the two lists
len(labels), len(texts)

../input/keras-imdb/aclImdb_v1/aclImdb/train


(25000, 25000)

In [3]:
from keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizer; the model's Embedding layers are
# sized with the same value.
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# One list of integer word indices per review.
sequences = tokenizer.texts_to_sequences(texts)

Using TensorFlow backend.


In [4]:
# Word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index

# Label every lookup with the word that is actually being looked up, so the
# printed text matches the value shown next to it.
for probe_word in ("this", "good", "marketing"):
    print("Index for '{}':".format(probe_word), word_index[probe_word])

# First 20 word indices of the last review.
print("Sequences:", sequences[24999][:20])

Index for : the: 11
Index for : the: 49
Index for : the: 4892
Sequences: [11, 17, 6, 287, 316, 15, 1, 1110, 932, 2, 725, 113, 581, 18, 47, 79, 6, 32, 218, 2664]


In [5]:
from keras.preprocessing.sequence import pad_sequences

# Length (in word indices) every encoded review is padded/truncated to.
maxlen = 100

data = pad_sequences(sequences, maxlen=maxlen)
n_reviews, n_tokens = data.shape
print(n_reviews, n_tokens)

25000 100


In [6]:
labels = np.asarray(labels)

# Reviews and labels are shuffled with one shared permutation below, so they
# must have the same length or pairs would be silently mismatched.
assert data.shape[0] == labels.shape[0], "data and labels must have the same length"

# A dedicated, seeded generator makes the train/validation split reproducible
# across kernel restarts without touching NumPy's global random state.
# NOTE: re-running this cell on its own permutes the already-shuffled arrays
# again; re-run from the padding cell for a clean state.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(labels.shape[0])
data = data[indices]
labels = labels[indices]

[[   5 1342   26 ...  110   67   27]
 [   1 2448  344 ...  219   13  160]
 [ 160   48    6 ...    3 2469 4502]
 ...
 [  28 3629   40 ...   20    1 1108]
 [   8   11  179 ...    6   58 1172]
 [  72   23   61 ...  525  715  229]]
[1 0 0 ... 0 0 0]


In [7]:
# The first 20,000 shuffled rows train the model; the next 5,000 validate it.
training_samples = 20000
validation_sample = 5000

validation_end = training_samples + validation_sample
train_rows = slice(None, training_samples)
validation_rows = slice(training_samples, validation_end)

x_training_set, y_training_set = data[train_rows], labels[train_rows]
x_validation_set, y_validation_set = data[validation_rows], labels[validation_rows]
print(len(x_validation_set))

5000


In [8]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

embedding_dim = 50

# Baseline classifier: an Embedding layer trained from scratch, flattened
# into a 32-unit ReLU layer and a single sigmoid output unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           500000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                160032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 660,065
Trainable params: 660,065
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Binary sentiment target: Adam optimizer, binary cross-entropy loss, and
# accuracy reported under the name 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [10]:
# Train the baseline for 10 epochs in mini-batches of 32, scoring the
# held-out validation rows after every epoch.
validation_pair = (x_validation_set, y_validation_set)
history = model.fit(
    x_training_set,
    y_training_set,
    epochs=10,
    batch_size=32,
    validation_data=validation_pair,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
glove_dir = '../input/glove-global-vectors-for-word-representation/'
print("Word Embedding Vectors")

# word -> embedding vector; filled in by the parsing cell that follows.
embeddings_index = {}

# The GloVe vocabulary contains non-ASCII tokens, so open the file as UTF-8
# explicitly instead of relying on the platform's default encoding.
# The handle is intentionally left open here: the next cell iterates over it
# and closes it.
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8')

Word Embedding Vectors


In [12]:
# Each line of the GloVe file is "<word> <v1> <v2> ...", whitespace separated.
# `with f:` guarantees the handle opened earlier is closed even if a line
# fails to parse (a bare trailing f.close() would be skipped on error).
with f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

In [13]:
# Report how many word vectors were read into embeddings_index.
vector_count = len(embeddings_index)
print("Number of Vectors = ", vector_count)

Number of Vectors =  400000


In [14]:
# np.stack requires a real sequence of arrays. Passing the dict view directly
# only triggers a deprecation warning on older NumPy and raises TypeError on
# newer releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))

# Mean / standard deviation over every component of every vector; used to
# draw random vectors for words that have no GloVe entry.
emb_mean = all_embs.mean()
emb_std = all_embs.std()
emb_mean, emb_std

  """Entry point for launching an IPython kernel.


(0.004451992, 0.4081574)

In [17]:
# Must equal the vector length of the GloVe file loaded above
# (glove.6B.100d.txt). NOTE: this rebinds the `embedding_dim` used by the
# baseline model cell (which set it to 50).
embedding_dim = 100
word_index = tokenizer.word_index

# Word indices start at 1 (0 is the padding value), so a vocabulary of V
# words needs V + 1 rows to make index V addressable; the count is capped at
# max_words, the limit used for the tokenizer and the Embedding layer.
number_words = min(max_words, len(word_index) + 1)

# Rows start as random vectors with the same overall mean/std as GloVe, so
# words without a pretrained vector still get a plausible embedding. A
# dedicated seeded generator keeps the initialisation reproducible.
embedding_rng = np.random.RandomState(42)
embedding_matrix = embedding_rng.normal(emb_mean, emb_std, (number_words, embedding_dim))

10000 100
(10000, 100)


In [18]:
# Never write past the matrix: it can have fewer than max_words rows when the
# corpus vocabulary is small, so bound the loop by both limits.
row_limit = min(max_words, embedding_matrix.shape[0])

for word, i in word_index.items():
    if i >= row_limit:
        # Index beyond the vocabulary cap / matrix size: no row to fill.
        continue
    # Get the pretrained vector for the word, if GloVe has one.
    embedding_vector = embeddings_index.get(word)
    # Words missing from GloVe keep their randomly initialised row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [19]:
# Size the Embedding layer from the weight matrix it is initialised with, so
# the layer shape and the pretrained weights can never disagree.
vocab_rows, vector_size = embedding_matrix.shape

# Same architecture as the baseline, but the embeddings are the pretrained
# matrix and are frozen (trainable=False); only the dense layers are trained.
model = Sequential()
model.add(Embedding(vocab_rows, vector_size, input_length=maxlen,
                    weights=[embedding_matrix], trainable=False))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 320,065
Non-trainable params: 1,000,000
_________________________________________________________________


In [21]:
# Same optimizer, loss and metric as the baseline run.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Note: `history` is rebound here, replacing the baseline model's history.
history = model.fit(
    x_training_set, y_training_set,
    validation_data=(x_validation_set, y_validation_set),
    epochs=10,
    batch_size=32,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
my_text = 'I love dogs. Dogs are the best. They are lovely, cuddly animals that only want the best for humans.'

# Encode the sample with the fitted tokenizer, pad it to the model's input
# length, then score it with the most recently trained model.
raw_seq = tokenizer.texts_to_sequences([my_text])
print('raw seq:', raw_seq)

seq = pad_sequences(raw_seq, maxlen=maxlen)
print('padded seq:', seq)

prediction = model.predict(seq)
print('positivity:', prediction)

raw seq: [[10, 116, 2520, 2520, 23, 1, 115, 33, 23, 1331, 1386, 12, 61, 178, 1, 115, 15, 1706]]
padded seq: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   10  116
  2520 2520   23    1  115   33   23 1331 1386   12   61  178    1  115
    15 1706]]
positivity: [[0.9881958]]


In [27]:
my_text_n = 'This place lacks good food, good ambience, good music, good drinks. Would not recommend this place.'

# Same encode -> pad -> predict steps, applied to a negative-sounding review.
raw_seq_n = tokenizer.texts_to_sequences([my_text_n])
print('raw seq:', raw_seq_n)

seq = pad_sequences(raw_seq_n, maxlen=maxlen)
print('padded seq:', seq)

prediction = model.predict(seq)
print('positivity:', prediction)

raw seq: [[11, 270, 1500, 49, 1642, 49, 49, 225, 49, 6461, 59, 21, 383, 11, 270]]
padded seq: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0   11  270 1500   49 1642   49   49  225   49 6461   59   21  383
    11  270]]
positivity: [[0.9585079]]
