# Sentiment Classification


## Loading the dataset

In [0]:
import numpy as np
from keras.datasets import imdb

np_load_old = np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.
np.load = np_load_old

In [0]:
x_train.shape


(25000,)

In [0]:
x_test.shape


(25000,)

In [0]:
# Creating word index:
word_index = imdb.get_word_index()
word_and_word_index = dict(
[(value, key) for (key, value) in word_index.items()])

decoded_review = ' '.join(
[word_and_word_index.get(i - 3, '?') for i in x_train[10]])
print('---Words---')
print(decoded_review)
print('---Label---')
print(y_train[2])

---Words---
? french horror cinema has seen something of a revival over the last couple of years with great films such as inside and ? romance ? on to the scene ? ? the revival just slightly but stands head and shoulders over most modern horror titles and is surely one of the best french horror films ever made ? was obviously shot on a low budget but this is made up for in far more ways than one by the originality of the film and this in turn is ? by the excellent writing and acting that ensure the film is a winner the plot focuses on two main ideas prison and black magic the central character is a man named ? sent to prison for fraud he is put in a cell with three others the quietly insane ? body building ? marcus and his retarded boyfriend daisy after a short while in the cell together they stumble upon a hiding place in the wall that contains an old ? after ? part of it they soon realise its magical powers and realise they may be able to use it to break through the prison walls br b

In [0]:
#Finding the lenght of the word index
len(word_and_word_index)

88584

In [0]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 300  #number of word used from each review

## Train test split

In [0]:
#load dataset as a list of ints
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [0]:

print(x_train)

[[   0    0    0 ...   19  178   32]
 [   0    0    0 ...   16  145   95]
 [   0    0    0 ...    7  129  113]
 ...
 [   0    0    0 ...    4 3586    2]
 [   0    0    0 ...   12    9   23]
 [   0    0    0 ...  204  131    9]]


In [0]:
print(x_test)

[[   0    0    0 ...   14    6  717]
 [   0    0    0 ...  125    4 3077]
 [1239 5189  137 ...    9   57  975]
 ...
 [   0    0    0 ...   21  846 5518]
 [   0    0    0 ... 2302    7  470]
 [   0    0    0 ...   34 2005 2643]]


## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [0]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU
embedding_size=32

In [0]:
#Adding the LSTM details:
model=Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))


In [0]:
#Compiling the model:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 32)           320000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
#Running 10 epochs by keeping batch size = 64:
batch_size = 64
num_epochs = 10
x_test, y_test = x_train[:batch_size], y_train[:batch_size]
x_train2, y_train2 = x_train[batch_size:], y_train[batch_size:]
model.fit(x_train2, y_train2, validation_data=(x_test, y_test), batch_size=batch_size, epochs=num_epochs)


Train on 24936 samples, validate on 64 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdbc330c650>

In [0]:
#Calculating the score and accuracy:
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test Score:', score)
print('Test Accuracy:', acc)

('Test Score:', 0.3328913748264313)
('Test Accuracy:', 0.9375)


In [0]:
#Importing word tokenizer:
import string
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim
from keras.initializers import Constant

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
review_lines = list()
lines = word_and_word_index.values()

In [0]:
#Removing special characters:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [0]:
for line in lines:
  tokens = word_tokenize(line)
  #convert to lowercase
  tokens = [w.lower() for w in tokens]
  #remove puchation and numbers from each word
  words = [remove_special_characters(w,remove_digits=True) for w in tokens]
  #filter out stopwords
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]
  review_lines.append(words)

In [0]:
#Calculating length of review lines:
len(review_lines)

88584

In [0]:
#creating and training word2vec model:
model = gensim.models.Word2Vec(sentences = review_lines, size=embedding_size, window=5, min_count=1, workers=4)

In [0]:
words = list(model.wv.vocab)
print("vocabilary size : %d"%len(words))

vocabilary size : 73723


In [0]:
model.wv.most_similar(positive = "terrible")

  if np.issubdtype(vec.dtype, np.int):


[(u'convents', 0.6671054363250732),
 (u'shriekfest', 0.6665854454040527),
 (u'quarrels', 0.6511059999465942),
 (u'titus', 0.629612147808075),
 (u'rouncewell', 0.6137033700942993),
 (u'fool', 0.6046589612960815),
 (u'racy', 0.6045066118240356),
 (u'dandies', 0.6043086647987366),
 (u'wowser', 0.5990558862686157),
 (u'normalos', 0.5986173152923584)]

In [0]:
#Saving the model:
filename = "imdb_embedding_word2vec.txt"
model.wv.save_word2vec_format(filename,binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
#creating embedding matrix:
import os
embedding_index = {}
f = open(os.path.join('','imdb_embedding_word2vec.txt'))
for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:])
  embedding_index[word]=coefs
f.close()

In [0]:
num_words = len(word_and_word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in word_and_word_index.items():
  if i > num_words:
    continue
  embedding_vector = embedding_index.get(word)
  if embedding_vector is not None:
    embedding_matix[i]=embedding_vector

In [0]:
#Bulding model using Pre-trained Embedding
model=Sequential()
embedding_layer = Embedding(num_words, embedding_size, weights=[embedding_matrix], input_length=maxlen, trainable = False)
model.add(embedding_layer)
model.add(GRU(units=32,dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

W0112 16:02:01.241247 140585223972736 deprecation.py:506] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 32)           2834720   
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                6240      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 2,840,993
Trainable params: 6,273
Non-trainable params: 2,834,720
_________________________________________________________________
None


In [0]:
#Compiling the model:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

batch_size = 64
num_epochs = 10
x_test, y_test = x_train[:batch_size], y_train[:batch_size]
x_train2, y_train2 = x_train[batch_size:], y_train[batch_size:]
model.fit(x_train2, y_train2, validation_data=(x_test, y_test), batch_size=batch_size, epochs=num_epochs)

Train on 24936 samples, validate on 64 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdbbedc4fd0>

## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [0]:
from keras import backend as K

inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functors = [K.function([inp, K.learning_phase()], [out]) for out in outputs]    # evaluation functions

#Testing the num words:
test = np.random.random(num_words)[np.newaxis,...]
layer_outs = [func([test, 1.]) for func in functors]
print layer_outs

[[array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)], [array([[ 0.00588785, -0.00646025, -0.00996825, -0.00576604, -0.00424212,
         0.00093046, -0.00146908,  0.00016623,  0.00988423, -0.01017534,
        -0.00857754,  0.00288379, -0.00546875,  0.00428849, -0.00956388,
        -0.0072801 , -0.01553406,  0.02446816,  0.00184386,  0.01164285,
         0.00395406, -0.00380243, -0.00530218,  0.00203009,  0.00273696,
         0.00494729, -0.00436864,  0.00285287,  0.01668944,  0.00939758,
        -0.00077096,  0.01227883]], dtype=float32)], [array([[0.4983442]], dtype=float32)]]
