# Sentiment Classification


## Loading the dataset

In [69]:
import numpy as np

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import backend as K

seed=999
vocab_size = 10000
maxlen = 500  #number of word to be used from each review

In [70]:
from keras.datasets import imdb

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size,seed=seed) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.

In [71]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

print(X.shape)
print(y.shape)

randomNumber = np.random.randint(0,len(X)-1)

(50000,)
(50000,)


In [72]:
max_size=0
for i in range(0,len(X)-1):
    size= len(X[i])
    if(size>=max_size):
        max_size=size
print(max_size)

2494


In [73]:
count=0
for i in range(0,len(X)-1):
    size= len(X[i])
    if(size>=500):
        count+=1
print(count)

4053


# The highest count of words in a review is 2494. Truncating such a large comment is a loss of information.

# However reviews with words greater than 500 is less than 10 percent of the total reviews. So we can set this number to truncate the words in the review

In [74]:
print(np.unique(y)) #Binary classification for bad and good comments respectively

[0 1]


In [75]:
print(len(np.unique(np.hstack(X)))) #After limitting words with vocabulary size, number of words are just close to 10000

9998


In [76]:
# Get the actual review
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()]) 
decoded = " ".join( [reverse_index.get(i, "#") for i in X[randomNumber]] )
print(decoded)

the and gas to about whole he romantic true cast movie about and origins help spots be couple drawing brilliance actors work would it this about commits who's example length they about help but were dressing and about performed it and no about killer it escape unhinged in is and but and about pulp choreography charge and in and some they about help to sun glasses this described of worse couple complex recreate movie much movie of we're br of performances captures like short in person this about personality everyone i i of their it 1994 in is anne if horrific cops to which special make fi something more interesting think uses of before embarrassing of indeed he sound and adaptations it monster surrounding movie of somewhere killer to protect it happened depth movie of terrifying chapter killer br of performances captures not faye and movie and it movies walter in tears of on of period it time honestly to and big world man's that better would there about understand to no would monsters a

In [77]:
from keras.preprocessing.sequence import pad_sequences

## Train test split

In [78]:
#make all sequences of the same length
X_train = pad_sequences(X_train, maxlen=maxlen,padding='post')
X_test =  pad_sequences(X_test, maxlen=maxlen,padding='post')

In [79]:
X_train[1]

array([   1,    4,    2,    7,    2, 5834,   11,    4, 8064,   70,    2,
          6,    2,   11,    4, 6024, 1793,    8,   14, 2604,  820, 2472,
          8,    4, 4117, 2604,  146,   24,  252,  618,   89,  175,  206,
         57,  551,   89,  392,   42, 8966,   80,  380,    6, 3629, 2097,
         15,   70,  485,    8,  194,  687,   14,  392, 6823,    7,    6,
         22,  287,  178,    6,  201,    7, 1573, 5684,  105,   91,    7,
         98,   11, 1450,  625, 1729,   80, 4525,  257,    2,  456,    4,
       1446, 7079,    7,    2, 2604,   70,   30, 7452,   17,   73,  257,
         65,    9,    6,    2,    7,    4, 6823,   15,    9,   14,   22,
          4,  116,    9, 8625, 6858,    5, 7468, 5860,    2,    9,    2,
          4,  537,    2,    2,    4,  486,    9,   43,  208,    5,    4,
        529,  889,    4, 1716,    2,   11,    2,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [80]:
# Only the top 10000 words are loaded, with vocab_size=10000
# Also truncating words in each review to 300, truncating longer reviews and zero padding shorter reviews with pad_sequences
# And we are creating a dimensions of 32, 50 or 100 for each word, and compare the performance of the same

#n_dim=[32,50,100] - A 50 dimension embedding provides better result in our model, as we looped through these and compared the accuracy
n_dim=50

In [81]:
#for dim in n_dim:   

# create the model

model = Sequential()
    #model.add(Embedding(vocab_size, dim, input_length=maxlen))
model.add(Embedding(vocab_size, n_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 50)           500000    
_________________________________________________________________
flatten_7 (Flatten)          (None, 25000)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 200)               5000200   
_________________________________________________________________
dropout_10 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 101       
Total params: 5,520,401
Trainable params: 5,520,401
Non-trainable params: 0
____________________________________________

In [82]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)
    # Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 13s - loss: 0.4963 - accuracy: 0.7261 - val_loss: 0.3276 - val_accuracy: 0.8582
Epoch 2/2
 - 12s - loss: 0.1479 - accuracy: 0.9454 - val_loss: 0.3557 - val_accuracy: 0.8610
Accuracy: 86.10%


In [83]:
# A 50 dimensional vector size for each word is providing a better accuracy than a 50D or 300D model

In [84]:
# Even training in two epochs, there is a overfitting issue.Training with 5 epochs make the training accuracy close to 100 percent
# Hence,We will limit the epochs to 2

In [85]:
i=0
predicted_labels=model.predict(X_test)
while(i<10):
    test_randomNumber = np.random.randint(0,y_test.shape[0]-1)
    print("Viewing the actual and predicted output for the review ID: {}".format(test_randomNumber))
    print("Actual Output: {}".format(y_test[test_randomNumber]))
    print("Predicted Output: {}".format(predicted_labels[test_randomNumber]))
    
    if(y_test[test_randomNumber]==0) and (predicted_labels[test_randomNumber]<0.5):
        print("Review has been predicted as expected")
    elif(y_test[test_randomNumber]==0) and (predicted_labels[test_randomNumber]>=0.5):
        print("Wrong prediction of review")
    elif(y_test[test_randomNumber]==1) and (predicted_labels[test_randomNumber]<0.5):
        print("Wrong prediction of review")
    elif(y_test[test_randomNumber]==1) and (predicted_labels[test_randomNumber]>=0.5):
        print("Review has been predicted as expected")
    else:
        print("Check the Predicted and actual value")
    print("\n")
    i+=1

Viewing the actual and predicted output for the review ID: 2017
Actual Output: 0
Predicted Output: [0.00551283]
Review has been predicted as expected


Viewing the actual and predicted output for the review ID: 3142
Actual Output: 1
Predicted Output: [0.8275218]
Review has been predicted as expected


Viewing the actual and predicted output for the review ID: 3031
Actual Output: 1
Predicted Output: [0.9973098]
Review has been predicted as expected


Viewing the actual and predicted output for the review ID: 1178
Actual Output: 0
Predicted Output: [0.7256348]
Wrong prediction of review


Viewing the actual and predicted output for the review ID: 13060
Actual Output: 0
Predicted Output: [0.00030848]
Review has been predicted as expected


Viewing the actual and predicted output for the review ID: 19459
Actual Output: 1
Predicted Output: [0.9786644]
Review has been predicted as expected


Viewing the actual and predicted output for the review ID: 8139
Actual Output: 1
Predicted Output: [0

In [86]:
# As we can see, the predicted outputs are close to 1, and viceversa for the actual outputs to be 1 and 0 respectively
# However some results vary, as we yet have only 86 percent accuracy


In [87]:
print(len(model.layers))

6


In [96]:
for layer in range(1,len(model.layers)):
    get_layer_output = K.function([model.layers[0].input],
                                  [model.layers[layer].output])
    layer_output = get_layer_output([X_test])[0]
    #print("\nOutput of layer - {} is {}".format(layer,layer_output))
    print("\n")
    print(layer_output[test_randomNumber])
    #print(layer_output.size)



[ 4.2039114e-03  8.1935379e-04 -6.6072680e-05 ...  5.8760040e-04
 -1.1084182e-03  4.0380177e-03]


[ 2.73373991e-01 -0.00000000e+00  4.32829946e-01 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  4.03797597e-01 -0.00000000e+00  1.18092686e-01 -0.00000000e+00
  3.23214568e-02 -0.00000000e+00  6.74725100e-02 -0.00000000e+00
  1.29054219e-01  2.94829458e-01  3.74431312e-01  2.10190639e-01
 -0.00000000e+00  2.22499892e-02  3.00118178e-02 -0.00000000e+00
  1.28013834e-01  2.02649802e-01  5.88609397e-01  8.00798386e-02
 -0.00000000e+00  5.12947440e-01 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00  1.64791584e-01
 -0.00000000e+00  2.64106303e-01  2.74552733e-01  2.55601287e-01
  2.70067722e-01 -0.00000000e+00  3.97487521e-01 -0.00000000e+00
  3.12402636e-01 -0.00000000e+00 -0.00000000e+00  2.01763660e-01
  1.28781959e-01  1.66037038e-01  1.39781684e-01 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  1.00367308e-01 -0.00

In [91]:
layer_output[test_randomNumber]

array([0.02262691], dtype=float32)

In [92]:
model.predict([[X_test[test_randomNumber]]])

array([[0.02262694]], dtype=float32)

In [93]:
y_test[test_randomNumber]

0

In [None]:
# Thus we have built a NLP binary classification based on the reviews present for the IMDB movies