# Sentiment Classification


## Loading the dataset

In [64]:
from keras.datasets import imdb

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.

In [65]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 300  #number of word used from each review

## Train test split

In [66]:
#load dataset as a list of ints
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [67]:
print(x_train.shape)

(25000, 300)


In [68]:
import keras
word_index =keras.datasets.imdb.get_word_index()
word_index

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [69]:
import tensorflow as tf
from keras.models import Sequential, load_model #importing sequential model 
from keras.layers import Flatten,Dropout
from keras.layers import Dense
from keras.layers.embeddings import Embedding

In [70]:
model = Sequential()
model.add(Embedding(vocab_size,32,input_length=300))
model.add(Flatten())
model.add(Dense(250,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [71]:
#Compiling the model
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
print(model.summary()) #looking at the structure of model

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 300, 32)           320000    
_________________________________________________________________
flatten_7 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 250)               2400250   
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 251       
Total params: 2,720,501
Trainable params: 2,720,501
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
model.fit(x_train,y_train,epochs=5,batch_size=100, validation_split=0.1, verbose=1)
#Training accuracy is 100%, but validation accuracy around 87%, which means model is overfit

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1f6dbd05320>

In [73]:
#Fitting the model again by adding dropout in the model
model = Sequential()
model.add(Embedding(vocab_size,32,input_length=300))
model.add(Dropout(0.2)) # adding a dropout layer with 20% as 10% dropout didn't help, model was still overfit
model.add(Flatten())
model.add(Dense(250,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [74]:
#Compiling the model again after adding dropout layer
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
print(model.summary()) #looking at the structure of model

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 300, 32)           320000    
_________________________________________________________________
dropout_7 (Dropout)          (None, 300, 32)           0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 250)               2400250   
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 251       
Total params: 2,720,501
Trainable params: 2,720,501
Non-trainable params: 0
_________________________________________________________________
None


In [77]:
# Again training the model for 20 epochs, model still seems to overfit. 
# We can try again by increasing the sample size in above codes, maybe to 20000 from current 10000
model.fit(x_train,y_train,epochs=20,batch_size=64, validation_split=0.1, verbose=1)

Train on 22500 samples, validate on 2500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1f6d2263828>

## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [24]:
from keras import backend as bk
model_inp = model.input #Input placeholder
model_out = [layer.output for layer in model.layers]  #taking all output layer in the model
outputs = bk.function([model_inp,bk.learning_phase()],model_out)

In [39]:
layer_outs = outputs([x_test,1])
print(layer_outs)

[array([[[-0.01331859,  0.00073751, -0.0034146 , ...,  0.00848386,
         -0.0013593 ,  0.00617379],
        [-0.01331859,  0.00073751, -0.0034146 , ...,  0.00848386,
         -0.0013593 ,  0.00617379],
        [-0.01331859,  0.00073751, -0.0034146 , ...,  0.00848386,
         -0.0013593 ,  0.00617379],
        ...,
        [-0.00639717,  0.02148611, -0.04017145, ...,  0.05450618,
          0.03047337, -0.02728066],
        [-0.00475401,  0.01818202,  0.0246619 , ...,  0.02776259,
         -0.01032664,  0.00678895],
        [-0.00924181, -0.13423425,  0.07351878, ...,  0.04344928,
         -0.05125131,  0.12178137]],

       [[-0.01331859,  0.00073751, -0.0034146 , ...,  0.00848386,
         -0.0013593 ,  0.00617379],
        [-0.01331859,  0.00073751, -0.0034146 , ...,  0.00848386,
         -0.0013593 ,  0.00617379],
        [-0.01331859,  0.00073751, -0.0034146 , ...,  0.00848386,
         -0.0013593 ,  0.00617379],
        ...,
        [-0.00460463, -0.01876218, -0.03197296, ..., 

In [25]:
len(model_out)

4

In [41]:
# from platform import python_version

# print(python_version())
!pip install numpy==1.16.2 #in order to fix the problem I was getting from print(layer_out)
import numpy as np
print(np.__version__) 

Collecting numpy==1.16.4
  Downloading https://files.pythonhosted.org/packages/ce/61/be72eee50f042db3acf0b1fb86650ad36d6c0d9be9fc29f8505d3b9d6baa/numpy-1.16.4-cp37-cp37m-win_amd64.whl (11.9MB)
Installing collected packages: numpy
  Found existing installation: numpy 1.16.2
    Uninstalling numpy-1.16.2:
      Successfully uninstalled numpy-1.16.2
1.16.2


ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'd:\\sumant\\new folder\\anaconda\\lib\\site-packages\\~-mpy\\.libs\\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.



In [None]:
#Model could be further improved by increasing the data size from current 10000 as it didn't improve even fter:
# increasing the number of epoch, changing optimizer or activation function
#Slight improvement in accuracy after adding dropout of 10% and 20%