# Sentiment Classification


### Loading the dataset (5 points)

In [12]:
from keras.datasets import imdb

# Vocabulary cap: only the 10,000 most frequently used words are kept.
vocab_size = 10_000

In [13]:
from keras.preprocessing.sequence import pad_sequences

## Train test split ( 5 points)

In [14]:
import numpy as np

# Workaround for older Keras releases whose imdb.load_data() calls np.load
# without allow_pickle=True (NumPy >= 1.16.3 then refuses to load the file).
# NOTE(security): allow_pickle=True can execute arbitrary code when loading an
# untrusted file; it is only acceptable here because the archive comes from
# the official Keras dataset mirror.

# Keep a handle on the real np.load so it can be restored afterwards.
np_load_old = np.load

# Temporarily force allow_pickle=True. The keyword is merged into **k rather
# than passed separately so a caller that already supplies allow_pickle does
# not trigger a "multiple values for keyword argument" TypeError.
np.load = lambda *a, **k: np_load_old(*a, **{**k, 'allow_pickle': True})

try:
    # Load the dataset as lists of ints.
    # vocab_size is the number of words to keep, ordered by frequency.
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)
finally:
    # Always restore np.load, even if the download/load fails; otherwise a
    # re-run of this cell would capture the patched lambda as "old".
    np.load = np_load_old

In [15]:
# Fixed review length: every review is represented by exactly this many tokens.
maxlen = 300

# Bring both splits to the common length `maxlen` with pad_sequences
# (shorter reviews get zeros added in front, as the printed samples show).
X_train, X_test = (
    pad_sequences(split, maxlen=maxlen) for split in (X_train, X_test)
)

In [16]:
print(X_train[8],y_train[8])

# Each row of X_train is one review encoded as a sequence of integer word ids,
# where ids are assigned by word frequency over the whole dataset (smaller id
# = more frequent word). The leading zeros are the padding added by
# pad_sequences.
# NOTE(review): with the default imdb.load_data arguments, 1 marks the start of
# a review, 2 replaces out-of-vocabulary words and real word ranks are offset
# by 3 -- confirm against the Keras documentation.

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    1   43  188
   46    5  566  264   51    6  530  664   14    9 1713   81   25 1135
   46    7    6   20  750   11  141 4299    5    2 4441  102   28  413
   38  120 5533   15    4 3974    7 5369  142  371  318    5  955 1713
  571    2    2  122   14    8   72   54   12   86  385   46    5   14
   20    9  399    8   72  150   13  161  124    6  155   44   14  159
  170   83   12    5   51    6  866   48   25  842    4 1120   25  238
   79    4  547   15   14    9   31    7  148    2  102   44   35  480
 3823 2380   19  120    4  350  228    5  269    8   28  178 1314 2347
    7   51    6   87   65   12    9  979   21   95   24 3186  178   11
    2 

In [17]:
# Show the padded token-id sequence and the label of training sample 558.
print(X_train[558],y_train[558])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    1
  207  332 3356    7    2   44   14  500  112   38  729    8 2431 9256
   54   13  256   12    4   86   58   13  197   12   16    6  227  916
    2   40  500   19    6 1762    8 1677  125    4 2431 9256  405   24
   43 3985   58   38   54   13  256   12   18    6  378    7  634   13
 1706 

## Build Keras Embedding Layer Model (30 points)
We can think of the Embedding layer as a dictionary that maps the integer index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* We can also load pre-trained word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The Keras embedding layer doesn't require us to one-hot encode our words; instead, we have to give each word a unique integer id. For the IMDB dataset we've loaded, this has already been done, but if this weren't the case we could use sklearn's [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [18]:
from keras.models import Model, Sequential
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D

In [19]:
def create_seq_model(vocab_size=10000, maxlen=300, embedding_dim=256):
  """Build and compile the bidirectional-LSTM sentiment classifier.

  Architecture: Embedding -> Bidirectional(LSTM(32)) -> GlobalMaxPool1D
  -> Dense(20, relu) -> Dropout(0.05) -> Dense(1, sigmoid).

  Args:
    vocab_size: Size of the embedding table (Embedding `input_dim`). It must
      be larger than the biggest word id present in the input sequences, so
      it should match the `num_words` used when loading the dataset.
    maxlen: Number of tokens in every (padded) input sequence.
    embedding_dim: Length of the vector learned for each word id.

  Returns:
    A compiled `Sequential` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric) that outputs one sigmoid probability per review.
  """
  model = Sequential()
  # The embedding is a plain lookup table indexed by word id (no hashing is
  # involved): row i holds the trainable vector for word id i.
  model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
  # return_sequences=True keeps the per-timestep outputs so the pooling layer
  # below can take the maximum over the time axis.
  model.add(Bidirectional(LSTM(32, return_sequences = True)))
  model.add(GlobalMaxPool1D())
  model.add(Dense(20, activation="relu"))
  model.add(Dropout(0.05))
  # Single sigmoid unit: probability of the positive class.
  model.add(Dense(1, activation="sigmoid"))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

## Accuracy of the model & retrieve the output of each layer in Keras for a given single test sample from the trained model you built (10 points)

In [20]:
seq_nlp_model = create_seq_model()

# Summarize the model. summary() prints the layer table itself and returns
# None, so wrapping it in print() would emit a stray "None" after the table.
seq_nlp_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 256)          2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 300, 64)           73984     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 21        
Total params: 2,635,305
Trainable params: 2,635,305
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training hyperparameters.
# Alternative (faster) setting that was tried: batch_size = 100, epochs = 3.
batch_size = 5
epochs = 2

# Fit on the TRAINING split only; the last 20% of it is held out for
# validation. The test split must stay unseen here, otherwise the metrics
# computed on X_test afterwards are inflated by data leakage.
seq_nlp_model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/2

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

In [None]:
# predict() returns the sigmoid output of the last layer, i.e. one
# probability per review with shape (n_samples, 1).
y_pred = seq_nlp_model.predict(X_test)

# scikit-learn classification metrics need hard 0/1 labels, not continuous
# probabilities, so threshold at 0.5 and flatten to a 1-D vector.
y_pred_labels = (y_pred > 0.5).astype('int32').ravel()

# scikit-learn's signature is (y_true, y_pred); with the arguments swapped the
# confusion matrix would come out transposed.
print('F1-score: {0}'.format(f1_score(y_test, y_pred_labels)))
print('Confusion matrix:')
confusion_matrix(y_test, y_pred_labels)