# Analysing dataset from IMDB with Keras and Textblob

<h2><center>Sentiment Analysis</center></h2>


In [13]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.datasets import imdb
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing.text import Tokenizer

In [14]:
## Declaring constant
num_words = 25000 # number for the number of word loaded from dataset
num_class = 2  # number of output class for the cotegorical data
num_dense_layer = 10

## Loading the dataset

In [15]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = 10)

## Exploring the dataset

##### Looking into the number of elements in the dataset give the verified number expected above.

In [16]:
print("x_train: ", np.prod(x_train.shape))
print("x_test:  ",  np.prod(x_train.shape))
print("y_test:  ",  np.prod(x_train.shape))
print("y_train: ", np.prod(x_train.shape))

x_train:  25000
x_test:   25000
y_test:   25000
y_train:  25000


<h5><center>Inspecting the input and output data type and format</center></h5>

Printing a sample of the dataset shows that it has been process. The words has been process to numbers. For instance, a word is processed to be positive then it is considered good review.  1 - Positive review rating. 0 - Negative review rating. 

In [17]:
# Print the first 5 elements in the x train list
print(x_train[5])

[1, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 8, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [18]:
# Print the first 5 elements from the x test list
print(x_test[10])

[1, 2, 2, 2, 2, 2, 6, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 9, 2, 2, 5, 2, 2, 2, 2, 2, 4, 2, 2, 6, 2, 8, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 4, 2, 2, 2, 4, 2, 2, 5, 2, 2, 2, 2, 2, 2, 4, 2, 7, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 4, 2, 2, 2, 2, 2, 2, 2, 2, 5, 9, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 4, 2, 2, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 7, 2, 2, 8, 2, 2, 7, 2, 2, 2, 2, 2, 7, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 5, 2, 2, 2, 4, 2, 2, 2, 2, 7, 2, 5, 2, 2, 4, 2, 2, 2, 2, 6, 2, 2, 7, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 9, 2, 8, 2]


<h5><center>Vectorizing the input dataset</center></h5>
Using Keras Tokenizer, let's vectorize the training and testing input

In [19]:
token = Tokenizer(num_words)
x_train = token.sequences_to_matrix(x_train, mode='binary')
x_test = token.sequences_to_matrix(x_test, mode='binary')

In [20]:
# Print the first 5 elements again from the x test list
print(x_test[10])

[0. 1. 1. ... 0. 0. 0.]


<h5><center>Vectorizing the output dataset</center></h5>
For the output, Keras categorical one hot encoding will be used. This build in class in Keras allows us to convert the ouput dataset in integer vectors into binary matrix

In [21]:
y_train = keras.utils.to_categorical(y_train, num_classes= num_class)
y_test = keras.utils.to_categorical(y_test, num_classes= num_class)

## Creating Keras model architecture

<h5><center> Adding the input output dropout layers</center></h5>
We will use a couple dense layer for testing purposes. Using one dense layers will still give a good accuracy. The dropout layer is given a probably of 0.5 and 0.4 to help prevent overfitting

In [22]:
model = Sequential()
#model.Add(Dense(num_dense_layer, Activation="relu", input_dim=num_words))
model.add(Dense(num_dense_layer, activation='relu', input_dim=num_words))
model.add(Dropout(0.5))
model.add(Dense(num_dense_layer, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_class, activation='softmax'))
model.summary();

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 600)               15000600  
_________________________________________________________________
dropout_3 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 600)               360600    
_________________________________________________________________
dropout_4 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 1202      
Total params: 15,362,402
Trainable params: 15,362,402
Non-trainable params: 0
_________________________________________________________________


## Compiling the model

In [23]:
model.compile(loss= 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Training the Model

In [24]:
train_model = model.fit(x_train, y_train, batch_size=50, epochs=20, verbose=0)

KeyboardInterrupt: 

## Evaluating the Model accuracy

In [None]:
 performance = model.evaluate(x_test, y_test)

## Making a prediction

In [None]:
y_predict = model.predict_classes(x_test)
for i in range(len(Xnew)):
    print("X=%s, Predicted=%s" % (x_test[i], y_predict[i]))