### Import Libraries

In [1]:
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Read Data

In [2]:
# Parse the training TSV into two parallel lists: the third tab-separated
# field of every row goes into `text`, the fourth into `sentiment` (kept as
# the raw string read from the file).
text, sentiment = [], []
# An explicit encoding keeps parsing independent of the platform's default
# locale. NOTE(review): assumes the file is UTF-8 -- confirm for your copy.
with open("./movie review/train.tsv", "r", encoding="utf-8") as f:
    next(f, None)  # skip the first line (presumably the header row)
    for line in f:
        fields = line.strip().split('\t')
        text.append(fields[2])
        sentiment.append(fields[3])

### Load Google News Word2Vec Model

In [3]:
import gensim

# Load pretrained word vectors stored in the binary word2vec format.
# The path string is passed to gensim as-is; point it at the local copy of
# the GoogleNews vectors file before running.
vectoriser = gensim.models.KeyedVectors.load_word2vec_format(
    'path to word2vec/GoogleNews-vectors-negative300.bin',
    binary=True,
)

### Converting Sentences to Words

The variable `text` is a list of sentences; each sentence is split into a list of words, so `x_train` becomes a list of word lists.

In [4]:
# Split every phrase in `text` into its word tokens; one token list per phrase.
x_train = [text_to_word_sequence(phrase) for phrase in text]

Making a vocabulary of all the words available in our dataset

In [5]:
# Vocabulary = every distinct token that appears anywhere in x_train.
# A set removes the duplicates; it is converted back to a list afterwards.
all_word = list({token for sentence in x_train for token in sentence})
print(len(all_word), type(all_word))

15288 <class 'list'>


Comparing the words available in our dataset with the words in our word2vec model and removing all the words which are not available in our word2vec model

In [6]:
# Keep only the vocabulary entries for which `vectoriser` reports membership,
# i.e. words that have a pretrained vector.
vect_vocab = [candidate for candidate in all_word if candidate in vectoriser]
print(len(vect_vocab))

14028


Removing all the words from our training set which are not available in our word2vec model

In [7]:
# Drop every token that has no pretrained vector.
# `vect_vocab` is a list, so `token in vect_vocab` would scan it linearly for
# every token of every sentence; a set makes each membership test O(1) on
# average while producing exactly the same filtered sentences.
vocab_lookup = set(vect_vocab)
new_xtrain = [
    [token for token in sentence if token in vocab_lookup]
    for sentence in x_train
]
len(new_xtrain)

156060

Preprocessing our training set by removing all the empty lists

In [8]:
# Positions of the sentences that still contain at least one token after the
# vocabulary filtering; the empty ones are dropped together with their labels.
kept_rows = [row for row, tokens in enumerate(new_xtrain) if tokens]
new_xtrain1 = [new_xtrain[row] for row in kept_rows]

# Filter the labels only while they are still aligned one-to-one with
# `new_xtrain`. Popping from `sentiment` unconditionally would remove further
# labels every time this cell is re-run and silently misalign labels and
# sentences; with the length guard a re-run is a no-op.
if len(sentiment) == len(new_xtrain):
    sentiment = [sentiment[row] for row in kept_rows]

assert len(sentiment) == len(new_xtrain1), (
    "labels and sentences are out of sync; re-run the notebook from the data-loading cell"
)

In [9]:
# Sanity check: sentences and labels must have the same length.
tuple(len(seq) for seq in (new_xtrain1, sentiment))

(154144, 154144)

Converting the integer labels to a one-hot (binary) matrix with one column per class

In [10]:
# Number of label classes passed to to_categorical (one output column each).
num_class = 5
# One row per label, `num_class` columns.
y_train = to_categorical(sentiment, num_classes=num_class)
y_train.shape

(154144, 5)

In [11]:
# The token lists have different lengths, so this is a ragged sequence.
# NumPy >= 1.24 refuses to build an array from ragged input unless
# dtype=object is given explicitly; older versions silently fell back to an
# object array. Passing the dtype gives the same 1-D array of lists everywhere.
new_xtrain1 = np.array(new_xtrain1, dtype=object)
new_xtrain1.shape

(154144,)

Converting the words of each sentence to their word2vec vectors (one matrix per sentence, one row per word)

In [12]:
# Look up the vectors of every sentence: indexing `vectoriser` with a list of
# tokens yields one matrix per sentence (one row per token).
# The matrices have different numbers of rows, so they are stored in a
# pre-allocated 1-D object array. Calling np.array() on the ragged list of
# matrices raises on NumPy >= 1.24.
vec_xtrain = np.empty(len(new_xtrain1), dtype=object)
for row, tokens in enumerate(new_xtrain1):
    vec_xtrain[row] = vectoriser[tokens]

In [13]:
# One entry per sentence.
np.shape(vec_xtrain)

(154144,)

In [14]:
# Shape of the first sentence's matrix: (number of tokens, vector size).
np.shape(vec_xtrain[0])

(28, 300)

Averaging the word vectors of each sentence so that every sentence is represented by a single fixed-length vector

In [15]:
# Accumulator for the per-sentence average vectors.
vec_xtrain1 = list()
def sent_vectorizer(sent, dim=300):
    """Average the word vectors of one sentence into a single vector.

    Parameters
    ----------
    sent : iterable of array-like
        The word vectors of the sentence, e.g. the rows of a 2-D array.
    dim : int, optional
        Length of each word vector and of the result. Defaults to 300, the
        size that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(dim,)`` holding the element-wise mean of the
        usable word vectors. An all-zero vector is returned when the sentence
        has no usable vectors; dividing by the zero count would otherwise
        produce NaNs that poison training.

    Notes
    -----
    Entries that cannot be read as a numeric vector of length ``dim`` are
    skipped (best effort), but only those specific failures are tolerated
    instead of swallowing every exception.
    """
    sent_vec = np.zeros(dim)
    numw = 0
    for w in sent:
        try:
            vec = np.asarray(w, dtype=float)
        except (TypeError, ValueError):
            # Not numeric: ignore this entry and keep going.
            continue
        if vec.shape != (dim,):
            # Wrong length or rank: adding it would broadcast into nonsense.
            continue
        sent_vec += vec
        numw += 1
    if numw == 0:
        return sent_vec
    return sent_vec / numw
# Append one averaged vector per sentence, in the original sentence order.
vec_xtrain1.extend(map(sent_vectorizer, vec_xtrain))

In [16]:
# Turn the list of equally sized sentence vectors into one 2-D array.
vec_xtrain1 = np.asarray(vec_xtrain1)
vec_xtrain1.shape

(154144, 300)

Separating our data into training and test sets

In [17]:
# Hold out 20% of the sentence vectors (and their labels) for evaluation.
# A fixed random_state makes the split identical on every run; without it,
# every re-run shuffles differently and the reported metrics cannot be
# reproduced or compared between experiments.
x_train_vector, x_test_vector, y_train_vector, y_test_vector = train_test_split(
    vec_xtrain1,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Shapes of the four arrays produced by the split.
tuple(
    part.shape
    for part in (x_train_vector, x_test_vector, y_train_vector, y_test_vector)
)

((123315, 300), (30829, 300), (123315, 5), (30829, 5))

### Creating a neural network

In [44]:
# Feed-forward classifier on the 300-dimensional sentence vectors:
# three ReLU layers (1024 -> 512 -> 256) with dropout after the first two,
# followed by a 5-way softmax output.
model = Sequential([
    Dense(1024, activation='relu', input_dim=300),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(5, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_69 (Dense)             (None, 1024)              308224    
_________________________________________________________________
dropout_35 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_70 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_36 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_71 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_72 (Dense)             (None, 5)                 1285      
Total params: 965,637
Trainable params: 965,637
Non-trainable params: 0
_________________________________________________________________


In [45]:
# RMSprop optimiser with categorical cross-entropy on the one-hot labels.
# Both metric names are requested, so evaluate() returns
# [loss, categorical_accuracy, accuracy].
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'accuracy'],
)

In [None]:
# Train for 20 epochs in mini-batches of 32; the held-out split is passed as
# validation data, so it is scored after every epoch.
model.fit(
    x_train_vector,
    y_train_vector,
    epochs=20,
    batch_size=32,
    validation_data=(x_test_vector, y_test_vector),
    verbose=1,
)

Train on 123315 samples, validate on 30829 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
 12256/123315 [=>............................] - ETA: 9s - loss: 0.8237 - categorical_accuracy: 0.7019 - acc: 0.7019

In [29]:
# Score the trained model on the held-out split; returns the loss followed by
# the metrics requested in compile().
model.evaluate(
    x_test_vector,
    y_test_vector,
    batch_size=32,
)



[0.9152104708129044, 0.6486100749371831, 0.6486100749371831]

In [23]:
# Predicted class index for the first two held-out samples.
# Sequential.predict_classes() no longer exists in recent Keras/TensorFlow
# releases; taking the argmax over the softmax output of predict() gives the
# same class indices and works on old and new versions alike.
np.argmax(model.predict(x_test_vector[:2]), axis=-1)

array([2, 2])

In [24]:
# One-hot labels of the same two samples, to compare with the predictions.
y_test_vector[:2, :]

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [38]:
# Scratch arithmetic left over from exploration; nothing below depends on it.
32 * 2

64

In [34]:
# Scratch calculation left over from exploration; nothing below depends on it.
np.sqrt(16.0)

4.0

In [40]:
# Candidate first-layer widths: 16 doubled one to seven times, i.e. 32 ... 2048.
# The starting value 16 itself is not included and 2048 is.
c = [16 << doublings for doublings in range(1, 8)]
# `x` ends up holding the last (largest) width generated.
x = c[-1]
c

[32, 64, 128, 256, 512, 1024, 2048]

In [42]:
# Sweep over the candidate widths in `c`: for every width `i`, train a fresh
# network with layers of i, i//2 and i//4 units for 10 epochs and remember the
# width with the highest categorical accuracy on the held-out split.
# After the loop `model` refers to the network trained last, not to the best.
acc = 0.0
for i in c:
    model = Sequential([
        Dense(i, activation='relu', input_dim=300),
        Dropout(0.2),
        Dense(i // 2, activation='relu'),
        Dropout(0.2),
        Dense(i // 4, activation='relu'),
        Dense(5, activation='softmax'),
    ])
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', 'accuracy'],
    )
    model.fit(
        x_train_vector,
        y_train_vector,
        epochs=10,
        batch_size=32,
        validation_data=(x_test_vector, y_test_vector),
        verbose=1,
    )
    # a == [loss, categorical_accuracy, accuracy]
    a = model.evaluate(x_test_vector, y_test_vector, batch_size=32)
    if a[1] > acc:
        loss, acc, inter = a[0], a[1], i

Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


Epoch 9/10
Epoch 10/10
Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 123315 samples, validate on 30829 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
# Best accuracy found by the sweep, the loss of that run, and its layer width.
(acc, loss, inter)

(0.6546757922831757, 0.9232274577934645, 1024)