In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
imdb,info=tfds.load('imdb_reviews',with_info=True,as_supervised=True)

In [3]:
train_data,test_data=imdb['train'],imdb['test']

In [4]:
training_sentences=[]
testing_sentences=[]
training_labels=[]
testing_labels=[]
for s,l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

for s,l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())


In [5]:
training_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [6]:
training_labels[0]

0

In [7]:
len(training_sentences),len(testing_sentences)

(25000, 25000)

In [8]:
from collections import Counter

In [9]:
Counter(training_labels)

Counter({0: 12500, 1: 12500})

In [10]:
Counter(testing_labels)

Counter({1: 12500, 0: 12500})

In [11]:
training_labels_final=np.array(training_labels)
testing_labels_final=np.array(testing_labels)

In [12]:
vocab_size=10000
embedded_dim=16
max_lenght=120
trunc_type='post'

In [13]:
pip install keras_preprocessing



In [14]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index=tokenizer.word_index
word_index;
len(word_index)

86538

In [15]:
sequences=tokenizer.texts_to_sequences(training_sentences)
padded=pad_sequences(sequences,maxlen=max_lenght,truncating=trunc_type)

testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padded=pad_sequences(testing_sequences,maxlen=max_lenght)

In [16]:
padded.shape

(25000, 120)

In [17]:
testing_padded.shape

(25000, 120)

In [18]:
padded

array([[   0,    0,    0, ...,  873,  144,    9],
       [   0,    0,    0, ...,   31,   30,   46],
       [6174,    1, 4915, ...,    8, 6175,   46],
       ...,
       [7628,   36,   10, ...,  167,    5,   28],
       [2676,   10,  215, ...,    1,   88,   10],
       [3874,    4,   30, ...,    5,  994, 5125]], dtype=int32)

In [19]:
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 "'s": 20,
 'on': 21,
 'you': 22,
 'not': 23,
 'are': 24,
 'his': 25,
 'he': 26,
 'have': 27,
 'be': 28,
 'one': 29,
 'all': 30,
 'at': 31,
 'by': 32,
 'they': 33,
 'an': 34,
 'who': 35,
 'so': 36,
 'from': 37,
 'like': 38,
 'her': 39,
 "'t": 40,
 'or': 41,
 'just': 42,
 'there': 43,
 'about': 44,
 'out': 45,
 "'": 46,
 'has': 47,
 'if': 48,
 'some': 49,
 'what': 50,
 'good': 51,
 'more': 52,
 'very': 53,
 'when': 54,
 'she': 55,
 'up': 56,
 'can': 57,
 'b': 58,
 'time': 59,
 'no': 60,
 'even': 61,
 'my': 62,
 'would': 63,
 'which': 64,
 'story': 65,
 'only': 66,
 'really': 67,
 'see': 68,
 'their': 69,
 'had': 70,
 'were': 71,
 'me': 72,
 'well': 73,
 'we': 74,
 'than': 75,
 'much': 76,
 'been': 77,
 'get': 78,
 'bad': 79,
 'will': 80,
 'people': 81,
 'do': 82,
 'also': 83,


In [20]:
from keras.models import Sequential
from keras.layers import Dense,SimpleRNN,Embedding

In [21]:
model_rnn=Sequential([
    Embedding(vocab_size,embedded_dim,input_length=max_lenght),
    SimpleRNN(32),
    Dense(10,activation='relu'),
    Dense(1,activation='sigmoid')
])

model_rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                1568      
                                                                 
 dense (Dense)               (None, 10)                330       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 161909 (632.46 KB)
Trainable params: 161909 (632.46 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
model_rnn.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [23]:
history=model_rnn.fit(padded,training_labels_final,epochs=5,validation_data=(testing_padded,testing_labels_final))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
import pandas as pd

In [26]:
res_df = pd.DataFrame(history.history)

In [27]:
res_df

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.693444,0.50788,0.691872,0.5186
1,0.636075,0.6424,0.510537,0.765
2,0.477312,0.77384,0.669459,0.62808
3,0.247037,0.90112,0.733809,0.70996
4,0.120564,0.95712,0.941729,0.69692


In [28]:
from keras.layers import Bidirectional, GRU

In [31]:
model_gru = Sequential([
    Embedding(vocab_size, embedded_dim,
              input_length = max_lenght),

    Bidirectional(GRU(32)),

    Dense(10, activation = 'relu'),

    Dense(1, activation = 'sigmoid')
])

model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           160000    
                                                                 
 bidirectional (Bidirection  (None, 64)                9600      
 al)                                                             
                                                                 
 dense_2 (Dense)             (None, 10)                650       
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 170261 (665.08 KB)
Trainable params: 170261 (665.08 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
model_gru.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam', metrics = ['accuracy'])


history = model_gru.fit(padded, training_labels_final, epochs = 3,
                        validation_data = (testing_padded, testing_labels_final))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
from keras.layers import LSTM

In [34]:
model_lstm = Sequential([
    Embedding(vocab_size, embedded_dim,
              input_length = max_lenght),

    Bidirectional(LSTM(32)),

    Dense(10, activation = 'relu'),

    Dense(1, activation = 'sigmoid')
])

In [35]:
model_lstm.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam', metrics = ['accuracy'])


history = model_lstm.fit(padded, training_labels_final, epochs = 3,
                        validation_data = (testing_padded, testing_labels_final))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [36]:
new_1 = 'The was worst movie. Please do not watch it. Waste of money and time.'
new_2 = 'A must watch movie. I loved to watch it. It was so amazing!'

In [64]:
def pred_preprocessing(text):
  testing_sequences=tokenizer.texts_to_sequences(text)
  testing_padded=pad_sequences(testing_sequences,maxlen=max_lenght,truncating=trunc_type)
  return testing_padded

In [65]:
x_test=pred_preprocessing(new_1)

In [66]:
print(x_test)

[[ 0  0  0 ...  0  0  1]
 [ 0  0  0 ...  0  0 10]
 [ 0  0  0 ...  0  0  2]
 ...
 [ 0  0  0 ...  0  0  7]
 [ 0  0  0 ...  0  0  2]
 [ 0  0  0 ...  0  0  0]]


In [67]:
word_index

{'t': 1,
 'e': 2,
 'o': 3,
 'a': 4,
 'w': 5,
 's': 6,
 'm': 7,
 'i': 8,
 'n': 9,
 'h': 10,
 'd': 11,
 'r': 12,
 'v': 13,
 'p': 14,
 'l': 15,
 'c': 16,
 'f': 17,
 'y': 18}

In [68]:
y_pred=model_lstm.predict(x_test,verbose=False)

In [69]:
percent_pos=y_pred[0,0]
print(percent_pos)

0.37845975


In [70]:
x_test=pred_preprocessing(new_2)
y_pred=model_lstm.predict(x_test,verbose=False)
percent_pos=y_pred[0,0]
print(percent_pos)

0.35555494
