In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [2]:
# Read the training reviews and the test reviews from their CSV files.
imdb_review_df, test_review_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/imdb_reviews.csv', 'data/test_reviews.csv')
)
# Preview the first rows of the training table.
imdb_review_df.head()

Unnamed: 0,Reviews,Sentiment
0,<START this film was just brilliant casting lo...,positive
1,<START big hair big boobs bad music and a gian...,negative
2,<START this has to be one of the worst films o...,negative
3,<START the <UNK> <UNK> at storytelling the tra...,positive
4,<START worst mistake of my life br br i picked...,negative


In [3]:
# Show the distinct values found in the training Sentiment column.
print(set(imdb_review_df['Sentiment']))

{'negative', 'positive'}


In [4]:
# Load the word -> index lookup table and preview its first five rows.
word_indexes_df = pd.read_csv(filepath_or_buffer='data/word_indexes.csv')
word_indexes_df.head(n=5)

Unnamed: 0,Words,Indexes
0,tsukino,52009
1,nunnery,52010
2,sonja,16819
3,vani,63954
4,woods,1411


In [5]:
# Build the word -> integer index lookup from the Words / Indexes columns.
word_indexes = dict(zip(word_indexes_df['Words'], word_indexes_df['Indexes']))

# Reserve the lowest indexes for the special tokens. The start key is spelled
# "<START" (no closing bracket) because that is the exact token that appears
# at the beginning of the review strings shown in this notebook.
word_indexes.update({
    "<PAD>": 0,
    "<START": 1,
    "<UNK>": 2,
    "<UNUSED>": 3,
})

In [6]:
import nltk

def text_encoder(text: str, vocab=None) -> list:
  """Encode a whitespace-separated review into a list of integer word indexes.

  Args:
    text: Review text; tokens are obtained with ``str.split()`` (any run of
      whitespace separates tokens, so an empty string yields ``[]``).
    vocab: Optional mapping from token to integer index. When omitted, the
      notebook-level ``word_indexes`` dictionary is used.

  Returns:
    A list with one integer per token. Tokens missing from the vocabulary are
    mapped to the ``"<UNK>"`` index instead of raising ``KeyError``.
  """
  if vocab is None:
    vocab = word_indexes

  # Fall back to 2 (the index this notebook reserves for "<UNK>") if a custom
  # vocabulary does not define the unknown token itself.
  unk_index = vocab.get("<UNK>", 2)

  return [vocab.get(word, unk_index) for word in text.split()]

In [7]:
# Separate each table into its review text column and its sentiment column.
train_data = imdb_review_df['Reviews']
train_labels = imdb_review_df['Sentiment']
test_data = test_review_df['Reviews']
test_labels = test_review_df['Sentiment']

In [8]:
# Display the raw text of the training review stored under index label 0.
train_data.loc[0]

"<START this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for wha

In [9]:
# Encode straight from the original DataFrame columns instead of from
# train_data / test_data themselves, so re-running this cell never feeds
# already-encoded rows back into text_encoder.
train_data = imdb_review_df['Reviews'].apply(text_encoder)
test_data = test_review_df['Reviews'].apply(text_encoder)

In [10]:
# Preview the first five encoded training reviews.
train_data.head(n=5)

0    [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...
1    [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...
2    [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...
3    [1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153,...
4    [1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...
Name: Reviews, dtype: object

In [11]:
def sentiment_encoder(sentiment: str) -> int:
  """Map a sentiment label to a binary class id.

  Returns 1 when the label is exactly 'positive'; every other value
  (including 'negative') maps to 0.
  """
  return 1 if sentiment == 'positive' else 0

In [12]:
# Encode from the original Sentiment columns rather than from the label
# variables themselves: re-applying sentiment_encoder to already-encoded
# integers would silently turn every label into 0.
train_labels = imdb_review_df['Sentiment'].apply(sentiment_encoder)
test_labels = test_review_df['Sentiment'].apply(sentiment_encoder)

In [13]:
# Pad (at the end, with the <PAD> index) or truncate every encoded review to
# exactly 500 entries so both splits become fixed-width arrays.
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        sequences, value=word_indexes["<PAD>"], padding='post', maxlen=500)
    for sequences in (train_data, test_data)
)

In [14]:
# Assemble the classifier layer by layer.
model = keras.Sequential()
# Look up a 16-dimensional vector for each of the 500 word indexes
# (embedding table sized for indexes 0..9999).
model.add(keras.layers.Embedding(10000, 16, input_length=500))
# Average the word vectors over the sequence axis into a single 16-d vector.
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
# Single sigmoid unit: output in (0, 1) for the binary sentiment label.
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [15]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [16]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train for 30 epochs in mini-batches of 512 and keep the per-epoch metrics.
# NOTE(review): the test split is passed as validation data here, so the
# validation metrics are measured on the same rows used for the final check.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=512,
    epochs=30,
    validation_data=(test_data, test_labels),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [18]:
# Pick a random row of the test set. The upper bound comes from the actual
# table length (randint's upper bound is exclusive), so every row, including
# row 0, can be drawn and the draw can never fall outside the table.
index = np.random.randint(0, len(test_review_df))
# Positional lookup, to match the positional indexing of the padded test_data
# array that uses this same index in the prediction cell.
user_review = test_review_df.iloc[index]
print(user_review)

Reviews      <START horrible horrible movie i still can't b...
Sentiment                                             negative
Name: 405, dtype: object


In [19]:
# Wrap the selected padded review in a batch of one for model.predict.
user_review = np.array([test_data[index]])
# The model emits one sigmoid value per review; above 0.5 is reported as positive.
print("positive sentiment" if model.predict(user_review)[0, 0] > 0.5
      else "negative sentiment")


negative sentiment
