In [6]:
import io
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [9]:
import pandas as pd
import numpy as np

In [10]:
data = pd.read_csv("IMDB Dataset.csv")

In [11]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [12]:
data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [16]:
data.sentiment = le.fit_transform(data.sentiment)

In [17]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [39]:
from sklearn.model_selection import train_test_split
train_sen ,val_sen ,train_lab ,val_lab  = train_test_split(data.review.to_numpy(),
                                                           data.sentiment.to_numpy(),
                                                           test_size=0.3, random_state=42)

In [40]:
len(train_sen)

35000

In [41]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [42]:
text_vectorizer = TextVectorization(max_tokens=12000, 
                                    standardize="lower_and_strip_punctuation",
                                    split = "whitespace",
                                    ngrams=None,output_mode = "int", output_sequence_length = None, pad_to_max_tokens=True)

In [43]:
text_vectorizer.adapt(train_sen)

In [44]:
train_sen

array(['As much as I love trains, I couldn\'t stomach this movie. The premise that one could steal a locomotive and "drive" from Arkansas to Chicago without hitting another train along the way has to be right up there on the Impossible Plot lines hit board. Imagine two disgruntled NASA employees stealing the "crawler" that totes the shuttles to and fro and driving it to New York and you get the idea.<br /><br />Having said all that, it\'s a nice try. Wilford Brimely is at his Quaker Oats best, and Levon Helm turns a good performance as his dimwitted but well-meaning sidekick. Bob Balaban is suitably wormy as the Corporate Guy, and the "little guy takes on Goliath" story gets another airing.',
       "This was a very good PPV, but like Wrestlemania XX some 14 years later, the WWE crammed so many matches on it, some of the matches were useless. I'm not going to go through every match on the card because it would take forever to do.<br /><br />However major highlights included the HUGE po

In [45]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-6:]

In [46]:
top_5_words, bottom_5_words, len(words_in_vocab)

(['', '[UNK]', 'the', 'and', 'a'],
 ['mattei', 'mathieu', 'mastery', 'massively', 'masquerading', 'lubitsch'],
 12000)

In [47]:
#Embedding
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=12000, # set the input shape
                             output_dim=128, # set the size of the embedding vector
                             embeddings_initializer="uniform", # default, initialize embedding vectors randomly
                             input_length=15 # how long is each input
                             )

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf",TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sen, train_lab )

In [49]:
model_0.score(val_sen,  val_lab)

0.8608666666666667

In [50]:
inputs = layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64,return_sequences=True)(x)
#x = layers.LSTM(42,return_sequences=True)(x)
#x = layers.GRU(99)(x)
#x = layers.Dense(64,activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

model_3.compile(loss="binary_crossentropy",
               optimizer = "Adam",
               metrics=["accuracy"])

model_3_history = model_3.fit(train_sen,train_lab,epochs=5, validation_data = (val_sen,val_lab))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [52]:
#MODEL4 BiDirectional
inputs = layers.Input(shape=(1,),dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
#x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
outputs = layers.Dense(1,activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs,name="model_4_bidirectional")

model_4.compile(loss="binary_crossentropy",
               optimizer = "Adam",
               metrics=["accuracy"])

model_4_history = model_4.fit(train_sen,train_lab,epochs=4, validation_data=(val_sen,val_lab))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
