<a href="https://colab.research.google.com/github/SilahicAmil/NLP-NLTK/blob/main/Rotten_Tomatoes_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rotten Tomatoes Sentiment

https://www.kaggle.com/mrbaloglu/rotten-tomatoes-reviews-dataset

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Tensorflow
import tensorflow as tf

#Keras
import keras
from keras.layers import Embedding, TextVectorization
from tensorflow.keras import layers

# SkLearn
from sklearn.model_selection import train_test_split

In [2]:
# Unzip data
#!unzip "/content/drive/MyDrive/Rotten_Tomatoes_Sentiment/data_rt.csv (1).zip" -d "/content/drive/MyDrive/Rotten_Tomatoes_Sentiment/"

In [3]:
full_data = pd.read_csv("/content/drive/MyDrive/Rotten_Tomatoes_Sentiment/data_rt.csv")
full_data.head()

Unnamed: 0,reviews,labels
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [4]:
full_data.labels.value_counts()

0    5331
1    5331
Name: labels, dtype: int64

# Shuffle Data

In [5]:
full_data = full_data.sample(frac=1, random_state=42)
full_data.head()

Unnamed: 0,reviews,labels
6830,"this film seems thirsty for reflection , itsel...",1
8600,the movie's thesis -- elegant technology for t...,1
4080,tries too hard to be funny in a way that's too...,0
3079,disturbingly superficial in its approach to th...,0
582,"an ugly , pointless , stupid movie .",0


# Creating Train/Test sets

In [6]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(full_data["reviews"].to_numpy(),
                                                                              full_data["labels"].to_numpy(),
                                                                              test_size=0.12,
                                                                              random_state=42)

In [7]:
len(train_sentences), len(test_sentences)

(9382, 1280)

In [8]:
round(sum([len(i.split()) for i in train_sentences]))/len(train_sentences)

21.04188872308676

In [9]:
MAX_VOCAB = 10_000
MAX_LEN = 21

txt_vect = TextVectorization(max_tokens=MAX_VOCAB,
                             output_mode="int",
                             output_sequence_length=MAX_LEN)

In [10]:
txt_vect.adapt(train_sentences)

# Embeddings Model - Functional API

In [11]:
Embedding = keras.layers.Embedding(input_dim=MAX_VOCAB,
                                   output_dim=128,
                                   input_length=MAX_LEN)

In [12]:
inputs = keras.layers.Input(shape=(1,), dtype="string")

x = txt_vect(inputs)

x = Embedding(x)

# Pooling layer
x = keras.layers.GlobalAveragePooling1D()(x)

# Dense output
outputs = keras.layers.Dense(1, activation="sigmoid")(x)

model_0 = keras.Model(inputs, outputs , name="model_0_functional")

In [13]:
model_0.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [14]:
model_0.summary()

Model: "model_0_functional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 21)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 21, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,

In [15]:
hist_0 = model_0.fit(train_sentences,
            train_labels,
            epochs=5,
            validation_data=(test_sentences, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Eval

In [16]:
loss, accuracy = model_0.evaluate(test_sentences, test_labels)
print(f"Loss: {loss}\nAccuracy: {accuracy*100:2f}")

Loss: 0.5167263746261597
Accuracy: 75.625002


# LSTM Model

In [30]:
updated_embeddings = keras.layers.Embedding(input_dim=MAX_VOCAB,
                                           output_dim=128,
                                           embeddings_initializer="uniform",
                                           input_length=MAX_LEN)

In [31]:
# LSTM Model
inputs = keras.layers.Input(shape=(1,), dtype="string")

X = txt_vect(inputs)

X = updated_embeddings(X)

#LSTM Layers
X =keras.layers.LSTM(128)(X)
x = keras.layers.Dense(64, activation="relu")(X)

# Dense output
outputs = keras.layers.Dense(1, activation="sigmoid")(X)

model_1 = keras.Model(inputs, outputs, name="LSTM_Model")

In [36]:
model_1.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [37]:
model_1.summary()

Model: "LSTM_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 21)               0         
 torization)                                                     
                                                                 
 embedding_2 (Embedding)     (None, 21, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
______________________________________________

In [38]:
hist_2 = model_1.fit(train_sentences,
                     train_labels,
                     epochs=5,
                     validation_data=(test_sentences, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Eval LSTM

In [39]:
loss, accuracy = model_1.evaluate(test_sentences, test_labels, verbose=1)
print(f"Loss: {loss}\nAccuracy: {accuracy}")

Loss: 0.7915079593658447
Accuracy: 0.7132812738418579
