In [1]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import strip_tags, strip_numeric
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras import utils
utils.set_random_seed(42)

In [2]:
# loading data
df = pd.read_csv('/content/imdb_reviews_sentiment_analysis.csv')
df.sample(5)

Unnamed: 0,text,label
1501,I wanted to watch this movie because of Eliza ...,0
2586,"""Tourist Trap"" is a genuinely spooky low-budge...",1
2653,Cops Logan Alexander and Debbie Rochon escort ...,0
1055,This movie was terrible!I rented it not knowin...,0
705,I absolutely LOVED this movie as a child. I ca...,1


In [3]:
# getting info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [4]:
# getting value count of target
df['label'].value_counts()

1    2505
0    2495
Name: label, dtype: int64

In [5]:
# data preprocessing
df['text'] = df['text'].apply(strip_tags)
df['text'] = df['text'].apply(strip_numeric)

In [6]:
# creating X & y
X = df['text'].values
y = df['label'].values

In [7]:
# splitting the data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
# text vectorization
Vectorizer = layers.TextVectorization(max_tokens=2000+2, output_mode='int', output_sequence_length=100,
                                      pad_to_max_tokens=True, standardize='lower_and_strip_punctuation', split='whitespace')
Vectorizer.adapt(X_train)

In [9]:
# creating tf datasets
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train)).batch(32).cache().prefetch(tf.data.AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32).prefetch(tf.data.AUTOTUNE)

In [10]:
# model building
model = Sequential()
model.add(layers.Input(shape=(1,), dtype=tf.string))
model.add(Vectorizer)
model.add(layers.Embedding(input_dim=Vectorizer.vocabulary_size(), output_dim=2, input_length=(100)))
model.add(layers.GRU(units=32, activation='tanh', return_sequences=True, dropout=0.4, input_shape=(32, 100, 1)))
model.add(layers.BatchNormalization())
model.add(layers.GRU(units=16, activation='tanh', dropout=0.4))
model.add(layers.BatchNormalization())
model.add(layers.Dense(units=8, activation='relu'))
model.add(layers.Dense(units=1, activation='sigmoid'))

In [11]:
# model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 2)            4004      
                                                                 
 gru (GRU)                   (None, 100, 32)           3456      
                                                                 
 batch_normalization (BatchN  (None, 100, 32)          128       
 ormalization)                                                   
                                                                 
 gru_1 (GRU)                 (None, 16)                2400      
                                                                 
 batch_normalization_1 (Batc  (None, 16)               6

In [12]:
# model config
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# model training
history = model.fit(train_ds, epochs=5, verbose=1, callbacks=[callbacks.EarlyStopping(monitor='accuracy', patience=2)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# model evaluation
scores = model.evaluate(test_ds, verbose=0)
print('Accuracy:', round(scores[1] * 100, 2))

Accuracy: 51.3
