In [1]:
#!pip install pandas numpy 
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
import contractions
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

In [3]:
#Showing loaded data
file_path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
imdb_data = pd.read_csv(file_path)
print(imdb_data.head(10))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive


In [4]:
#Data cleaning
#Load stopwords
stop_words = set(stopwords.words('english'))
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

# Ensure data types are correct
imdb_data['sentiment'] = imdb_data['sentiment'].astype(int)
def clean_text(text):
    text = re.sub('<br />', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters
    text = text.lower()  # Convert to lowercase
    text = contractions.fix(text)  # Expand contractions like "don't" to "do not"
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

imdb_data['review'] = imdb_data['review'].apply(clean_text)
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter matteis love time money visually stunni...,1
5,probably alltime favorite movie story selfless...,1
6,sure would like see resurrection dated seahunt...,1
7,show amazing fresh innovative idea first aired...,0
8,encouraged positive comments film looking forw...,0
9,like original gut wrenching laughter like movi...,1


In [5]:
#Splitting the data
X = imdb_data['review']
y = imdb_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

print(f"Training set: {len(X_train)} reviews")
print(f"Testing set: {len(X_test)} reviews")
print(f"Training set: {len(y_train)} sentiments")
print(f"Testing set: {len(y_test)} sentiments")

#For finding optimal maxlen
#review_lengths = [len(seq) for seq in X_train_seq]
#plt.hist(review_lengths, bins=100)
#plt.show()

Training set: 42500 reviews
Testing set: 7500 reviews
Training set: 42500 sentiments
Testing set: 7500 sentiments


In [6]:
#Text Vectorization
#Initialize the tokenizer
tokenizer = Tokenizer(num_words=30000)  # Use the top 20,000 words
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

maxlen = 150  # Maximum review length , Check from the graph
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

print(f"Padded training data shape: {X_train_pad.shape}")
print(f"Padded testing data shape: {X_test_pad.shape}")

Padded training data shape: (42500, 150)
Padded testing data shape: (7500, 150)


<font size = 5 color = 'red'>Building and training an RNN model using a Bidirectional LSTM layer for sentiment analysis</font>

In [7]:
# Define the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=200, input_length=100),
    Dropout(0.15),
    Bidirectional(LSTM(128, dropout=0.15, recurrent_dropout=0.15, kernel_regularizer=regularizers.l2(0.01))),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.15),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.0003), loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.00001)

model.summary()



In [8]:
# Training the model
history = model.fit(
    X_train_pad, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/100
[1m1063/1063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 346ms/step - accuracy: 0.7167 - loss: 2.6637 - val_accuracy: 0.8631 - val_loss: 0.3728 - learning_rate: 3.0000e-04
Epoch 2/100
[1m1063/1063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 346ms/step - accuracy: 0.8852 - loss: 0.3416 - val_accuracy: 0.8693 - val_loss: 0.3588 - learning_rate: 3.0000e-04
Epoch 3/100
[1m1063/1063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 344ms/step - accuracy: 0.9085 - loss: 0.2845 - val_accuracy: 0.8747 - val_loss: 0.3502 - learning_rate: 3.0000e-04
Epoch 4/100
[1m1063/1063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 346ms/step - accuracy: 0.9211 - loss: 0.2522 - val_accuracy: 0.8671 - val_loss: 0.3683 - learning_rate: 3.0000e-04
Epoch 5/100
[1m1063/1063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 344ms/step - accuracy: 0.9315 - loss: 0.2258 - val_accuracy: 0.8654 - val_loss: 0.3870 - learning_rate: 3.0000e-04
Epoch 6/100
[1

In [9]:
model.save_weights('model_4.weights.h5')
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 65ms/step - accuracy: 0.8856 - loss: 0.3209
Test Accuracy: 88.51%
