# **EDUNET FOUNDATION**

# **Lab: Sentiment Analysis — Movie Review Sentiment Prediction Using an RNN**

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the tab-separated, Latin-1 encoded review file into a DataFrame
imdb = pd.read_csv(
    '/content/imdb.csv',
    encoding='latin-1',
    sep='\t',
)
# Show the first rows as the cell output
imdb.head()

Unnamed: 0,id,review,sentiment
0,5814_8,With all this stuff going down at the moment w...,1
1,2381_9,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,It must be assumed that those who praised this...,0
4,9495_8,Superbly trashy and wondrously unpretentious 8...,1


In [6]:
# Display the (rows, columns) dimensions of the loaded DataFrame
imdb.shape

(22500, 3)

In [7]:
# Work on the first 15,000 rows only. Each step in the chain returns a new
# frame, so `imdb` itself is left untouched.
df = (
    imdb.head(15000)
    # The 'id' column is not used as a feature
    .drop(columns='id')
    # Map string labels to 1/0 (a no-op if the column is already numeric)
    .assign(sentiment=lambda frame: frame['sentiment'].replace({'positive': 1, 'negative': 0}))
)

In [8]:
# Preview the working frame after label encoding and dropping 'id'
df.head()

Unnamed: 0,review,sentiment
0,With all this stuff going down at the moment w...,1
1,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,The film starts with a manager (Nicholas Bell)...,0
3,It must be assumed that those who praised this...,0
4,Superbly trashy and wondrously unpretentious 8...,1


In [10]:
import re
# Preprocess the text data
def preprocess_text(text):
    """Normalize a raw review string before tokenization.

    Lower-cases the text, replaces HTML line-break tags with a space, and
    then replaces every remaining non-letter character with a space.
    Runs of spaces are not collapsed.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned, lower-case string.
    """
    text = text.lower()
    # Match every spelling of the break tag (<br>, <br/>, <br />). Matching only
    # the literal '<br />' lets the other variants through, and the next
    # substitution would then strip just the angle brackets, leaving a
    # spurious 'br' token in the text.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text

# Clean every review element-wise, overwriting the raw text column
df['review'] = df['review'].map(preprocess_text)

In [11]:
# Preview the reviews after text preprocessing
df.head()

Unnamed: 0,review,sentiment
0,with all this stuff going down at the moment w...,1
1,the classic war of the worlds by timothy hi...,1
2,the film starts with a manager nicholas bell ...,0
3,it must be assumed that those who praised this...,0
4,superbly trashy and wondrously unpretentious ...,1


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the word index from the cleaned reviews (vocabulary capped via num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['review'])

# Convert each review to a sequence of word indices, then pad/truncate
# every sequence to exactly 20 entries
X = pad_sequences(tokenizer.texts_to_sequences(df['review']), maxlen=20)

In [14]:
# Inspect the padded word-index sequence of the first review
X[0]

array([   6,  349,   34,  581,  331,   18,  373,  217,   41,   29,    4,
          1,   88,    9,  426,   24,    6,   23,    1, 1441], dtype=int32)

In [15]:
# Extract the sentiment labels as a NumPy array for training
y = df['sentiment'].to_numpy()

In [16]:
# Split the data
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

In [21]:


# Embedding -> SimpleRNN -> single sigmoid unit, assembled in one constructor call
model = Sequential([
    # `input_length` is deprecated, so it is not passed here
    Embedding(input_dim=5000, output_dim=128),
    SimpleRNN(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])


In [22]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [23]:
# Fit for 3 epochs with batches of 4 samples, reporting metrics on the
# held-out split via validation_data
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=4,
    epochs=3,
)

Epoch 1/3
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - accuracy: 0.5065 - loss: 0.7111 - val_accuracy: 0.5832 - val_loss: 0.6688
Epoch 2/3
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.6295 - loss: 0.6378 - val_accuracy: 0.5757 - val_loss: 0.6685
Epoch 3/3
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.6803 - loss: 0.5841 - val_accuracy: 0.5811 - val_loss: 0.6770


In [24]:
# Score a few unseen reviews with the trained model
new_reviews = [
    "Well done Al Gore!",
    "I loved the movie, it was fantastic!",
    "terrible and boring.",
]

# Run the new text through the same cleaning, tokenization and padding
# steps that were applied to the training reviews
new_reviews_preprocessed = list(map(preprocess_text, new_reviews))
new_reviews_sequences = tokenizer.texts_to_sequences(new_reviews_preprocessed)
new_reviews_padded = pad_sequences(new_reviews_sequences, maxlen=20)
predictions = model.predict(new_reviews_padded)

# Print raw predictions to debug
print("Raw predictions:", predictions)

# Threshold each sigmoid output at 0.5; flattening first means every `pred`
# is a scalar probability rather than a one-element array
predicted_labels = [
    'positive' if pred >= 0.5 else 'negative'
    for pred in predictions.ravel()
]

for review, sentiment in zip(new_reviews, predicted_labels):
    print(f'Review: "{review}" - Sentiment: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 712ms/step
Raw predictions: [[0.6227374 ]
 [0.57867944]
 [0.26292512]]
Review: "Well done Al Gore!" - Sentiment: positive
Review: "I loved the movie, it was fantastic!" - Sentiment: positive
Review: "terrible and boring." - Sentiment: negative
