# Sentiment Analysis Task Using RNN

## 1. Preprocessing

### 1.1 Count Unique words

In [11]:
from collections import Counter
import re
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup  
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")  # Replace with your dataset path
df = df.sample(frac=1, random_state=42)

# Function to clean and tokenize text
def count_unique_words(texts):
    all_words = []
    for text in texts:
        words = re.findall(r'\b\w+\b', text.lower())  # Tokenize words, ignoring punctuation
        all_words.extend(words)
    
    word_counts = Counter(all_words)  # Count occurrences
    return len(word_counts), word_counts  # Return unique word count and frequency

# Count unique words in the "reviews" column
unique_word_count, word_freq = count_unique_words(df["review"].values)

print(f"Total Unique Words: {unique_word_count}")


Total Unique Words: 101944


### 1.2 Tokenize Words

In [12]:
texts = df["review"].values
labels = df["sentiment"].values  # Assuming labels are 0 (negative) and 1 (positive)

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove punctuation
    return text

# Apply cleaning to all texts
cleaned_texts = [clean_text(text) for text in texts]

vocab_size = 5000  # unique words are 100000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(cleaned_texts)

# Convert texts to sequences (numbers)
sequences = tokenizer.texts_to_sequences(cleaned_texts)

# Auto-Select `max_length`
sentence_lengths = [len(seq) for seq in sequences]
max_length = int(np.percentile(sentence_lengths, 95))  # Use the 95th percentile

# Pad sequences
X = pad_sequences(sequences, maxlen=max_length)

# Convert labels to NumPy array
labels = np.array(labels)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

print(f"Final shape of padded sequences: {y.shape}")
print(f"Max sequence length used: {max_length}")
print(y)


  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML


Final shape of padded sequences: (50000,)
Max sequence length used: 504
[1 1 0 ... 0 1 1]


## 2. Define The Rnn Model

In [13]:
embedding_dim = 64  # Size of word embeddings
rnn_units = 128  # Number of RNN neurons

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SimpleRNN(128, return_sequences=False, 
               kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(32, activation='relu', 
                kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


## 3. Train The model

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=48)
# Train the model
batch_size = 128
epochs = 10
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, 
                    epochs=10, batch_size=128, 
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

model.save("sentiment_rnn_model.keras")

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 62ms/step - accuracy: 0.5007 - loss: 2.3823 - val_accuracy: 0.5325 - val_loss: 1.9214
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.5216 - loss: 1.8090 - val_accuracy: 0.6420 - val_loss: 1.4397
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.5296 - loss: 1.4287 - val_accuracy: 0.6619 - val_loss: 1.1567
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.7197 - loss: 1.0933 - val_accuracy: 0.7854 - val_loss: 0.8590
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.8144 - loss: 0.8491 - val_accuracy: 0.8151 - val_loss: 0.7788
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.8227 - loss: 0.7361 - val_accuracy: 0.8469 - val_loss: 0.6068
Epoch 7/10
[1m3

## 4. Evaluation

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.8758 - loss: 0.4133
Test Accuracy: 0.8797


## 5. Accuracy

In [16]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f'Accuracy Score: {accuracy_score(y_test, y_pred):.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step
Accuracy Score: 0.8797
