# Comparing Naive Bayes with RNN on the IMDB Dataset

In this notebook we load and preprocess the IMDB dataset, then build two classifiers:

- A **Naive Bayes** classifier using a binary bag-of-words representation (via `CountVectorizer` and `BernoulliNB`).
- An **RNN model** (using the GRU variant with global max pooling) built with PyTorch.

We then train both models and compare their performance in terms of accuracy, precision, recall, and F1 score.

In [None]:
# Import necessary libraries
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.datasets import imdb
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import re

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

## Data Loading and Preprocessing

We load the IMDB dataset (using Keras) and convert the tokenized sequences back to text. The test set is the one provided by Keras; 20% of the provided training set is held out for validation. We then build a custom vocabulary by fitting `CountVectorizer` on the training split.

In [None]:
# Parameters for vocabulary
m = 1000  # number of words in vocabulary
n = 20    # skip top 20 most frequent words
k = 0     # skip 0 least frequent words

# Load IMDB dataset (reviews arrive as lists of integer word indices)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=m-k, skip_top=n)
word_index = imdb.get_word_index()

# Create index-to-word mapping.
# Indices 0-2 are reserved for the special tokens below, so real words start at 3.
index2word = {i + 3: word for word, i in word_index.items()}
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'

# Convert tokenized sequences back to text (unknown indices fall back to '[oov]')
x_train = [' '.join([index2word.get(idx, '[oov]') for idx in text]) for text in x_train]
x_test = [' '.join([index2word.get(idx, '[oov]') for idx in text]) for text in x_test]

# Split training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Create custom vocabulary using CountVectorizer (fitted on the training split only)
vectorizer = CountVectorizer(max_features=m, binary=True)
vectorizer.fit(X_train)
# Work on a copy: `vocabulary_` is the vectorizer's own fitted state, and adding
# entries to it directly would make every later `transform` emit two extra,
# always-zero feature columns for the Naive Bayes model.
custom_vocab = dict(vectorizer.vocabulary_)

# Ensure special tokens are in the vocabulary (appended after the real words)
custom_vocab['PAD'] = len(custom_vocab)
custom_vocab['UNK'] = len(custom_vocab)

# Compute average sequence length, using the same letters-only tokenization
# that TextDataset.tokenize applies
avg_length = int(np.mean([len(re.sub(r'[^a-zA-Z]', ' ', text.lower()).split()) for text in X_train]))
print('Average sequence length:', avg_length)

## Create Representations

We prepare two representations:

1. **Bag-of-Words representation** for the Naive Bayes classifier.
2. **Tokenized sequences**, padded or truncated to the average training-review length, for the RNN model.

In [None]:
# Binary bag-of-words matrices for Naive Bayes, one dense array per split
X_train_binary, X_val_binary, X_test_binary = (
    vectorizer.transform(split_texts).toarray()
    for split_texts in (X_train, X_val, x_test)
)

# Labels as numpy arrays, in the same split order
y_train_nb, y_val_nb, y_test_nb = (
    np.array(split_labels) for split_labels in (y_train, y_val, y_test)
)

# Report split sizes and the vocabulary size (special tokens included)
print(f'Training samples: {len(X_train_binary)}')
print(f'Validation samples: {len(X_val_binary)}')
print(f'Test samples: {len(X_test_binary)}')
print(f'Vocabulary size: {len(custom_vocab)}')

# Define a custom Dataset for tokenized text (for RNN models)
class TextDataset(Dataset):
    """Dataset of fixed-length token-id sequences paired with their labels.

    Every text is tokenized once, at construction time; items are returned
    as ``(token_ids, label)`` pairs of ``torch.long`` tensors.
    """

    def __init__(self, texts, labels, vocab, max_length):
        """Tokenize all ``texts`` up front and keep ``labels`` as given.

        Args:
            texts: Iterable of raw text strings.
            labels: Indexable collection of labels, aligned with ``texts``.
            vocab: Mapping from word to integer id; must provide ``'UNK'``
                and ``'PAD'`` whenever they are needed.
            max_length: Length every token sequence is padded or cut to.
        """
        encoded = []
        for document in texts:
            encoded.append(self.tokenize(document, vocab, max_length))
        self.texts = encoded
        self.labels = labels

    def tokenize(self, text, vocab, max_length):
        """Convert ``text`` into exactly ``max_length`` vocabulary ids.

        The text is lower-cased, every non-letter character becomes a space,
        and the result is split on whitespace. Unknown words map to
        ``vocab['UNK']``; short sequences are right-padded with
        ``vocab['PAD']`` and long ones are truncated.
        """
        words = re.sub(r'[^a-zA-Z]', ' ', text.lower()).split()
        ids = []
        for word in words:
            ids.append(vocab.get(word, vocab['UNK']))

        shortfall = max_length - len(ids)
        if shortfall > 0:
            ids += [vocab['PAD']] * shortfall
            return ids
        return ids[:max_length]

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(token_ids, label)`` pair as long tensors."""
        token_ids = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, label

# Wrap each split in a TextDataset; every sequence is padded/truncated to avg_length
train_dataset_rnn, val_dataset_rnn, test_dataset_rnn = (
    TextDataset(split_texts, split_labels, custom_vocab, avg_length)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (x_test, y_test),
    )
)

# Batch the datasets; only the training loader reshuffles its samples
train_loader_rnn = DataLoader(train_dataset_rnn, shuffle=True, batch_size=64)
val_loader_rnn = DataLoader(val_dataset_rnn, shuffle=False, batch_size=64)
test_loader_rnn = DataLoader(test_dataset_rnn, shuffle=False, batch_size=64)

## Naive Bayes Classifier

We train a Bernoulli Naive Bayes classifier using the bag-of-words representation and then evaluate its performance.

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Fit Bernoulli Naive Bayes (smoothing alpha=1.0) on the binary bag-of-words features
nb = BernoulliNB(alpha=1.0)
nb = nb.fit(X_train_binary, y_train_nb)

# Predict the test split and score it with all four metrics in one pass
predictions_nb = nb.predict(X_test_binary)
acc_nb, prec_nb, rec_nb, f1_nb = (
    metric(y_test_nb, predictions_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Naive Bayes: Accuracy={acc_nb:.4f}, Precision={prec_nb:.4f}, Recall={rec_nb:.4f}, F1={f1_nb:.4f}")

## RNN Model (GRU with Global Max Pooling)

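We now define an RNN model and instantiate it as a two-layer bidirectional GRU with global max pooling over the time steps. The model is trained on the tokenized sequences, and the validation loss is tracked after every epoch.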

In [None]:
class RNNModel(nn.Module):
    """Recurrent text classifier: embedding -> RNN/GRU/LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of the recurrent layer, per direction.
        output_dim: Number of sigmoid outputs per sample.
        model_type: One of ``'RNN'``, ``'GRU'`` or ``'LSTM'``; any other value
            raises ``KeyError``.
        use_pooling: If True, the sequence is summarized by a max over time
            steps; otherwise by the final hidden state of the last layer.
        num_layers: Number of stacked recurrent layers.
        bidirectional: Whether the recurrent layer runs in both directions.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,
                 model_type='GRU', use_pooling=True, num_layers=2, bidirectional=True):
        super().__init__()
        self.use_pooling = use_pooling
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Select the RNN variant
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[model_type]
        self.rnn = rnn_class(input_size=embed_dim,
                             hidden_size=hidden_dim,
                             num_layers=num_layers,
                             batch_first=True,
                             bidirectional=bidirectional)

        # A bidirectional layer concatenates both directions, doubling the feature size
        fc_input_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_input_dim, output_dim)

    def forward(self, x):
        """Map token ids ``(batch, seq_length)`` to probabilities ``(batch, output_dim)``."""
        embedded = self.embedding(x)  # (batch_size, seq_length, embed_dim)
        # output: (batch_size, seq_length, hidden_dim * (2 if bidirectional else 1))
        output, state = self.rnn(embedded)

        if self.use_pooling:
            # Global max pooling over the time dimension
            features = torch.max(output, dim=1)[0]
        else:
            # LSTM returns (h_n, c_n); RNN and GRU return h_n directly
            h_n = state[0] if isinstance(state, tuple) else state
            if self.bidirectional:
                # Use each direction's *final* state. output[:, -1, :] would pair the
                # forward final state with a backward state that has only seen the
                # last token. h_n[-2] / h_n[-1] are the last layer's forward /
                # backward final states.
                features = torch.cat((h_n[-2], h_n[-1]), dim=1)
            else:
                # Same values as output[:, -1, :] for a unidirectional layer
                features = h_n[-1]
        return torch.sigmoid(self.fc(features))

# Instantiate and train the GRU model

# Seed torch so weight initialization and batch shuffling are repeatable across
# reruns (same seed value as the train/validation split).
torch.manual_seed(42)

# PAD and UNK are already entries of custom_vocab; the +1 only leaves one spare embedding row
vocab_size = len(custom_vocab) + 1
embed_dim = 300
hidden_dim = 64
output_dim = 1

model_gru = RNNModel(vocab_size, embed_dim, hidden_dim, output_dim, model_type='GRU', use_pooling=True, num_layers=2, bidirectional=True)
model_gru = model_gru.float().to(device)
criterion = nn.BCELoss()  # the model already applies a sigmoid, so it outputs probabilities
optimizer = optim.Adam(model_gru.parameters(), lr=0.001)

print("Training GRU model...")
epochs = 10
train_losses, val_losses = [], []
for epoch in range(epochs):
    # ---- training pass ----
    model_gru.train()
    running_train_loss = 0
    for texts, labels in train_loader_rnn:
        optimizer.zero_grad()
        texts = texts.to(device)
        labels = labels.float().to(device)
        # Drop only the trailing output dimension: a bare squeeze() would also
        # remove the batch dimension for a batch of one sample and break the
        # shape match that BCELoss requires.
        outputs = model_gru(texts).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_train_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    avg_train_loss = running_train_loss / len(train_loader_rnn)
    train_losses.append(avg_train_loss)

    # ---- validation pass (no gradient tracking) ----
    model_gru.eval()
    running_val_loss = 0
    with torch.no_grad():
        for texts, labels in val_loader_rnn:
            texts = texts.to(device)
            labels = labels.float().to(device)
            outputs = model_gru(texts).squeeze(-1)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
    avg_val_loss = running_val_loss / len(val_loader_rnn)
    val_losses.append(avg_val_loss)

    # Log every second epoch (epoch index is zero-based here)
    if epoch % 2 == 0:
        print(f"Epoch: {epoch:4d} / {epochs} | Train Loss: {avg_train_loss:.5f}, Val Loss: {avg_val_loss:.5f}")

# Plot learning curves for the GRU model
plt.figure(figsize=(8,6))
plt.plot(range(1, epochs+1), train_losses, marker='o', label='Train Loss')
plt.plot(range(1, epochs+1), val_losses, marker='o', label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('GRU Model Learning Curves')
plt.legend()
plt.show()

In [None]:
# Evaluation function for RNN models
def evaluate_model(model, data_loader):
    """Score a binary classifier on every batch of ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Outputs above 0.5 count as class 1.

    Args:
        model: Module returning one probability per sample, shaped
            ``(batch, 1)``.
        data_loader: Iterable yielding ``(texts, labels)`` tensor batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for texts, labels in data_loader:
            texts = texts.to(device)
            # Squeeze only the trailing dimension: a bare squeeze() turns a
            # single-sample batch into a 0-d tensor, whose tolist() is a scalar
            # that cannot be passed to list.extend.
            probs = model(texts).squeeze(-1)
            # Cast to int so predictions have the same type as the labels
            preds = (probs > 0.5).long().cpu().tolist()
            y_true.extend(labels.tolist())
            y_pred.extend(preds)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return acc, prec, rec, f1

# Score the trained GRU on the held-out test split
gru_test_metrics = evaluate_model(model_gru, test_loader_rnn)
acc_gru, prec_gru, rec_gru, f1_gru = gru_test_metrics
print(f"GRU: Accuracy={acc_gru:.4f}, Precision={prec_gru:.4f}, Recall={rec_gru:.4f}, F1={f1_gru:.4f}")

## Comparing the Two Models

Now we compare the evaluation metrics of the Naive Bayes classifier and the GRU model on the test data. The metrics include accuracy, precision, recall, and F1 score.

In [None]:
# Assemble the comparison table column by column: one row per model
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
nb_scores = (acc_nb, prec_nb, rec_nb, f1_nb)
gru_scores = (acc_gru, prec_gru, rec_gru, f1_gru)

comparison_data = {'Model': ['Naive Bayes', 'GRU']}
for column, nb_value, gru_value in zip(metric_columns, nb_scores, gru_scores):
    comparison_data[column] = [nb_value, gru_value]

df_comparison = pd.DataFrame(comparison_data)
print(df_comparison.to_string(index=False))

## End of Comparison

The table above shows the performance of both the Naive Bayes classifier and the GRU-based RNN on the test set.