In [None]:
!pip install datasets transformers
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np

# 2. Define the RNN model
class RNNSentiment(nn.Module):
    """Sequence classifier: embedding -> stacked RNN -> dropout -> linear head.

    Args:
        input_dim: Size of the vocabulary (number of rows in the embedding table).
        embedding_dim: Length of each token embedding vector.
        hidden_dim: Number of hidden units in every RNN layer.
        output_dim: Number of values produced by the final linear layer.
        n_layers: Number of stacked RNN layers.
        dropout: Dropout probability, used between RNN layers and before the
            linear head.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        # Embedding layer: converts token indices into dense vectors of size `embedding_dim`
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # RNN layer: processes the embedded sequence.
        # `batch_first=True` makes input/output shaped (batch_size, seq_len, features).
        # nn.RNN only applies dropout *between* stacked layers, so with a single
        # layer a non-zero value does nothing except raise a warning; pass 0 then.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, dropout=rnn_dropout, batch_first=True)

        # Fully connected layer mapping the selected hidden state to the outputs
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Dropout applied to the selected hidden state before the linear head
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, attention_mask=None):
        """Compute the raw output scores for a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids, shaped (batch_size, seq_len).
            attention_mask: Optional tensor with the same shape as `text`,
                non-zero (1) at real tokens and 0 at padding. When given, the
                hidden state at the last real token of each row is used for
                classification. When omitted, the final time step is used,
                which is only meaningful if that position holds a real token.

        Returns:
            Tensor shaped (batch_size, output_dim) holding raw (pre-sigmoid) scores.
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(text)

        # `rnn_out` holds the top-layer hidden state at every time step:
        # (batch_size, seq_len, hidden_dim). The per-layer final states are not needed.
        rnn_out, _ = self.rnn(embedded)

        if attention_mask is None:
            # Original behaviour: take the final time step of every sequence
            last_hidden = rnn_out[:, -1, :]
        else:
            # Find the index of the last real token in each row. Multiplying the
            # 0/1 mask by the position index and taking the argmax works for
            # both left- and right-padded batches; a row with no real tokens
            # falls back to position 0.
            mask = attention_mask.to(device=rnn_out.device, dtype=torch.long)
            positions = torch.arange(rnn_out.size(1), device=rnn_out.device)
            last_idx = (mask * positions).argmax(dim=1)
            batch_idx = torch.arange(rnn_out.size(0), device=rnn_out.device)
            last_hidden = rnn_out[batch_idx, last_idx]

        # Apply dropout
        last_hidden = self.dropout(last_hidden)

        # Output layer
        return self.fc(last_hidden)

# 3. Set up hyperparameters
# -- Model architecture --
embedding_dim = 100  # length of each token embedding vector
hidden_dim = 128     # hidden units in every RNN layer
n_layers = 2         # number of stacked RNN layers
output_dim = 1       # Binary classification: one logit per example
dropout = 0.2        # dropout probability handed to the model

# -- Training schedule --
batch_size = 64
epochs = 5
learning_rate = 1e-3

# 4. Load the IMDb dataset using Hugging Face
dataset = load_dataset('imdb')

# Tokenizer
# AutoTokenizer is a generic class that loads the tokenizer matching a pre-trained model name
from transformers import AutoTokenizer

# Load the pre-trained tokenizer for the uncased BERT model.
# It is used here only to convert raw text into token IDs.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Pad on the LEFT instead of the default right side.
# The texts are padded to a fixed length and the RNN classifier in this file
# reads the hidden state of the final time step (`rnn_out[:, -1, :]`). With
# right padding that final position is a [PAD] token for every review shorter
# than the maximum length, so the classifier would see the state after a long
# run of padding rather than after the review text. Left padding puts the real
# tokens at the end of the sequence, where the model actually looks.
tokenizer.padding_side = 'left'

# Tokenize the data
# Function to tokenize input text data
# This function takes in a dictionary of examples (typically with a 'text' field) and tokenizes them
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the raw text(s) to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, with
        truncation enabled and padding set to 'max_length' so every sequence
        comes back with the same length.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Take a random sample from the train and test sets
# Each split is shuffled with a fixed seed (42) for reproducibility, the first
# N shuffled rows are kept, and the kept rows are tokenized in batches.
# `select` accepts a `range` directly, so no intermediate index list is built.

# Training sample: 4% of the full training split
train_sample_size = int(0.04 * len(dataset['train']))
train_data = dataset['train'].shuffle(seed=42).select(range(train_sample_size)).map(tokenize_function, batched=True)

# Test sample: 1% of the full test split
test_sample_size = int(0.01 * len(dataset['test']))
test_data = dataset['test'].shuffle(seed=42).select(range(test_sample_size)).map(tokenize_function, batched=True)

# 5. Create DataLoader for batching
# Each tokenized split is turned into a TensorDataset of three aligned tensors:
#   `input_ids`      - the token IDs
#   `attention_mask` - 1 for real tokens, 0 for padding
#   `label`          - the target label of each example
def _to_tensor_dataset(split):
    # Build the tensors in a fixed column order so batches unpack as
    # (input_ids, attention_mask, labels)
    columns = ('input_ids', 'attention_mask', 'label')
    tensors = [torch.tensor(split[column]) for column in columns]
    return torch.utils.data.TensorDataset(*tensors)


train_dataset = _to_tensor_dataset(train_data)
test_dataset = _to_tensor_dataset(test_data)

# Wrap the datasets in DataLoaders that yield batches of `batch_size` examples.
# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

# 6. Initialize the model
# The embedding table needs one row per token ID, so the model's input
# dimension is the tokenizer's vocabulary size.
input_dim = tokenizer.vocab_size

# Build the RNN sentiment classifier from the hyperparameters defined earlier
model = RNNSentiment(
    input_dim=input_dim,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    dropout=dropout,
)

# 7. Define loss function and optimizer
# BCEWithLogitsLoss takes raw logits (the un-normalised outputs of the last
# layer) and applies the sigmoid internally, so the model itself must not
# apply a sigmoid before the loss.
criterion = nn.BCEWithLogitsLoss()

# Adam updates the model parameters; `learning_rate` is its step size
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 8. Train the model
# One epoch is a full pass over `train_loader`; loss and accuracy are reported per epoch
for epoch in range(epochs):
    model.train()  # training mode (dropout active)
    total_loss = 0
    correct_preds = 0
    total_preds = 0

    for input_ids, attention_mask, labels in train_loader:
        # Clear gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass: raw logits, one per example, shaped (batch, 1)
        output = model(input_ids)
        logits = output.squeeze(1)  # drop the trailing dimension to match `labels`

        # Loss between logits and the float-cast targets, then backprop and update
        loss = criterion(logits, labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Accuracy bookkeeping: sigmoid -> round gives a 0/1 prediction per example
        preds = torch.round(torch.sigmoid(logits))
        matches = preds == labels
        correct_preds += matches.sum().item()
        total_preds += labels.size(0)

    # Epoch summary: mean batch loss and fraction of correct predictions
    train_accuracy = correct_preds / total_preds
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {mean_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")

# 9. Evaluate the model
# Evaluation mode switches off dropout so predictions are deterministic
model.eval()
correct_preds, total_preds = 0, 0

# Gradients are not needed for inference; disabling them saves memory and time
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        output = model(input_ids)
        # sigmoid -> round turns each logit into a 0/1 prediction
        preds = torch.sigmoid(output).round().squeeze(1)
        matches = preds == labels
        correct_preds += matches.sum().item()
        total_preds += labels.size(0)

# Fraction of test examples predicted correctly
test_accuracy = correct_preds / total_preds
print(f"Test Accuracy: {test_accuracy:.4f}")


# After evaluating the model, show one test example together with its predicted sentiment
model.eval()
with torch.no_grad():
    # Draw a random position in the test dataset
    idx = np.random.randint(len(test_dataset))
    input_ids, attention_mask, label = test_dataset[idx]

    # The model expects a batch, so add a leading batch dimension of size 1
    input_ids = input_ids[None]
    attention_mask = attention_mask[None]

    # Logit -> probability -> rounded 0/1 prediction as a Python number
    output = model(input_ids)
    probability = torch.sigmoid(output)
    prediction = torch.round(probability).squeeze(1).item()

    # Translate the numeric prediction into a readable label
    if prediction == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    # Recover the text from the token IDs, dropping special tokens
    sentence = tokenizer.decode(input_ids[0], skip_special_tokens=True)

    # Report the text and the predicted sentiment
    print(f"Sample Sentence: {sentence}")
    print(f"Predicted Sentiment: {sentiment}")




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Epoch 1/5 | Loss: 0.7001 | Train Accuracy: 0.5050
Epoch 2/5 | Loss: 0.6991 | Train Accuracy: 0.5190
Epoch 3/5 | Loss: 0.6816 | Train Accuracy: 0.5490
Epoch 4/5 | Loss: 0.6657 | Train Accuracy: 0.5520
Epoch 5/5 | Loss: 0.6377 | Train Accuracy: 0.5740
Test Accuracy: 0.5080
Sample Sentence: like " the blair witch project " before it, " hatchet " has garnered its own fair share of publicity from the bottom - on - up ( as an avid reader of fangoria magazine, the full - page ads are hard to miss ) ; even after its middling theatrical run, the film is bound to subsist solely on the hype surrounding it, and will probably turn into a cult item at some point. with a myspace url and a mighty ( if puzzlingly subjective ) promise of preserving so - called " old school american horror, " " hatchet " will draw a lot of curiosity seekers with its dvd release ( where that claim is emblazoned on the disc itself ). perhaps it was the large - print blurb from ain ' t it cool news on the ads that caused me