# Insightly - The Recurrent Neural Network Implementation
## RNN Model

### Author: Ronald Li

In [1]:
import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import re
from sklearn.metrics import classification_report

# Reproducibility
torch.manual_seed(42)
np.random.seed(42)

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

base_dir = "ecommerce_dataset"


Using device: mps


In [2]:
# Load preprocessed NumPy arrays and metadata

import pickle

X_train = np.load(os.path.join(base_dir, "X_train.npy"))
X_test = np.load(os.path.join(base_dir, "X_test.npy"))
y_train = np.load(os.path.join(base_dir, "y_train.npy"))
y_test = np.load(os.path.join(base_dir, "y_test.npy"))

with open(os.path.join(base_dir, "vocab.pkl"), "rb") as f:
    vocab = pickle.load(f)

with open(os.path.join(base_dir, "label_mapping.pkl"), "rb") as f:
    label2idx = pickle.load(f)

with open(os.path.join(base_dir, "sequence_length.txt"), "r") as f:
    sequence_length = int(f.read().strip())

vocab_size = len(vocab)
num_classes = len(label2idx) 

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("Vocab size:", vocab_size)
print("Sequence length:", sequence_length)
print("Label mapping:", label2idx)


X_train shape: (3200, 80)
X_test shape: (800, 80)
y_train shape: (3200,)
y_test shape: (800,)
Vocab size: 2944
Sequence length: 80
Label mapping: {'Negative': 0, 'Neutral': 1, 'Positive': 2}


In [3]:
# Build PyTorch datasets and data loaders

X_train_tensor = torch.LongTensor(X_train)
X_test_tensor = torch.LongTensor(X_test)
y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("Batch size:", batch_size)
print("Number of training batches:", len(train_loader))
print("Number of test batches:", len(test_loader))


Batch size: 32
Number of training batches: 100
Number of test batches: 25


In [4]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int = 64, hidden_size: int = 64, num_classes: int = 3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: (batch, seq_len)
        embedded = self.embedding(x)                 # (batch, seq_len, emb)
        out, _ = self.rnn(embedded)                  # (batch, seq_len, hidden)
        last_hidden = out[:, -1, :]                  # (batch, hidden)
        logits = self.fc(last_hidden)                # (batch, num_classes)
        return logits


model = SentimentRNN(vocab_size=vocab_size, num_classes=num_classes).to(device)
print(model)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")


SentimentRNN(
  (embedding): Embedding(2944, 64, padding_idx=0)
  (rnn): RNN(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)
Total parameters: 196,931


In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 20

train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

print("Starting baseline training...\n")

for epoch in range(1, EPOCHS + 1):
    # ----- Training -----
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        logits = model(xb)
        loss = criterion(logits, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    avg_train_loss = running_loss / total
    train_acc = correct / total
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_acc)

    # ----- Evaluation -----
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            logits = model(xb)
            loss = criterion(logits, yb)

            running_loss += loss.item() * xb.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    avg_test_loss = running_loss / total
    test_acc = correct / total
    test_losses.append(avg_test_loss)
    test_accuracies.append(test_acc)

    print(
        f"Epoch {epoch:02d}/{EPOCHS} | "
        f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | "
        f"Test Loss: {avg_test_loss:.4f} | Test Acc: {test_acc:.4f}"
    )

print("\nTraining finished.")

# Save training history and model for the analysis notebook
history_path = os.path.join(base_dir, "baseline_history.npz")
model_path = os.path.join(base_dir, "baseline_rnn.pt")

np.savez(
    history_path,
    train_losses=np.array(train_losses),
    test_losses=np.array(test_losses),
    train_accuracies=np.array(train_accuracies),
    test_accuracies=np.array(test_accuracies),
)

torch.save(model.state_dict(), model_path)

print("Saved training history to:", history_path)
print("Saved model weights to:", model_path)


Starting baseline training...

Epoch 01/20 | Train Loss: 0.2486 | Train Acc: 0.9463 | Test Loss: 0.2685 | Test Acc: 0.9363
Epoch 02/20 | Train Loss: 0.2472 | Train Acc: 0.9463 | Test Loss: 0.2637 | Test Acc: 0.9375
Epoch 03/20 | Train Loss: 0.2441 | Train Acc: 0.9459 | Test Loss: 0.2751 | Test Acc: 0.9363
Epoch 04/20 | Train Loss: 0.2441 | Train Acc: 0.9466 | Test Loss: 0.2733 | Test Acc: 0.9363
Epoch 05/20 | Train Loss: 0.2446 | Train Acc: 0.9459 | Test Loss: 0.2746 | Test Acc: 0.9363
Epoch 06/20 | Train Loss: 0.2442 | Train Acc: 0.9466 | Test Loss: 0.2705 | Test Acc: 0.9375
Epoch 07/20 | Train Loss: 0.2434 | Train Acc: 0.9466 | Test Loss: 0.2685 | Test Acc: 0.9375
Epoch 08/20 | Train Loss: 0.2437 | Train Acc: 0.9466 | Test Loss: 0.2674 | Test Acc: 0.9387
Epoch 09/20 | Train Loss: 0.2434 | Train Acc: 0.9463 | Test Loss: 0.2760 | Test Acc: 0.9350
Epoch 10/20 | Train Loss: 0.2441 | Train Acc: 0.9466 | Test Loss: 0.2706 | Test Acc: 0.9363
Epoch 11/20 | Train Loss: 0.2431 | Train Acc: 0.9

### Testing with Custom Reviews

In [7]:
def clean_text(text: str) -> str:
    """Lowercase, remove non-letter characters, normalize whitespace."""
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = " ".join(text.split())
    return text


def tokenize(text: str):
    """Very simple whitespace tokenization."""
    return text.split()


def tokens_to_ids(tokens):
    return [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]


def pad_sequence(seq, max_len, pad_value=0):
    if len(seq) >= max_len:
        return seq[:max_len]
    return seq + [pad_value] * (max_len - len(seq))


# Map predicted indices back to label names
idx2label = {v: k for k, v in label2idx.items()}


def predict_sentiments(texts, model_to_use):
    """Run the trained 3-class sentiment model on a list of raw text reviews."""
    model_to_use.eval()

    processed = []
    for text in texts:
        cleaned = clean_text(text)
        tokens = tokenize(cleaned)
        ids = tokens_to_ids(tokens)
        padded = pad_sequence(ids, sequence_length)
        processed.append(padded)

    X_new = torch.LongTensor(processed).to(device)

    with torch.no_grad():
        logits = model_to_use(X_new)  # (batch, num_classes)
        probs = torch.softmax(logits, dim=1)
        pred_indices = torch.argmax(probs, dim=1).cpu().numpy()
        probs_np = probs.cpu().numpy()

    results = []
    for text, idx, prob_vec in zip(texts, pred_indices, probs_np):
        label = idx2label[int(idx)]
        confidence = float(prob_vec[idx])
        results.append({"text": text, "label": label, "confidence": confidence})

    return results


sample_reviews = [
    "This product is absolutely amazing! Best purchase ever!",
    "Terrible quality, complete waste of money. Very disappointed.",
    "It's okay, nothing special but does the job.",
    "Good value for the price. Pretty satisfied with this purchase.",
    "Worst product I've ever bought. Do not recommend!"
]


In [8]:
print("Baseline model predictions (unweighted):")
for r in predict_sentiments(sample_reviews, model):
    print("Text:", r["text"])
    print(f"Predicted sentiment: {r['label']} (confidence {r['confidence']:.3f})")
    print("-" * 80)

Baseline model predictions (unweighted):
Text: This product is absolutely amazing! Best purchase ever!
Predicted sentiment: Positive (confidence 0.947)
--------------------------------------------------------------------------------
Text: Terrible quality, complete waste of money. Very disappointed.
Predicted sentiment: Positive (confidence 0.947)
--------------------------------------------------------------------------------
Text: It's okay, nothing special but does the job.
Predicted sentiment: Positive (confidence 0.947)
--------------------------------------------------------------------------------
Text: Good value for the price. Pretty satisfied with this purchase.
Predicted sentiment: Positive (confidence 0.947)
--------------------------------------------------------------------------------
Text: Worst product I've ever bought. Do not recommend!
Predicted sentiment: Positive (confidence 0.947)
--------------------------------------------------------------------------------


### Model Adjustments

Recall that about 94% of the reviews are positive, with very few negative or neutral ones, so positive reviews are heavily overrepresented in the data. We address this imbalance by using class weights.

In [9]:
weighted_model = SentimentRNN(vocab_size=vocab_size, num_classes=num_classes).to(device)

# Compute class weights from training labels 
class_counts = np.bincount(y_train)
print("\nClass counts in training set:")
for idx, count in enumerate(class_counts):
    print(f"  class {idx}: {count} samples")

class_weights = len(y_train) / (len(class_counts) * class_counts)
print("Class weights (used in weighted loss):", class_weights)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

criterion_weighted = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer_weighted = torch.optim.Adam(weighted_model.parameters(), lr=1e-3)




Class counts in training set:
  class 0: 74 samples
  class 1: 127 samples
  class 2: 2999 samples
Class weights (used in weighted loss): [14.41441441  8.39895013  0.35567411]


Let's retrain the model using a class-weighted cross-entropy loss to address the strong class imbalance.

In [13]:
EPOCHS_WEIGHTED = 20

train_losses_w = []
train_accuracies_w = []
test_losses_w = []
test_accuracies_w = []

print("\nStarting training with class-weighted loss...\n")

for epoch in range(1, EPOCHS_WEIGHTED + 1):
    # ----- Training -----
    weighted_model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        logits = weighted_model(xb)
        loss = criterion_weighted(logits, yb)

        optimizer_weighted.zero_grad()
        loss.backward()
        optimizer_weighted.step()

        running_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    avg_train_loss = running_loss / total
    train_acc = correct / total
    train_losses_w.append(avg_train_loss)
    train_accuracies_w.append(train_acc)

    # ----- Evaluation -----
    weighted_model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            logits = weighted_model(xb)
            loss = criterion_weighted(logits, yb)

            running_loss += loss.item() * xb.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    avg_test_loss = running_loss / total
    test_acc = correct / total
    test_losses_w.append(avg_test_loss)
    test_accuracies_w.append(test_acc)

    print(
        f"[Weighted] Epoch {epoch:02d}/{EPOCHS_WEIGHTED} | "
        f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | "
        f"Test Loss: {avg_test_loss:.4f} | Test Acc: {test_acc:.4f}"
    )

print("\nWeighted training finished.")

# Save history and model separately so we keep the original artifacts
history_path_w = os.path.join(base_dir, "baseline_weighted_history.npz")
model_path_w = os.path.join(base_dir, "baseline_rnn_weighted.pt")

np.savez(
    history_path_w,
    train_losses=np.array(train_losses_w),
    test_losses=np.array(test_losses_w),
    train_accuracies=np.array(train_accuracies_w),
    test_accuracies=np.array(test_accuracies_w),
)

torch.save(weighted_model.state_dict(), model_path_w)

print("Saved weighted training history to:", history_path_w)
print("Saved weighted model weights to:", model_path_w)




Starting training with class-weighted loss...

[Weighted] Epoch 01/20 | Train Loss: 0.9686 | Train Acc: 0.8119 | Test Loss: 1.0570 | Test Acc: 0.6937
[Weighted] Epoch 02/20 | Train Loss: 0.9720 | Train Acc: 0.9072 | Test Loss: 1.0835 | Test Acc: 0.9137
[Weighted] Epoch 03/20 | Train Loss: 0.9699 | Train Acc: 0.8016 | Test Loss: 1.0745 | Test Acc: 0.8175
[Weighted] Epoch 04/20 | Train Loss: 0.9748 | Train Acc: 0.8341 | Test Loss: 1.0719 | Test Acc: 0.9113
[Weighted] Epoch 05/20 | Train Loss: 0.9493 | Train Acc: 0.8109 | Test Loss: 1.0874 | Test Acc: 0.7400
[Weighted] Epoch 06/20 | Train Loss: 0.9712 | Train Acc: 0.7966 | Test Loss: 1.1021 | Test Acc: 0.9012
[Weighted] Epoch 07/20 | Train Loss: 0.9733 | Train Acc: 0.8231 | Test Loss: 1.0843 | Test Acc: 0.6325
[Weighted] Epoch 08/20 | Train Loss: 0.9430 | Train Acc: 0.9009 | Test Loss: 1.0958 | Test Acc: 0.9137
[Weighted] Epoch 09/20 | Train Loss: 0.9566 | Train Acc: 0.8078 | Test Loss: 1.1087 | Test Acc: 0.8250
[Weighted] Epoch 10/20 | 

In [14]:
print("\nWeighted model predictions (class-weighted):")
for r in predict_sentiments(sample_reviews, weighted_model):
    print("Text:", r["text"])
    print(f"Predicted sentiment: {r['label']} (confidence {r['confidence']:.3f})")
    print("-" * 80)



Weighted model predictions (class-weighted):
Text: This product is absolutely amazing! Best purchase ever!
Predicted sentiment: Positive (confidence 0.374)
--------------------------------------------------------------------------------
Text: Terrible quality, complete waste of money. Very disappointed.
Predicted sentiment: Positive (confidence 0.389)
--------------------------------------------------------------------------------
Text: It's okay, nothing special but does the job.
Predicted sentiment: Neutral (confidence 0.423)
--------------------------------------------------------------------------------
Text: Good value for the price. Pretty satisfied with this purchase.
Predicted sentiment: Positive (confidence 0.431)
--------------------------------------------------------------------------------
Text: Worst product I've ever bought. Do not recommend!
Predicted sentiment: Positive (confidence 0.494)
--------------------------------------------------------------------------------

While the weighted model did not improve overall performance, let's check the per-class metrics like precision/recall for Positive/Neutral/Negative using a classification report.

In [15]:
def evaluate_model_on_test(model_to_use, name: str):
    """Print a classification report for the given model on the test set."""
    model_to_use.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            logits = model_to_use(xb)
            preds = logits.argmax(dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(yb.cpu().numpy())

    # Ensure labels are in the correct order 0, 1, 2 -> Negative, Neutral, Positive
    label_order = sorted(idx2label.keys())
    target_names = [idx2label[i] for i in label_order]

    print(f"\n{name}")
    print(classification_report(all_labels, all_preds, labels=label_order, target_names=target_names))


# Compare baseline and class-weighted models on the test set
evaluate_model_on_test(model, "Baseline model (unweighted)")
evaluate_model_on_test(weighted_model, "Weighted model (class-weighted loss)")




Baseline model (unweighted)
              precision    recall  f1-score   support

    Negative       0.25      0.11      0.15        19
     Neutral       1.00      0.10      0.18        31
    Positive       0.94      0.99      0.97       750

    accuracy                           0.94       800
   macro avg       0.73      0.40      0.43       800
weighted avg       0.93      0.94      0.92       800


Weighted model (class-weighted loss)
              precision    recall  f1-score   support

    Negative       0.07      0.05      0.06        19
     Neutral       0.04      0.16      0.06        31
    Positive       0.94      0.82      0.87       750

    accuracy                           0.77       800
   macro avg       0.35      0.34      0.33       800
weighted avg       0.88      0.77      0.82       800



It looks like the classâ€‘weighted model is worse overall and only slightly better at catching minority examples, with many more false positives. 