# Insightly - The Recurrent Neural Network Implementation
## Data Preprocessing

### Author: Ronald Li

### Setup - import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
import os
import pickle


import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
# Fix the NumPy and PyTorch seeds so repeated runs produce the same numbers
np.random.seed(42)
torch.manual_seed(42)

# Pick the compute device: Apple MPS first, then CUDA, otherwise the CPU
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available() else "cpu"
)
print("Using device:", device)

Using device: mps


### Data Preprocessing

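Load and inspect the e-commerce reviews dataset, keeping only the review text and sentiment columns and mapping each sentiment to an integer class label.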


In [2]:

# Locate and read the training CSV inside the dataset folder
base_dir = "ecommerce_dataset"
train_path = os.path.join(base_dir, "train_data.csv")

raw_df = pd.read_csv(train_path)
print(f"Loaded train_data.csv with shape: {raw_df.shape}")
print("Columns:", raw_df.columns.tolist())

# Column names plus the sentiment -> class-index mapping used below
text_col = "reviews.text"
label_col = "sentiment"
valid_labels = ["Negative", "Neutral", "Positive"]
label2idx = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Restrict to the two columns of interest and discard incomplete rows
df = raw_df.loc[:, [text_col, label_col]].dropna()
print(f"After dropping NAs: {df.shape[0]} rows")

# Retain only rows whose sentiment is one of the three known classes
is_known_label = df[label_col].isin(valid_labels)
df = df.loc[is_known_label].copy()
print("Label distribution (all three sentiments):")
print(df[label_col].value_counts())

# Numeric target column: Negative -> 0, Neutral -> 1, Positive -> 2
df = df.assign(label=df[label_col].map(label2idx))

print("\nFirst few cleaned rows (text + label):")
print(df[[text_col, label_col, "label"]].head())


Loaded train_data.csv with shape: (4000, 8)
Columns: ['name', 'brand', 'categories', 'primaryCategories', 'reviews.date', 'reviews.text', 'reviews.title', 'sentiment']
After dropping NAs: 4000 rows
Label distribution (all three sentiments):
sentiment
Positive    3749
Neutral      158
Negative      93
Name: count, dtype: int64

First few cleaned rows (text + label):
                                        reviews.text sentiment  label
0  Purchased on Black FridayPros - Great Price (e...  Positive      2
1  I purchased two Amazon in Echo Plus and two do...  Positive      2
2  Just an average Alexa option. Does show a few ...   Neutral      1
3  very good product. Exactly what I wanted, and ...  Positive      2
4  This is the 3rd one I've purchased. I've bough...  Positive      2


In [3]:
# Basic text cleaning and tokenization

def clean_text(text: str) -> str:
    """Normalize a raw review into lowercase, space-separated letter runs.

    The input is coerced with ``str()`` and lowercased, every character that
    is neither ``a``-``z`` nor whitespace is replaced by a space, and runs of
    whitespace are collapsed to single spaces with no leading/trailing space.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)
    # split() with no argument drops empty pieces, so re-joining collapses
    # whitespace runs and trims both ends in one step.
    return " ".join(letters_only.split())


def tokenize(text: str):
    """Split text on whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens


# Normalize every review, discard the ones left empty, then tokenize
df["cleaned_text"] = df[text_col].map(clean_text)
has_text = df["cleaned_text"].str.len() > 0
df = df.loc[has_text].copy()

df["tokens"] = df["cleaned_text"].map(tokenize)

print(f"Number of reviews after cleaning: {len(df)}")
print("Example cleaned review:")
print(df[[text_col, "cleaned_text", "tokens"]].head(1))


Number of reviews after cleaning: 4000
Example cleaned review:
                                        reviews.text  \
0  Purchased on Black FridayPros - Great Price (e...   

                                        cleaned_text  \
0  purchased on black fridaypros great price even...   

                                              tokens  
0  [purchased, on, black, fridaypros, great, pric...  


In [4]:
# Build a simple vocabulary based on token frequency

# Flatten the per-review token lists into one corpus-wide list
all_tokens = [tok for toks in df["tokens"] for tok in toks]

word_counts = Counter(all_tokens)
print(f"Total tokens: {len(all_tokens):,}")
print(f"Unique tokens: {len(word_counts):,}")

min_freq = 2  # ignore very rare words

# Words seen at least min_freq times, in first-seen order
frequent_words = [w for w, c in word_counts.items() if c >= min_freq]

# Ids 0 and 1 are reserved for padding and unknown tokens; every kept word
# takes the next free id.
vocab = {"<PAD>": 0, "<UNK>": 1}
for word in frequent_words:
    vocab[word] = len(vocab)

vocab_size = len(vocab)
print(f"Vocabulary size (including <PAD>/<UNK>): {vocab_size}")

# Quick peek at most frequent tokens
print("Most common tokens:")
for token, count in word_counts.most_common(10):
    print(f"  {token}: {count}")


Total tokens: 123,292
Unique tokens: 4,778
Vocabulary size (including <PAD>/<UNK>): 2944
Most common tokens:
  the: 5365
  it: 4201
  and: 3990
  to: 3990
  i: 3904
  for: 2965
  a: 2734
  is: 2418
  my: 2274
  this: 2103


In [5]:
# Convert tokens to integer sequences and pad to a fixed length

def tokens_to_ids(tokens):
    """Map a list of tokens to their integer ids in the module-level ``vocab``.

    Tokens that are not present in ``vocab`` are mapped to the id stored
    under ``"<UNK>"``.

    Args:
        tokens: iterable of token strings.

    Returns:
        A list of integer ids, one per input token, in the same order.
    """
    # Look the fallback id up once: dict.get evaluates its default argument
    # eagerly, so leaving it inline repeats the lookup for every token.
    unk_id = vocab["<UNK>"]
    return [vocab.get(tok, unk_id) for tok in tokens]


# Encode every token list as a list of vocabulary ids
df["sequence"] = df["tokens"].map(tokens_to_ids)

# Per-review lengths drive the choice of the fixed sequence length
seq_lengths = df["sequence"].map(len)
length_p95 = np.percentile(seq_lengths, 95)

print("Sequence length stats:")
print("  min:", int(seq_lengths.min()))
print("  max:", int(seq_lengths.max()))
print("  mean:", float(seq_lengths.mean()))
print("  95th percentile:", float(length_p95))

# Use the 95th percentile as a simple fixed length
sequence_length = int(length_p95)
print(f"\nUsing fixed sequence length: {sequence_length}")


def pad_sequence(seq, max_len, pad_value=0):
    """Return ``seq`` right-padded with ``pad_value`` or truncated to ``max_len``.

    Sequences shorter than ``max_len`` get ``pad_value`` appended; all others
    are cut down to their first ``max_len`` items.
    """
    shortfall = max_len - len(seq)
    if shortfall > 0:
        return seq + [pad_value] * shortfall
    return seq[:max_len]


# Pad or truncate every id sequence to the fixed length chosen above
df["padded"] = df["sequence"].apply(pad_sequence, args=(sequence_length,))

print("Example padded sequence:")
print(df[["cleaned_text", "padded"]].head(1))


Sequence length stats:
  min: 1
  max: 1606
  mean: 30.823
  95th percentile: 80.0

Using fixed sequence length: 80
Example padded sequence:
                                        cleaned_text  \
0  purchased on black fridaypros great price even...   

                                              padded  
0  [2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...  


In [6]:
# Create NumPy arrays, perform a simple train/test split, and save artifacts

# Every padded sequence has length `sequence_length`, so the column stacks
# into a 2-D integer matrix with one row per review.
X = np.asarray(df["padded"].tolist(), dtype=np.int64)
y = df["label"].values.astype(np.int64)

print("Feature matrix shape:", X.shape)
print("Label vector shape:", y.shape)
print("Label distribution (0=Negative, 1=Neutral, 2=Positive):")
(unique, counts) = np.unique(y, return_counts=True)
# .tolist() converts to plain Python ints so the dict prints as
# {0: ..., 1: ..., 2: ...} instead of NumPy scalar reprs such as np.int64(0).
print(dict(zip(unique.tolist(), counts.tolist())))

# 80/20 train/test split (stratified so each split keeps the class ratios)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nTrain shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

# Save processed arrays and metadata for the RNN model notebook
X_train_path = os.path.join(base_dir, "X_train.npy")
X_test_path = os.path.join(base_dir, "X_test.npy")
y_train_path = os.path.join(base_dir, "y_train.npy")
y_test_path = os.path.join(base_dir, "y_test.npy")

for array_path, array in (
    (X_train_path, X_train),
    (X_test_path, X_test),
    (y_train_path, y_train),
    (y_test_path, y_test),
):
    np.save(array_path, array)

# Save vocab and label mapping via pickle
vocab_path = os.path.join(base_dir, "vocab.pkl")
label_map_path = os.path.join(base_dir, "label_mapping.pkl")
seq_len_path = os.path.join(base_dir, "sequence_length.txt")

for pickle_path, obj in ((vocab_path, vocab), (label_map_path, label2idx)):
    with open(pickle_path, "wb") as f:
        pickle.dump(obj, f)

# The sequence length is stored as plain text; an explicit encoding keeps the
# file independent of the platform default.
with open(seq_len_path, "w", encoding="utf-8") as f:
    f.write(str(sequence_length))

print("\nSaved:")
for saved_path in (
    X_train_path,
    X_test_path,
    y_train_path,
    y_test_path,
    vocab_path,
    label_map_path,
    seq_len_path,
):
    print("  ", saved_path)


Feature matrix shape: (4000, 80)
Label vector shape: (4000,)
Label distribution (0=Negative, 1=Neutral, 2=Positive):
{np.int64(0): np.int64(93), np.int64(1): np.int64(158), np.int64(2): np.int64(3749)}

Train shape: (3200, 80) (3200,)
Test shape: (800, 80) (800,)

Saved:
   ecommerce_dataset/X_train.npy
   ecommerce_dataset/X_test.npy
   ecommerce_dataset/y_train.npy
   ecommerce_dataset/y_test.npy
   ecommerce_dataset/vocab.pkl
   ecommerce_dataset/label_mapping.pkl
   ecommerce_dataset/sequence_length.txt
