<a href="https://colab.research.google.com/github/SM-Learning/advanced-rag-techniques/blob/main/Text_Sentiment_Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =========================
# 1. SETUP & DATA DOWNLOAD
# =========================

# Install required packages (if running in Colab, uncomment these lines)
# !pip install torch torchvision torchaudio
# !pip install pandas scikit-learn matplotlib tqdm nltk tokenizers psutil
# !pip install torchsummary

import os
import gzip
import json
import random
import urllib.request
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gc
import time
import psutil
import ssl

# Fetch the NLTK corpora that back the `stopwords` and `WordNetLemmatizer` imports
nltk.download('stopwords')
nltk.download('wordnet')

# Seed the Python, NumPy and PyTorch random number generators with one shared
# value so that repeated runs draw the same random numbers
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Optional CUDA-specific seeding / determinism switches, currently disabled:
#torch.cuda.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


<torch._C.Generator at 0x7f9c182a88b0>

In [5]:
# WARNING(security): this swaps the default HTTPS context factory for one that
# performs no certificate verification. It affects every later HTTPS request in
# this process that relies on the default context, including the dataset
# download in the next cells.
# NOTE(review): presumably a workaround for a certificate problem on the data
# host -- confirm, and prefer fixing the trust store over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

In [6]:
# DISABLED: downloader for the smaller "All Beauty" dataset.
# The code below is wrapped in a string literal, so none of it executes; the
# "Books" dataset is downloaded by the following cell instead.
'''
DATA_URL = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz"
DATA_FILE = "All_Beauty_5.json.gz"
if not os.path.exists(DATA_FILE):
    print("Downloading dataset...")
    urllib.request.urlretrieve(DATA_URL, DATA_FILE)
    print("Download complete.")
'''

Downloading dataset...
Download complete.


In [None]:
# Download the "Books" dataset if not already present.
# The archive is written to a temporary ".part" file and renamed only once the
# transfer has finished, so an interrupted download is never mistaken for a
# complete file by the existence check on the next run.
DATA_URL = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"
DATA_FILE = "Books_5.json.gz"
if not os.path.exists(DATA_FILE):
    print("Downloading dataset...")
    _partial_file = DATA_FILE + ".part"
    try:
        urllib.request.urlretrieve(DATA_URL, _partial_file)
        # os.replace is atomic on the same filesystem: DATA_FILE is either
        # absent or complete, never half-written.
        os.replace(_partial_file, DATA_FILE)
    finally:
        # Clean up leftovers from a failed or interrupted transfer.
        if os.path.exists(_partial_file):
            os.remove(_partial_file)
    print("Download complete.")

Downloading dataset...


In [None]:
# =========================
# 2. DATA LOADING & PREVIEW
# =========================

# Load a sample of the data to preview all fields
def load_sample_records(filename, n=2):
    """Return the first ``n`` records of a gzipped JSON-lines file as a DataFrame.

    Every line read from ``filename`` is parsed as one JSON document. When the
    file holds fewer than ``n`` lines, all of them are returned.
    """
    rows = []
    with gzip.open(filename, mode='rt', encoding='utf-8') as handle:
        for _ in range(n):
            raw = handle.readline()
            if not raw:  # end of file reached before n records were read
                break
            rows.append(json.loads(raw))
    return pd.DataFrame(rows)

# Peek at the first two raw records to see which fields the dataset provides
sample_df = load_sample_records(DATA_FILE, n=2)
print("Sample records with all fields:\n", sample_df)

In [None]:
# =========================
# 3. LOAD & BALANCE DATASET
# =========================

# Load 200K records and keep only reviewText and overall
def load_balanced_subset(filename, n_total=200000):
    """Load up to ``n_total`` rated reviews from a gzipped JSON-lines file.

    Despite the name, the classes are NOT balanced: the natural rating
    distribution of the first ``n_total`` usable records is kept (the optional
    balancing code is left commented out below).

    Args:
        filename: Path to a gzip file with one JSON object per line.
        n_total: Number of records (having both ``reviewText`` and ``overall``)
            to collect before reading stops.

    Returns:
        A DataFrame with columns ``reviewText`` and ``overall`` (integer rating
        in 1..5). Rows with missing values or out-of-range ratings are dropped.
        The frame is empty, but still has both columns, when nothing matched.
    """
    # Read records until n_total usable ones have been collected
    records = []
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        for line in tqdm(f, total=n_total):
            rec = json.loads(line)
            if 'reviewText' in rec and 'overall' in rec:
                records.append({'reviewText': rec['reviewText'], 'overall': int(float(rec['overall']))})
            if len(records) >= n_total:
                break
    # Naming the columns explicitly keeps the filters below working even when
    # no record was collected (an empty frame would otherwise lack 'overall').
    df = pd.DataFrame(records, columns=['reviewText', 'overall'])
    # Remove any rows with missing data or out-of-range ratings
    df = df[df['overall'].isin([1, 2, 3, 4, 5])].dropna()
    # Note: For large datasets, natural distribution is often better for generalization.
    # If you want to balance, uncomment the next lines.
    # min_count = df['overall'].value_counts().min()
    # df = df.groupby('overall').sample(n=min_count, random_state=SEED)
    print("Class distribution (note: natural, not balanced):\n", df['overall'].value_counts())
    return df

# Collect up to 200K (reviewText, overall) rows; the class distribution is left as found
df = load_balanced_subset(DATA_FILE, n_total=200000)

In [None]:
# NOTE(review): 'punkt_tab' is not referenced by any code visible in this
# notebook (tokenization below uses BPE) -- confirm this download is still needed.
import nltk
nltk.download('punkt_tab')

In [None]:
# =========================
# 4. BPE TOKENIZATION
# =========================

# Train a Byte-Pair Encoding (BPE) tokenizer on the review texts.
# Requires the Hugging Face `tokenizers` package (`pip install tokenizers`);
# the class was previously used without ever being imported.
from tokenizers import ByteLevelBPETokenizer

# Id 0 is used for padding further down and as `padding_idx` of the embedding
# layer, so it is reserved for a dedicated special token. Special tokens receive
# the first ids; otherwise id 0 would belong to a regular vocabulary entry whose
# embedding would then be zeroed like padding.
PAD_TOKEN = "<pad>"

bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train_from_iterator(
    df['reviewText'],
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=[PAD_TOKEN],
)
# Truncation length must stay in sync with MAX_LEN used for padding below
bpe_tokenizer.enable_truncation(max_length=200)

# Save and reload tokenizer for reproducibility
bpe_tokenizer.save_model(".", "books_bpe")
bpe_tokenizer = ByteLevelBPETokenizer("books_bpe-vocab.json", "books_bpe-merges.txt")
bpe_tokenizer.enable_truncation(max_length=200)

if bpe_tokenizer.token_to_id(PAD_TOKEN) != 0:
    raise RuntimeError("Expected the padding token to have id 0 in the trained vocabulary.")

# Tokenize all reviews
def encode_bpe(text):
    """Return the list of BPE token ids for ``text`` (truncated by the tokenizer)."""
    return bpe_tokenizer.encode(text).ids

df['bpe_ids'] = df['reviewText'].apply(encode_bpe)

# Fixed sequence length: shorter id lists are zero-padded, longer ones are cut
MAX_LEN = 200
def pad_seq(seq, max_len=MAX_LEN):
    """Return ``seq`` right-padded with 0 or truncated to exactly ``max_len`` items."""
    missing = max_len - len(seq)
    if missing > 0:
        return seq + [0] * missing
    return seq[:max_len]

# Bring every id sequence to exactly MAX_LEN entries
df['bpe_ids'] = df['bpe_ids'].apply(pad_seq, max_len=MAX_LEN)

In [None]:
# =========================
# 5. DATASET & DATALOADER
# =========================

class ReviewDataset(Dataset):
    """Map-style dataset pairing token-id sequences with 0-based rating labels.

    Reads the ``bpe_ids`` and ``overall`` columns of the given DataFrame.
    """

    def __init__(self, df):
        token_column = df['bpe_ids'].values
        rating_column = df['overall'].values
        # Stack the per-row id sequences into one array with a leading row axis
        self.seqs = np.stack(token_column)
        # Shift ratings down by one so that class ids start at 0
        self.labels = rating_column - 1

    def __len__(self):
        return self.seqs.shape[0]

    def __getitem__(self, idx):
        tokens = torch.tensor(self.seqs[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return tokens, target

# Wrap the full DataFrame; train/val/test subsets are taken from it by index later
dataset = ReviewDataset(df)

In [None]:
# =========================
# 6. MODEL: MULTI-CHANNEL CNN + RESIDUAL + BiLSTM + MULTIHEAD ATTENTION
# =========================

# Helper: Multi-head self-attention
class MultiHeadSelfAttention(nn.Module):
    """Self-attention block: multi-head attention, residual add, then LayerNorm.

    The attention layer is created with ``batch_first=True``; the output has the
    same shape as the input.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )
        self.ln = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Query, key and value are the same tensor; the attention weights are discarded
        attended = self.attn(query=x, key=x, value=x)[0]
        # Residual connection followed by layer normalization
        return self.ln(x + attended)

class _ReviewClassifier(nn.Module):
    """Multi-channel CNN + residual + BiLSTM + self-attention text classifier.

    Holding the layers in an ``nn.Module`` (rather than in a closure) makes their
    parameters visible to optimizers and lets ``.to()``, ``.train()``,
    ``.eval()`` and ``state_dict()`` reach them.
    """

    def __init__(self, vocab_size, embed_dim, cnn_out, lstm_hidden, num_classes, num_heads, dropout):
        super().__init__()
        # Embedding; id 0 is treated as padding
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Multi-channel CNN frontend (kernel widths 3, 4 and 5)
        self.conv3 = nn.Conv1d(embed_dim, cnn_out, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(embed_dim, cnn_out, kernel_size=4, padding=2)
        self.conv5 = nn.Conv1d(embed_dim, cnn_out, kernel_size=5, padding=2)
        self.bn3 = nn.BatchNorm1d(cnn_out)
        self.bn4 = nn.BatchNorm1d(cnn_out)
        self.bn5 = nn.BatchNorm1d(cnn_out)

        # Residual projection of the embedding to the concatenated CNN width
        self.proj = nn.Linear(embed_dim, cnn_out * 3)

        # BiLSTM (2 layers, dropout between the layers)
        self.bilstm = nn.LSTM(
            input_size=cnn_out * 3,
            hidden_size=lstm_hidden,
            num_layers=2,
            dropout=dropout,
            batch_first=True,
            bidirectional=True,
        )
        # LayerNorm applied to the BiLSTM output
        self.ln_lstm = nn.LayerNorm(lstm_hidden * 2)

        # Multi-head self-attention over the BiLSTM output
        self.attn = MultiHeadSelfAttention(lstm_hidden * 2, num_heads)

        # Output head
        self.fc = nn.Linear(lstm_hidden * 2, num_classes)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, seq) token ids
        seq_len = x.size(1)
        emb = self.embedding(x)  # (batch, seq, embed_dim)
        emb_t = emb.transpose(1, 2)  # (batch, embed_dim, seq)
        # Multi-channel CNN
        c3 = F.relu(self.bn3(self.conv3(emb_t)))
        # kernel_size=4 with padding=2 yields seq+1 steps; drop the extra one so
        # all three branches can be concatenated along the channel axis.
        c4 = F.relu(self.bn4(self.conv4(emb_t)[:, :, :seq_len]))
        c5 = F.relu(self.bn5(self.conv5(emb_t)))
        cnn_cat = torch.cat([c3, c4, c5], dim=1)  # (batch, cnn_out*3, seq)
        cnn_cat = cnn_cat.transpose(1, 2)  # (batch, seq, cnn_out*3)
        # Residual connection from the embedding
        x_cnn = cnn_cat + self.proj(emb)
        # BiLSTM
        lstm_out, _ = self.bilstm(x_cnn)
        lstm_out = self.ln_lstm(lstm_out)
        # Multi-head self-attention
        attn_out = self.attn(lstm_out)
        # Mean pooling over the sequence axis, then dropout and classification
        pooled = attn_out.mean(dim=1)
        return self.fc(self.drop(pooled))


def build_model(
    vocab_size=50000,
    embed_dim=128,
    cnn_out=150,
    lstm_hidden=128,
    num_classes=5,
    num_heads=6,
    dropout=0.3
):
    """Build the review classifier and return it as a callable model.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        cnn_out: Output channels of each of the three convolution branches.
        lstm_hidden: Hidden size per direction of the BiLSTM.
        num_classes: Number of output logits.
        num_heads: Requested number of attention heads. The attention width is
            ``2 * lstm_hidden`` and must be divisible by the head count; when
            it is not, the largest smaller divisor is used and a warning is
            issued (e.g. 6 requested heads at width 256 become 4).
        dropout: Dropout probability for the LSTM and the output head.

    Returns:
        An ``nn.Module`` mapping ``(batch, seq)`` token ids to
        ``(batch, num_classes)`` logits. It can be called like a function.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """
    import warnings

    if num_heads < 1:
        raise ValueError(f"num_heads must be >= 1, got {num_heads}")

    attn_dim = lstm_hidden * 2
    if attn_dim % num_heads != 0:
        # nn.MultiheadAttention rejects head counts that do not divide the width.
        usable_heads = max(h for h in range(1, num_heads) if attn_dim % h == 0)
        warnings.warn(
            f"num_heads={num_heads} does not divide the attention width {attn_dim}; "
            f"using {usable_heads} heads instead."
        )
        num_heads = usable_heads

    return _ReviewClassifier(
        vocab_size, embed_dim, cnn_out, lstm_hidden, num_classes, num_heads, dropout
    )

In [12]:
# =========================
# 7. TRAINING & EVALUATION UTILS
# =========================

class _ModelWrapper(nn.Module):
    """Expose a model callable as an ``nn.Module`` with registered parameters.

    If ``model_fn`` is itself an ``nn.Module`` it is registered as a submodule by
    the attribute assignment. If it is a plain function, every ``nn.Module``
    captured in its closure is registered instead, so that the optimizer,
    ``.to(device)`` and ``.train()``/``.eval()`` reach the underlying layers.
    """

    def __init__(self, model_fn):
        super().__init__()
        self.model_fn = model_fn
        captured = []
        if not isinstance(model_fn, nn.Module):
            for cell in getattr(model_fn, '__closure__', None) or ():
                try:
                    value = cell.cell_contents
                except ValueError:  # closure cell that has not been bound
                    continue
                if isinstance(value, nn.Module):
                    captured.append(value)
        self.captured = nn.ModuleList(captured)

    def forward(self, x):
        return self.model_fn(x)


def train_model(
    model_fn,
    train_loader,
    val_loader,
    num_epochs=20,
    patience=4,
    max_lr=2e-3,
    grad_clip=5.0,
    scheduler_type='1cycle'
):
    """Train a classifier with Adam, gradient clipping and early stopping.

    Args:
        model_fn: The model, either an ``nn.Module`` or a function closing over
            ``nn.Module`` layers; called as ``model_fn(X)`` and expected to
            return class logits.
        train_loader: Iterable of ``(X, y)`` training batches with ``len()`` support.
        val_loader: Iterable of ``(X, y)`` validation batches.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.
        max_lr: Adam learning rate and peak rate of the one-cycle schedule.
        grad_clip: Maximum gradient norm.
        scheduler_type: ``'1cycle'`` steps a ``OneCycleLR`` after every batch;
            any other value uses ``ReduceLROnPlateau`` on the validation loss.

    Returns:
        ``(model, train_losses, val_losses)`` where ``model`` is the wrapped
        model restored to the weights of its best validation epoch and the
        lists hold the per-epoch mean losses.

    Raises:
        ValueError: If no trainable parameters can be found for ``model_fn``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = _ModelWrapper(model_fn).to(device)
    if next(model.parameters(), None) is None:
        raise ValueError(
            "model_fn exposes no parameters; pass an nn.Module or a function "
            "that closes over its nn.Module layers."
        )
    optimizer = optim.Adam(model.parameters(), lr=max_lr)
    criterion = nn.CrossEntropyLoss()
    use_one_cycle = scheduler_type == '1cycle'
    if use_one_cycle:
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=max_lr, steps_per_epoch=len(train_loader), epochs=num_epochs
        )
    else:
        # The deprecated `verbose` flag is not used; rate drops are reported below.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=2
        )
    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0
    train_losses, val_losses = [], []
    for epoch in range(num_epochs):
        model.train()
        running_loss, seen = 0.0, 0
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(X)
            loss = criterion(out, y)
            loss.backward()
            # Gradient clipping (norm)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            if use_one_cycle:
                scheduler.step()
            running_loss += loss.item() * X.size(0)
            seen += X.size(0)
        # Average over the samples actually processed: with drop_last=True the
        # loader yields fewer samples than len(train_loader.dataset).
        train_loss = running_loss / max(seen, 1)
        train_losses.append(train_loss)
        # Validation
        model.eval()
        val_loss, val_seen = 0.0, 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                out = model(X)
                loss = criterion(out, y)
                val_loss += loss.item() * X.size(0)
                val_seen += X.size(0)
        val_loss /= max(val_seen, 1)
        val_losses.append(val_loss)
        if not use_one_cycle:
            lr_before = optimizer.param_groups[0]['lr']
            scheduler.step(val_loss)
            lr_after = optimizer.param_groups[0]['lr']
            if lr_after < lr_before:
                print(f"Reducing learning rate to {lr_after:.4e}.")
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() returns references to the live tensors, so take a
            # copy; otherwise later epochs would overwrite the "best" weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break
    # No snapshot exists if the validation loss never improved (e.g. it was NaN)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model, train_losses, val_losses

def evaluate_model(model, loader):
    """Run ``model`` over ``loader`` in eval mode and gather predictions.

    Returns a pair of NumPy arrays ``(predictions, labels)``, where each
    prediction is the argmax over dimension 1 of the model output.
    """
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    predicted, expected = [], []
    model.eval()
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(target_device)
            targets = targets.to(target_device)
            logits = model(inputs)
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())
    return np.array(predicted), np.array(expected)

def measure_inference_time(model, loader, n_batches=10):
    """Measure the mean forward-pass time per batch.

    Args:
        model: Callable model; switched to eval mode before timing.
        loader: Iterable of ``(X, y)`` batches; only ``X`` is used.
        n_batches: Maximum number of batches to time.

    Returns:
        The average time in seconds over the timed batches, or ``nan`` when
        the loader yields no batch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    on_cuda = device.type == 'cuda'
    model.eval()
    times = []
    with torch.no_grad():
        # zip with range stops before fetching a batch that would not be timed
        for _, (X, _labels) in zip(range(n_batches), loader):
            X = X.to(device)
            # CUDA kernels are launched asynchronously: wait for pending work
            # before and after the forward pass so the measurement covers the
            # computation itself rather than just the launch.
            if on_cuda:
                torch.cuda.synchronize()
            start = time.perf_counter()
            _ = model(X)
            if on_cuda:
                torch.cuda.synchronize()
            times.append(time.perf_counter() - start)
    # np.mean([]) would emit a RuntimeWarning; report nan explicitly instead
    avg_time = np.mean(times) if times else float('nan')
    print(f"Average inference time per batch: {avg_time:.4f} seconds")
    return avg_time

def print_memory_footprint():
    """Print this process's resident memory and, when CUDA is available, GPU memory."""
    bytes_per_mb = 1024**2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"CPU Memory usage: {rss_bytes / bytes_per_mb:.2f} MB")
    if not torch.cuda.is_available():
        return
    print("GPU Allocated:", torch.cuda.memory_allocated()//bytes_per_mb, "MB")
    print("GPU Cached:   ", torch.cuda.memory_reserved()//bytes_per_mb, "MB")

In [None]:
# =========================
# 8. DATA SPLIT & TRAINING
# =========================

# Split into train/val/test (stratified by rating): 10% test first, ...
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.1, stratify=df['overall'], random_state=SEED
)
# ... then the remainder is split 10% / 90%. With test_size=0.9 the FIRST
# returned part is the small one, which is why it is bound to val_idx and the
# large second part to train_idx.
val_idx, train_idx = train_test_split(
    train_idx, test_size=0.9, stratify=df.iloc[train_idx]['overall'], random_state=SEED
)
train_ds = Subset(dataset, train_idx)
val_ds = Subset(dataset, val_idx)
test_ds = Subset(dataset, test_idx)

BATCH_SIZE = 128
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# Build model
# num_heads must divide the attention width 2 * lstm_hidden = 256; the previous
# value 6 does not, which nn.MultiheadAttention rejects.
model_fn = build_model(
    vocab_size=50000,
    embed_dim=128,
    cnn_out=150,
    lstm_hidden=128,
    num_classes=5,
    num_heads=4,
    dropout=0.3
)

# Train
trained_model, train_losses, val_losses = train_model(
    model_fn, train_loader, val_loader,
    num_epochs=20, patience=4, max_lr=2e-3, grad_clip=5.0, scheduler_type='1cycle'
)

In [None]:
# =========================
# 9. EVALUATION & VISUALIZATION
# =========================

# Evaluate on test set
# Class ids are 0-based inside the model; star ratings are the ids plus one.
# Passing the full id list keeps the report and the 5x5 matrix well-formed even
# if a class happens to be absent from both the labels and the predictions,
# and lets both outputs show the same 1-5 star names.
CLASS_IDS = [0, 1, 2, 3, 4]
STAR_RATINGS = [1, 2, 3, 4, 5]

preds, labels = evaluate_model(trained_model, test_loader)
print(classification_report(
    labels, preds,
    labels=CLASS_IDS,
    target_names=[str(stars) for stars in STAR_RATINGS],
    zero_division=0,
))
cm = confusion_matrix(labels, preds, labels=CLASS_IDS)
disp = ConfusionMatrixDisplay(cm, display_labels=STAR_RATINGS)
disp.plot()
plt.title("Test Confusion Matrix")
plt.show()

# Plot learning curves
plt.figure()
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Val Loss')
plt.title("Learning Curves")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Inference time and memory
measure_inference_time(trained_model, test_loader)
print_memory_footprint()