In [None]:
# =========================
# 1. SETUP & DATA DOWNLOAD
# =========================

# Install required packages (if running in Colab, uncomment these lines)
# !pip install torch torchvision torchaudio
# !pip install pandas scikit-learn matplotlib tqdm nltk tokenizers
# !pip install torchtext

import os
import gzip
import json
import random
import urllib.request
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
import nltk
import gc
import time
import psutil

from tokenizers import ByteLevelBPETokenizer
import ssl

In [None]:

# Fetch the NLTK 'stopwords' and 'wordnet' resources.
# NOTE(review): nothing in the visible part of this notebook reads these corpora —
# confirm they are still needed before keeping the downloads.
nltk.download('stopwords')
nltk.download('wordnet')

# Set random seeds for reproducibility (Python's `random`, NumPy's legacy global RNG, PyTorch).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Last active statement of the cell: its return value (a torch Generator) is echoed as the cell output.
torch.manual_seed(SEED)
# Optional, currently disabled: explicit CUDA seeding and deterministic cuDNN kernels.
#torch.cuda.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<torch._C.Generator at 0x7b5a163b46f0>

In [None]:
# Create an unverified SSL context
# WARNING: this swaps the factory that the standard library uses to build default HTTPS
# contexts for one that performs no certificate or hostname verification. The change is
# process-wide, so every later HTTPS request made through urllib in this kernel is affected.
# NOTE(review): presumably a workaround for a certificate problem on the dataset host —
# confirm it is still required, and prefer a verified context if the download works without it.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# Download the "All Beauty" dataset if not already present
# (Disabled alternative to the "Books" download below. Kept as real comments rather than a
# bare triple-quoted string, which Jupyter would evaluate and echo as the cell's output.)
# DATA_URL = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz"
# DATA_FILE = "All_Beauty_5.json.gz"
# if not os.path.exists(DATA_FILE):
#     print("Downloading dataset...")
#     urllib.request.urlretrieve(DATA_URL, DATA_FILE)
#     print("Download complete.")

'\nDATA_URL = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz"\nDATA_FILE = "All_Beauty_5.json.gz"\nif not os.path.exists(DATA_FILE):\n    print("Downloading dataset...")\n    urllib.request.urlretrieve(DATA_URL, DATA_FILE)\n    print("Download complete.")\n'

In [None]:
# Download the "Books" dataset if not already present
DATA_URL = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"
DATA_FILE = "Books_5.json.gz"
if not os.path.exists(DATA_FILE):
    print("Downloading dataset...")
    # Download under a temporary name and rename only on success. Otherwise an
    # interrupted transfer would leave a truncated DATA_FILE behind, and the
    # existence check above would treat it as a complete download on the next run.
    partial_file = DATA_FILE + ".part"
    try:
        urllib.request.urlretrieve(DATA_URL, partial_file)
        os.replace(partial_file, DATA_FILE)
    finally:
        # Only present if the download or rename failed; never leave debris behind.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print("Download complete.")

In [None]:
# =========================
# 2. DATA LOADING & PREVIEW
# =========================

# Load a sample of the data to preview all fields
def load_sample_records(filename, n=2):
    """Read up to ``n`` JSON-lines records from a gzip file into a DataFrame.

    Args:
        filename: Path of a gzip-compressed, UTF-8 text file with one JSON object per line.
        n: Maximum number of leading lines to parse.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed record (fewer than ``n``
        rows when the file is shorter).
    """
    parsed = []
    with gzip.open(filename, 'rt', encoding='utf-8') as handle:
        for _ in range(n):
            raw_line = handle.readline()
            if not raw_line:  # an empty string signals end of file
                break
            parsed.append(json.loads(raw_line))
    return pd.DataFrame(parsed)

# Parse only the first two records of DATA_FILE so every available field can be inspected
# without loading the whole file. The frame is shown with print(), i.e. as plain wrapped text.
sample_df = load_sample_records(DATA_FILE, n=2)
print("Sample records with all fields:\n", sample_df)

Sample records with all fields:
    overall  verified   reviewTime      reviewerID        asin  \
0      5.0     False  03 30, 2005  A1REUF3A1YCPHM  0001713353   
1      5.0      True  06 20, 2016   AVP0HXC9FG790  0001713353   

                       style     reviewerName  \
0  {'Format:': ' Hardcover'}      TW Ervin II   
1                        NaN  Amazon Customer   

                                          reviewText  \
0  The King, the Mice and the Cheese by Nancy Gur...   
1                                 The kids loved it!   

                                     summary  unixReviewTime  
0  A story children will love and learn from      1112140800  
1                                 Five Stars      1466380800  


In [None]:
# =========================
# 3. LOAD & BALANCE DATASET
# =========================

# Load 200K records and keep only reviewText and overall
def _iter_valid_reviews(lines):
    """Yield ``{'reviewText', 'overall'}`` dicts for each usable JSON line in ``lines``.

    A line is skipped when it is blank, lacks ``reviewText``/``overall``, has a null
    review text, or carries a rating that is not one of the integers 1..5.
    """
    for line in lines:
        if not line.strip():
            continue
        rec = json.loads(line)
        text = rec.get('reviewText')
        if text is None or 'overall' not in rec:
            continue
        try:
            rating = int(float(rec['overall']))
        except (TypeError, ValueError, OverflowError):
            continue  # malformed rating: treat like an out-of-range one
        if rating in (1, 2, 3, 4, 5):
            yield {'reviewText': text, 'overall': rating}


def load_balanced_subset(filename, n_total=200000):
    """Load up to ``n_total`` usable reviews, keeping only ``reviewText`` and ``overall``.

    Despite the name, no class balancing is performed (the balancing code is
    commented out below); the natural rating distribution is kept and printed.

    Records are validated while reading, so ``n_total`` counts usable rows rather
    than raw lines, and the progress bar advances once per accepted record.

    Args:
        filename: Path of a gzip-compressed JSON-lines review file.
        n_total: Maximum number of valid reviews to return.

    Returns:
        A DataFrame with columns ``reviewText`` and ``overall`` (int, 1..5) and a
        fresh 0..n-1 index. It is empty (but has both columns) when nothing qualifies.
    """
    records = []
    if n_total > 0:
        with gzip.open(filename, 'rt', encoding='utf-8') as f:
            for rec in tqdm(_iter_valid_reviews(f), total=n_total):
                records.append(rec)
                if len(records) >= n_total:
                    break
    # Explicit columns keep the frame well-formed even when no record was accepted.
    df = pd.DataFrame(records, columns=['reviewText', 'overall'])
    # Safety net for missing values that are not JSON null (e.g. NaN), then renumber rows
    # so label-based and positional indexing agree downstream.
    df = df.dropna().reset_index(drop=True)
    # Note: For large datasets, natural distribution is often better for generalization.
    # If you want to balance, uncomment the next lines.
    # min_count = df['overall'].value_counts().min()
    # df = df.groupby('overall').sample(n=min_count, random_state=SEED)
    print("Class distribution (note: natural, not balanced):\n", df['overall'].value_counts())
    return df

# Load the working DataFrame (columns: reviewText, overall). Despite the loader's name, its
# balancing step is commented out, so `df` keeps the natural rating distribution it prints.
df = load_balanced_subset(DATA_FILE, n_total=200000)

200028it [00:04, 48502.03it/s]                            


Class distribution (note: natural, not balanced):
 overall
5    127726
4     39172
3     17622
2      8427
1      7053
Name: count, dtype: int64


In [None]:
# `nltk` is already imported in the first cell; the repeated import lets this cell run on its own.
# NOTE(review): no code in the visible part of the notebook calls an NLTK tokenizer (the
# reviews are tokenized with BPE) — confirm the 'punkt_tab' download is still required.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# =========================
# 4. BPE TOKENIZATION
# =========================

# Train a Byte-Pair Encoding (BPE) tokenizer on the review texts.
# "<pad>" is registered as the first special token so that it is assigned id 0. The
# notebook pads sequences with 0 and builds the embedding with padding_idx=0; without a
# reserved token, id 0 would belong to a real byte-level symbol, making that symbol
# indistinguishable from padding (and giving it an all-zero, untrainable embedding).
PAD_TOKEN = "<pad>"
bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train_from_iterator(
    df['reviewText'],
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=[PAD_TOKEN],
)
bpe_tokenizer.enable_truncation(max_length=200)

# Save and reload tokenizer for reproducibility
bpe_tokenizer.save_model(".", "books_bpe")
bpe_tokenizer = ByteLevelBPETokenizer("books_bpe-vocab.json", "books_bpe-merges.txt")
bpe_tokenizer.enable_truncation(max_length=200)

# Fail loudly if the reserved padding id ever drifts away from the 0 used for padding.
assert bpe_tokenizer.token_to_id(PAD_TOKEN) == 0, "expected '<pad>' to be token id 0"

# Tokenize all reviews
def encode_bpe(text):
    """Return the list of BPE token ids for ``text`` (truncated to 200 ids by the tokenizer)."""
    return bpe_tokenizer.encode(text).ids

df['bpe_ids'] = df['reviewText'].apply(encode_bpe)

# Fixed sequence length fed to the model: shorter inputs are right-padded, longer ones cut.
MAX_LEN = 200
def pad_seq(seq, max_len=MAX_LEN):
    """Return ``seq`` forced to exactly ``max_len`` items.

    Sequences shorter than ``max_len`` get zeros appended on the right; all other
    sequences are sliced to their first ``max_len`` items.
    """
    missing = max_len - len(seq)
    if missing <= 0:
        return seq[:max_len]
    return seq + [0] * missing

# Bring every encoded review to the fixed length (pad_seq's default max_len is MAX_LEN).
df['bpe_ids'] = df['bpe_ids'].apply(pad_seq)

In [None]:
# =========================
# 5. DATASET & DATALOADER
# =========================
class ReviewDataset(Dataset):
    """Map-style dataset pairing token-id sequences with 0-based rating labels.

    Built from a DataFrame that has a ``bpe_ids`` column (equal-length id lists)
    and an ``overall`` column; labels are stored as ``overall - 1``.
    """

    def __init__(self, df):
        token_rows = df['bpe_ids'].values
        ratings = df['overall'].values
        # One 2-D array holding all sequences; np.stack requires equal lengths.
        self.seqs = np.stack(token_rows)
        self.labels = ratings - 1  # 0-based classes

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        tokens = torch.tensor(self.seqs[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return tokens, target

# Create dataset splits
dataset = ReviewDataset(df)

# Split into train/val/test. The indices are positions into `dataset`; both splits are
# stratified on the rating so each subset keeps the overall class proportions.
# 10% is held out for testing, then 10% of the remainder for validation.
train_idx, test_idx = train_test_split(
    np.arange(len(dataset)),
    test_size=0.1,
    stratify=df['overall'],
    random_state=SEED
)
train_idx, val_idx = train_test_split(
    train_idx,
    test_size=0.1,
    stratify=df.iloc[train_idx]['overall'],
    random_state=SEED
)

# Sanity check: the three index sets must partition the dataset without overlap.
assert len(np.unique(np.concatenate([train_idx, val_idx, test_idx]))) == len(dataset)

# Create subset datasets
train_ds = Subset(dataset, train_idx)
val_ds = Subset(dataset, val_idx)
test_ds = Subset(dataset, test_idx)

# Print split sizes
print(f"Training set size: {len(train_ds)}")
print(f"Validation set size: {len(val_ds)}")
print(f"Test set size: {len(test_ds)}")

Training set size: 162000
Validation set size: 18000
Test set size: 20000


"class ReviewDataset(Dataset):\n    def __init__(self, df):\n        self.seqs = np.stack(df['bpe_ids'].values)\n        self.labels = df['overall'].values - 1  # 0-based classes\n\n    def __len__(self):\n        return len(self.seqs)\n\n    def __getitem__(self, idx):\n        return torch.tensor(self.seqs[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)\n\ndataset = ReviewDataset(df)\n"

In [None]:
# =========================
# 6. MODEL ARCHITECTURE
# =========================

class EnhancedSentimentModel(nn.Module):
    """CNN + BiLSTM + self-attention classifier over token-id sequences.

    Data flow (see ``forward``): token embedding -> three parallel 1-D convolutions
    (kernel sizes 3, 4 and 5), each followed by batch norm and ReLU -> concatenation,
    added to a linear projection of the embeddings -> two-layer bidirectional LSTM ->
    layer norm -> multi-head self-attention with a residual connection and layer
    norm -> mean over the sequence axis -> dropout -> linear layer emitting
    ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        cnn_out: Output channels of each convolution branch.
        lstm_hidden: Hidden size per LSTM direction; ``lstm_hidden * 2`` must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        num_classes: Number of output logits.
        dropout: Dropout probability shared by the LSTM, the attention and the head.
    """

    def __init__(
        self,
        vocab_size=50000,
        embed_dim=128,
        cnn_out=150,
        lstm_hidden=126,  # Makes lstm_hidden*2 (252) divisible by num_heads (6)
        num_heads=6,
        num_classes=5,
        dropout=0.3
    ):
        super().__init__()
        self.lstm_hidden = lstm_hidden

        # Width of the BiLSTM output (forward + backward states) and of everything after it.
        attn_dim = lstm_hidden * 2
        assert attn_dim % num_heads == 0, "lstm_hidden * 2 must be divisible by num_heads"
        # Width of the three concatenated convolution branches.
        cnn_features = cnn_out * 3

        # Token embedding; the row for index 0 is the padding vector.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Convolution branches. The kernel-4 branch emits one extra step (trimmed in forward).
        self.conv3 = nn.Conv1d(embed_dim, cnn_out, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(embed_dim, cnn_out, kernel_size=4, padding=2)
        self.conv5 = nn.Conv1d(embed_dim, cnn_out, kernel_size=5, padding=2)
        self.bn3 = nn.BatchNorm1d(cnn_out)
        self.bn4 = nn.BatchNorm1d(cnn_out)
        self.bn5 = nn.BatchNorm1d(cnn_out)

        # Lifts embeddings to the CNN feature width so they can be added as a residual.
        self.proj = nn.Linear(embed_dim, cnn_features)

        # Recurrent encoder.
        self.bilstm = nn.LSTM(
            input_size=cnn_features,
            hidden_size=lstm_hidden,
            num_layers=2,
            dropout=dropout,
            batch_first=True,
            bidirectional=True
        )
        self.ln_lstm = nn.LayerNorm(attn_dim)

        # Self-attention over the LSTM states.
        self.attention = nn.MultiheadAttention(
            embed_dim=attn_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )
        self.ln_attn = nn.LayerNorm(attn_dim)

        # Classification head.
        self.fc = nn.Linear(attn_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences ``x``."""
        embedded = self.embedding(x)                 # (batch, seq, embed_dim)
        channels_first = embedded.transpose(1, 2)    # (batch, embed_dim, seq)

        branch3 = F.relu(self.bn3(self.conv3(channels_first)))
        branch4 = F.relu(self.bn4(self.conv4(channels_first)))
        branch5 = F.relu(self.bn5(self.conv5(channels_first)))
        # Drop the extra trailing step of the kernel-4 branch so all branches align.
        branch4 = branch4[:, :, :-1]

        # (batch, cnn_out*3, seq) -> (batch, seq, cnn_out*3)
        conv_features = torch.cat([branch3, branch4, branch5], dim=1).transpose(1, 2)

        # Residual connection from the projected embeddings.
        lstm_in = conv_features + self.proj(embedded)

        recurrent, _ = self.bilstm(lstm_in)
        recurrent = self.ln_lstm(recurrent)

        attended, _ = self.attention(recurrent, recurrent, recurrent)
        fused = self.ln_attn(recurrent + attended)   # residual around attention

        # Average over the sequence axis, then classify.
        return self.fc(self.dropout(fused.mean(dim=1)))

In [None]:
# =========================
# 7. TRAINING & EVALUATION UTILS
# =========================

def train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=20,
    patience=4,
    max_lr=2e-3,
    grad_clip=5.0,
    device='cuda'
):
    """Train ``model`` with Adam + OneCycleLR and early stopping on validation loss.

    Args:
        model: Module mapping a batch ``X`` to class logits.
        train_loader: Iterable of ``(X, y)`` training batches (must support ``len``).
        val_loader: Iterable of ``(X, y)`` validation batches.
        num_epochs: Maximum number of epochs; also defines the OneCycleLR schedule length.
        patience: Stop after this many consecutive epochs without a new best validation loss.
        max_lr: Peak learning rate of the one-cycle schedule.
        grad_clip: Maximum gradient norm applied before each optimizer step.
        device: Device on which the model and batches are placed.

    Returns:
        ``(model, train_losses, val_losses)`` where the model carries the weights of
        the epoch with the lowest validation loss and the lists hold per-epoch
        sample-weighted mean losses.
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=max_lr)
    criterion = nn.CrossEntropyLoss()

    # 1cycle scheduler (stepped once per batch)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=max_lr,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader)
    )

    def _snapshot_state():
        # state_dict() returns references to the live parameter tensors, which the
        # optimizer keeps updating in place. Clone them so the checkpoint is frozen.
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    best_val_loss = float('inf')
    # Defined up front so the final restore works even if validation never improves
    # (for example when the loss is NaN from the first epoch).
    best_model_state = _snapshot_state()
    patience_counter = 0
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        train_seen = 0

        for X, y in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            X, y = X.to(device), y.to(device)

            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            optimizer.step()
            scheduler.step()

            running_loss += loss.item() * X.size(0)
            train_seen += X.size(0)

        # Normalise by the samples actually processed: with drop_last=True the
        # loader yields fewer samples than len(train_loader.dataset).
        train_loss = running_loss / max(train_seen, 1)
        train_losses.append(train_loss)

        # Validation
        model.eval()
        val_loss = 0.0
        val_seen = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                outputs = model(X)
                loss = criterion(outputs, y)
                val_loss += loss.item() * X.size(0)
                val_seen += X.size(0)

        val_loss /= max(val_seen, 1)
        val_losses.append(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_model_state = _snapshot_state()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print('Early stopping triggered')
                break

    # Restore the best checkpoint before handing the model back.
    model.load_state_dict(best_model_state)
    return model, train_losses, val_losses

def evaluate_model(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode and collect predictions.

    Args:
        model: Module returning per-class scores with classes on dimension 1.
        loader: Iterable of ``(X, y)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(predictions, labels)`` as two NumPy arrays; predictions are the argmax
        over dimension 1 of the model output.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch_x, batch_y in loader:
            scores = model(batch_x.to(device))
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(batch_y.cpu().numpy())
    return np.array(predicted), np.array(expected)

def measure_inference_time(model, loader, device, n_batches=10):
    """Measure the mean forward-pass time per batch over the first ``n_batches`` batches.

    CUDA kernels are launched asynchronously, so on a CUDA device the stream is
    synchronised before starting and before stopping the clock; otherwise only the
    kernel launch overhead would be timed. Host-to-device transfer is excluded.

    Args:
        model: Module to time (switched to eval mode).
        loader: Iterable of ``(X, y)`` batches; labels are ignored.
        device: Device the inputs are moved to.
        n_batches: Maximum number of batches to time.

    Returns:
        The average time per batch in seconds, or NaN when no batch was timed.
    """
    model.eval()
    # Accepts both strings and torch.device objects.
    on_cuda = torch.device(device).type == 'cuda' and torch.cuda.is_available()
    times = []
    with torch.no_grad():
        for i, (X, _) in enumerate(loader):
            if i >= n_batches:
                break
            X = X.to(device)
            if on_cuda:
                torch.cuda.synchronize(device)  # finish the pending copy first
            start = time.perf_counter()         # monotonic, high-resolution clock
            _ = model(X)
            if on_cuda:
                torch.cuda.synchronize(device)  # wait for the forward kernels
            times.append(time.perf_counter() - start)
    # Avoid numpy's "mean of empty slice" warning for an empty loader / n_batches <= 0.
    avg_time = np.mean(times) if times else float('nan')
    print(f"Average inference time per batch: {avg_time:.4f} seconds")
    return avg_time

def print_memory_footprint():
    """Print this process's resident memory and, if CUDA is available, GPU memory figures.

    GPU numbers are those reported by ``torch.cuda.memory_allocated`` and
    ``torch.cuda.memory_reserved``, floor-divided to whole mebibytes.
    """
    bytes_per_mib = 1024**2
    resident_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"CPU Memory usage: {resident_bytes / bytes_per_mib:.2f} MB")
    if not torch.cuda.is_available():
        return
    print("GPU Allocated:", torch.cuda.memory_allocated()//bytes_per_mib, "MB")
    print("GPU Cached:   ", torch.cuda.memory_reserved()//bytes_per_mib, "MB")

In [None]:
# =========================
# 8. TRAINING EXECUTION
# =========================

# Create data loaders
# Only the training loader shuffles; drop_last=True discards its final incomplete batch,
# so every training step sees exactly BATCH_SIZE samples.
BATCH_SIZE = 128
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model
# vocab_size matches the vocab_size the BPE tokenizer was trained with; num_classes=5
# corresponds to the five star ratings (stored as 0-based labels by ReviewDataset).
model = EnhancedSentimentModel(
    vocab_size=50000,
    embed_dim=128,
    cnn_out=150,
    lstm_hidden=126,  # Makes lstm_hidden*2 (252) divisible by num_heads (6)
    num_heads=6,
    num_classes=5,
    dropout=0.3
)

# Print model summary (module tree plus the total number of parameter elements)
print(model)
print("\nModel parameters:", sum(p.numel() for p in model.parameters()))

# Set device: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Train model
# train_model moves the model to `device` itself and returns it together with the
# per-epoch training and validation loss histories.
trained_model, train_losses, val_losses = train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=20,
    patience=4,
    max_lr=2e-3,
    grad_clip=5.0,
    device=device
)

EnhancedSentimentModel(
  (embedding): Embedding(50000, 128, padding_idx=0)
  (conv3): Conv1d(128, 150, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv4): Conv1d(128, 150, kernel_size=(4,), stride=(1,), padding=(2,))
  (conv5): Conv1d(128, 150, kernel_size=(5,), stride=(1,), padding=(2,))
  (bn3): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn4): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn5): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (proj): Linear(in_features=128, out_features=450, bias=True)
  (bilstm): LSTM(450, 126, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (ln_lstm): LayerNorm((252,), eps=1e-05, elementwise_affine=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=252, out_features=252, bias=True)
  )
  (ln_attn): LayerNorm((252,), eps=1e-05, elementwise_affine=True)
  (fc): Linea

Epoch 1/20: 100%|██████████| 1265/1265 [2:56:59<00:00,  8.39s/it]


Epoch 1/20:
Training Loss: 0.9386
Validation Loss: 0.8724


Epoch 2/20:  43%|████▎     | 549/1265 [1:16:56<1:37:52,  8.20s/it]

In [None]:
# =========================
# 9. EVALUATION & VISUALIZATION
# =========================

# Evaluate on test set
preds, labels = evaluate_model(trained_model, test_loader, device)

# The model works with 0-based class ids; report them under their star ratings so the
# text report and the confusion matrix use the same 1-5 labels.
class_ids = [0, 1, 2, 3, 4]
star_names = [str(class_id + 1) for class_id in class_ids]

print("\nTest Set Classification Report:")
print(classification_report(labels, preds, labels=class_ids, target_names=star_names, zero_division=0))

# Plot confusion matrix
# ConfusionMatrixDisplay.plot() opens its own figure unless it is given an Axes, so a
# preceding plt.figure(...) would only produce an empty extra figure and its figsize
# would be ignored. Passing labels= keeps the matrix 5x5 even if a class never occurs.
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(labels, preds, labels=class_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=[1,2,3,4,5])
disp.plot(ax=ax)
ax.set_title("Test Set Confusion Matrix")
plt.show()

# Plot learning curves (one point per completed epoch)
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title("Learning Curves")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Measure inference time and memory usage
print("\nPerformance Metrics:")
measure_inference_time(trained_model, test_loader, device)
print_memory_footprint()

# Version 2 - Use all GPUs

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
import time
import subprocess

def print_gpu_utilization():
    """Print utilization and memory usage for every GPU reported by ``nvidia-smi``.

    This is best-effort diagnostics: if ``nvidia-smi`` cannot be run, exits with an
    error, or prints lines in an unexpected format, a short notice is printed (or
    the line is skipped) instead of raising, so a training loop calling this
    function is never aborted by a monitoring failure.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=utilization.gpu,memory.used,memory.total',
        '--format=csv,nounits,noheader',
    ]
    try:
        result = subprocess.run(query, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as exc:  # e.g. nvidia-smi is not installed or not executable
        print(f"GPU utilization unavailable: {exc}")
        return
    if result.returncode != 0:
        print(f"GPU utilization unavailable: nvidia-smi exited with code {result.returncode}")
        return

    for idx, line in enumerate(result.stdout.strip().splitlines()):
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != 3:
            continue  # blank or unexpected line; nothing sensible to print
        util, mem_used, mem_total = fields
        print(f"GPU {idx}: Utilization: {util}%, Memory: {mem_used} MiB / {mem_total} MiB")

def ddp_train(rank, world_size, model_class, dataset, batch_size, num_epochs, lr, grad_clip, results_dict):
    """Worker entry point: train one DistributedDataParallel replica on GPU ``rank``.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of worker processes.
        model_class: Zero-argument callable returning a fresh model.
        dataset: Training dataset; sharded across ranks with ``DistributedSampler``.
        batch_size: Per-process batch size.
        num_epochs: Number of training epochs.
        lr: Adam learning rate.
        grad_clip: Maximum gradient norm applied before each optimizer step.
        results_dict: Shared mapping; rank 0 stores ``avg_epoch_time``,
            ``final_acc`` and ``final_loss`` in it.

    The reported loss and accuracy are aggregated over all ranks and normalised by
    the number of samples actually processed, so they are comparable across
    different ``world_size`` values.
    """
    # DDP setup
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    try:
        torch.cuda.set_device(rank)
        device = torch.device(f'cuda:{rank}')

        # Distributed sampler: each rank iterates over its own shard of the dataset
        train_sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, drop_last=True)

        # Model, optimizer, loss
        model = model_class().to(device)
        model = DDP(model, device_ids=[rank])
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Training loop
        epoch_times = []
        avg_loss = acc = float('nan')  # stay defined even when num_epochs == 0
        for epoch in range(num_epochs):
            model.train()
            train_sampler.set_epoch(epoch)  # reshuffle differently every epoch
            start_time = time.time()
            running_loss = 0.0
            correct, total = 0, 0

            for X, y in train_loader:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                outputs = model(X)
                loss = criterion(outputs, y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                optimizer.step()
                running_loss += loss.item() * X.size(0)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == y).sum().item()
                total += y.size(0)

            epoch_time = time.time() - start_time
            epoch_times.append(epoch_time)

            # Sum the per-rank statistics across all processes. Every rank must take
            # part in the collective, so this stays outside the rank-0 branch.
            stats = torch.tensor([running_loss, correct, total], dtype=torch.float64, device=device)
            dist.all_reduce(stats, op=dist.ReduceOp.SUM)
            loss_sum, correct_sum, sample_count = stats.tolist()
            # Divide by the samples really seen (a shard of the dataset, minus dropped
            # incomplete batches), not by the size of the full dataset.
            sample_count = max(sample_count, 1.0)
            avg_loss = loss_sum / sample_count
            acc = correct_sum / sample_count

            if rank == 0:
                print(f"[GPU {rank}] Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | Acc: {acc:.4f} | Time: {epoch_time:.2f}s")
                print_gpu_utilization()

        # Publish the aggregated metrics once, from rank 0
        if rank == 0 and epoch_times:
            results_dict['avg_epoch_time'] = sum(epoch_times) / len(epoch_times)
            results_dict['final_acc'] = acc
            results_dict['final_loss'] = avg_loss
    finally:
        # Always tear the process group down, including when training raised.
        dist.destroy_process_group()

def run_ddp_experiment(model_class, dataset, batch_size=128, num_epochs=3, lr=2e-3, grad_clip=5.0, num_gpus=1):
    """Spawn ``num_gpus`` DDP workers running ``ddp_train`` and print their metrics.

    Args:
        model_class: Zero-argument callable returning a fresh model. It is sent to
            the worker processes, so it must be picklable (a module-level function
            or class, not a lambda).
        dataset: Training dataset handed to every worker.
        batch_size: Per-process batch size.
        num_epochs: Number of epochs each worker trains for.
        lr: Adam learning rate.
        grad_clip: Maximum gradient norm.
        num_gpus: Number of worker processes, one per CUDA device.

    Raises:
        ValueError: If ``num_gpus`` is below 1 or exceeds the number of visible
            CUDA devices.
    """
    # Validate up front: otherwise the failure surfaces later, inside a worker process.
    available = torch.cuda.device_count()
    if num_gpus < 1 or num_gpus > available:
        raise ValueError(
            f"num_gpus must be between 1 and the number of visible CUDA devices "
            f"({available}), got {num_gpus}"
        )

    # The context manager shuts the manager's server process down once results are copied.
    with mp.Manager() as manager:
        results_dict = manager.dict()
        mp.spawn(
            ddp_train,
            args=(num_gpus, model_class, dataset, batch_size, num_epochs, lr, grad_clip, results_dict),
            nprocs=num_gpus,
            join=True
        )
        results = dict(results_dict)

    print(f"\n=== Results for {num_gpus} GPU(s) ===")
    for k, v in results.items():
        print(f"{k}: {v}")

**Flexible GPU usage:** set `num_gpus` to 1, 2, or 4 to test how training scales.

**Tracked metrics:**
- Per-epoch wall-clock time
- Final accuracy and loss
- GPU utilization and memory (printed after every epoch, one line per GPU)

**Metric reporting:** results are printed at the end of each run and can be logged for later analysis.

In [None]:
## Execute

def build_scaling_model():
    """Build the model used for the GPU scaling runs.

    A module-level function is used instead of a lambda because the factory is passed
    to worker processes started by mp.spawn, and lambdas cannot be pickled.
    NOTE(review): the workers must also be able to import this function, ddp_train and
    the model class; if they only exist in the notebook's __main__, move them into a
    .py module — verify in your environment.
    """
    return EnhancedSentimentModel(
        vocab_size=50000,
        embed_dim=128,
        cnn_out=150,
        lstm_hidden=192,   # Example: larger LSTM
        num_heads=12,
        num_classes=5,
        dropout=0.3
    )

# Try with 1, 2, and 4 GPUs, skipping configurations this machine cannot provide
available_gpus = torch.cuda.device_count()
for NUM_GPUS in [1, 2, 4]:
    if NUM_GPUS > available_gpus:
        print(f"Skipping {NUM_GPUS}-GPU run: only {available_gpus} CUDA device(s) available.")
        continue
    run_ddp_experiment(
        model_class=build_scaling_model,
        dataset=train_ds,
        batch_size=128,
        num_epochs=3,   # For sizing, 3 epochs is enough
        lr=2e-3,
        grad_clip=5.0,
        num_gpus=NUM_GPUS
    )

##