In [1]:
# =========================
# Cell 1: Install Packages
# =========================
!pip install torch torchtext pandas numpy matplotlib seaborn tokenizers tqdm --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m853.0 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# =========================
# Cell 2: Imports & Setup
# =========================

import os
import re
import json
import gzip
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from tokenizers import ByteLevelBPETokenizer

import subprocess

# Seed every random number generator used in this notebook with one shared value.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
# Seed all CUDA devices as well, but only when a GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# GPU monitoring utility
def get_gpu_stats():
    """Query ``nvidia-smi`` for per-GPU utilization and memory usage.

    Returns:
        list[dict]: one dict per GPU line that could be parsed, with keys
        ``'gpu'`` (position of the line in the nvidia-smi output),
        ``'utilization'``, ``'mem_used'`` and ``'mem_total'`` (all floats).
        An empty list is returned when nvidia-smi is missing, times out,
        or exits with a non-zero status. This function is best-effort and
        never raises for those conditions.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=utilization.gpu,memory.used,memory.total',
        '--format=csv,nounits,noheader',
    ]
    try:
        # The timeout keeps a wedged driver from stalling the training loop.
        result = subprocess.run(query, stdout=subprocess.PIPE, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError):
        # nvidia-smi not installed / not executable / timed out.
        return []
    if result.returncode != 0:
        return []

    stats = []
    for idx, line in enumerate(result.stdout.strip().splitlines()):
        try:
            util, mem_used, mem_total = (float(field) for field in line.split(','))
        except ValueError:
            # Blank line or a non-numeric field such as "[N/A]": skip only
            # this GPU instead of discarding the stats of every GPU.
            continue
        stats.append({
            'gpu': idx,
            'utilization': util,
            'mem_used': mem_used,
            'mem_total': mem_total
        })
    return stats

# Turn off the `tokenizers` library's own parallelism for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# =========================
# Cell 3: Data Loading & Preprocessing
# =========================

# Download Amazon Books dataset (200K for grid search)
DATA_URL = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz"
DATA_FILE = "Books_5.json.gz"
if not os.path.exists(DATA_FILE):
    import urllib.request
    print("Downloading dataset...")
    # Download to a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated DATA_FILE that later runs would
    # mistake for a complete download.
    _partial_file = DATA_FILE + ".part"
    try:
        urllib.request.urlretrieve(DATA_URL, _partial_file)
        os.replace(_partial_file, DATA_FILE)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(_partial_file):
            os.remove(_partial_file)
    print("Download complete.")

def preprocess_text(text):
    """Normalize a review string for tokenization.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, collapses whitespace runs to a single space
    and trims both ends.

    Args:
        text (str): raw review text.

    Returns:
        str: the cleaned text.
    """
    without_punct = re.sub(r'[^\w\s]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', without_punct)
    return single_spaced.strip()

def load_amazon_subset(filename, n_total=200_000):
    """Read up to ``n_total`` usable reviews from a gzipped JSON-lines file.

    A line is kept when it has a string ``reviewText`` and an ``overall``
    field; the text is cleaned with ``preprocess_text`` and the rating is
    truncated to an int.

    Args:
        filename: path to the ``.json.gz`` file (one JSON object per line).
        n_total: maximum number of records to collect; values <= 0 yield an
            empty frame.

    Returns:
        pandas.DataFrame with columns ``reviewText`` (str) and ``overall``
        (int in 1..5), indexed 0..n-1. The frame is empty (but still has both
        columns) when nothing usable was found.
    """
    records = []
    if n_total > 0:
        # The bar is driven manually so it counts kept records (what n_total
        # limits) rather than raw lines read.
        with gzip.open(filename, 'rt', encoding='utf-8') as f, tqdm(total=n_total) as progress:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                rec = json.loads(line)
                review = rec.get('reviewText')
                # A null reviewText would crash preprocess_text, so require a str.
                if isinstance(review, str) and 'overall' in rec:
                    records.append({
                        'reviewText': preprocess_text(review),
                        'overall': int(float(rec['overall'])),
                    })
                    progress.update(1)
                    if len(records) >= n_total:
                        break
    # Explicit columns keep the filter below working when no record was read.
    df = pd.DataFrame(records, columns=['reviewText', 'overall'])
    # Reset the index so labels stay contiguous if the filter drops rows.
    df = df[df['overall'].isin([1,2,3,4,5])].dropna().reset_index(drop=True)
    print("Class distribution:\n", df['overall'].value_counts())
    return df

df = load_amazon_subset(DATA_FILE, n_total=200_000)

# BPE Tokenization
# Reserve "<pad>" as the first special token so it is assigned id 0. The
# notebook pads sequences with 0 and the model embeds with padding_idx=0;
# without a reserved token, id 0 would belong to an ordinary vocabulary entry.
bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train_from_iterator(
    df['reviewText'],
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<pad>"],
)
bpe_tokenizer.enable_truncation(max_length=200)
bpe_tokenizer.save_model(".", "books_bpe")
# Reload from the files just written so the in-memory tokenizer matches what is persisted.
bpe_tokenizer = ByteLevelBPETokenizer("books_bpe-vocab.json", "books_bpe-merges.txt")
# Truncation is not part of the saved vocab/merges files, so enable it again.
bpe_tokenizer.enable_truncation(max_length=200)

def encode_bpe(text):
    """Encode ``text`` with the module-level ``bpe_tokenizer`` and return its token ids."""
    encoding = bpe_tokenizer.encode(text)
    return encoding.ids

# Fixed length of every token-id sequence passed to the model.
MAX_LEN = 200
def pad_seq(seq, max_len=MAX_LEN):
    """Return ``seq`` as exactly ``max_len`` items.

    Shorter sequences are right-padded with 0; sequences of length
    ``max_len`` or more are cut to their first ``max_len`` items.
    """
    shortfall = max_len - len(seq)
    if shortfall <= 0:
        return seq[:max_len]
    return seq + [0] * shortfall

# Encode each review to BPE ids and pad/truncate it to MAX_LEN entries.
df['bpe_ids'] = df['reviewText'].map(lambda review: pad_seq(encode_bpe(review), MAX_LEN))

class ReviewDataset(Dataset):
    """Torch dataset over a frame with ``bpe_ids`` and ``overall`` columns.

    ``seqs`` holds the stacked ``bpe_ids`` rows and ``labels`` holds
    ``overall - 1`` (0-based class ids); both are numpy arrays.
    """

    def __init__(self, df):
        id_rows = df['bpe_ids'].values
        ratings = df['overall'].values
        self.seqs = np.stack(id_rows)
        self.labels = ratings - 1  # 0-based classes

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        # Both the token ids and the label are returned as int64 tensors.
        token_ids = torch.tensor(self.seqs[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, target

dataset = ReviewDataset(df)

# Train/val/test split
# First hold out 10% of all rows for testing, then 10% of the remainder for
# validation; both splits are stratified on the rating.
train_idx, test_idx = train_test_split(
    np.arange(len(dataset)),
    stratify=df['overall'],
    test_size=0.1,
    random_state=SEED,
)
train_idx, val_idx = train_test_split(
    train_idx,
    stratify=df['overall'].iloc[train_idx],
    test_size=0.1,
    random_state=SEED,
)
train_ds, val_ds, test_ds = (
    Subset(dataset, split_idx) for split_idx in (train_idx, val_idx, test_idx)
)

In [None]:
# =========================
# Cell 4: Model Architecture
# =========================

class CNN_BiLSTM_Attn(nn.Module):
    """Text classifier: embedding -> parallel Conv1d (k=3,4,5) -> BiLSTM -> self-attention -> linear.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is the padding index).
        embed_dim: embedding width.
        cnn_out: output channels of each of the three convolutions.
        lstm_hidden: hidden size per LSTM direction (the BiLSTM emits ``2 * lstm_hidden``).
        num_heads: attention heads; must divide ``2 * lstm_hidden``. The default
            is 8 because the previous default of 6 does not divide
            ``2 * 128`` and therefore always failed the check below.
        num_classes: number of output logits.
        dropout: dropout probability used between LSTM layers, inside the
            attention module and before the final linear layer.

    Raises:
        AssertionError: if ``2 * lstm_hidden`` is not divisible by ``num_heads``.
    """

    def __init__(self, vocab_size=50000, embed_dim=128, cnn_out=150, lstm_hidden=128, num_heads=8, num_classes=5, dropout=0.3):
        super().__init__()
        assert (lstm_hidden * 2) % num_heads == 0, "lstm_hidden*2 must be divisible by num_heads"
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv3 = nn.Conv1d(embed_dim, cnn_out, kernel_size=3, padding=1)
        # kernel_size=4 with padding=2 yields seq+1 outputs; forward() trims the extra step.
        self.conv4 = nn.Conv1d(embed_dim, cnn_out, kernel_size=4, padding=2)
        self.conv5 = nn.Conv1d(embed_dim, cnn_out, kernel_size=5, padding=2)
        self.bn3 = nn.BatchNorm1d(cnn_out)
        self.bn4 = nn.BatchNorm1d(cnn_out)
        self.bn5 = nn.BatchNorm1d(cnn_out)
        # Projects the embeddings to the CNN width for the residual connection.
        self.proj = nn.Linear(embed_dim, cnn_out * 3)
        self.bilstm = nn.LSTM(
            input_size=cnn_out * 3,
            hidden_size=lstm_hidden,
            num_layers=2,
            dropout=dropout,
            batch_first=True,
            bidirectional=True
        )
        self.ln_lstm = nn.LayerNorm(lstm_hidden * 2)
        self.attn = nn.MultiheadAttention(
            embed_dim=lstm_hidden * 2,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )
        self.ln_attn = nn.LayerNorm(lstm_hidden * 2)
        self.fc = nn.Linear(lstm_hidden * 2, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, num_classes)."""
        emb = self.embedding(x)  # (batch, seq, embed_dim)
        emb_t = emb.transpose(1, 2)  # (batch, embed_dim, seq)
        seq_len = emb_t.size(2)
        c3 = F.relu(self.bn3(self.conv3(emb_t)))
        # conv4 produces seq+1 steps (seq + 2*2 - 4 + 1). Drop the last one so
        # all three feature maps share the same length; otherwise torch.cat
        # below raises a size-mismatch error.
        c4 = F.relu(self.bn4(self.conv4(emb_t)))[:, :, :seq_len]
        c5 = F.relu(self.bn5(self.conv5(emb_t)))
        cnn_cat = torch.cat([c3, c4, c5], dim=1)  # (batch, cnn_out*3, seq)
        cnn_cat = cnn_cat.transpose(1, 2)  # (batch, seq, cnn_out*3)
        emb_proj = self.proj(emb)  # (batch, seq, cnn_out*3)
        x_cnn = cnn_cat + emb_proj  # residual connection around the CNN stack
        lstm_out, _ = self.bilstm(x_cnn)
        lstm_out = self.ln_lstm(lstm_out)
        attn_out, _ = self.attn(lstm_out, lstm_out, lstm_out)
        attn_out = self.ln_attn(lstm_out + attn_out)  # residual + norm
        # Mean over all time steps; padded positions are included in the average.
        pooled = attn_out.mean(dim=1)
        out = self.dropout(pooled)
        out = self.fc(out)
        return out

In [None]:
# =========================
# Cell 5: Training & DDP Setup
# =========================

def train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=3,
    lr=2e-3,
    grad_clip=5.0,
    device='cuda',
    gpu_stats_log=None
):
    """Train ``model`` with Adam + cross-entropy and restore its best-validation weights.

    Args:
        model: module mapping a batch ``X`` to class logits; expected to
            already live on ``device`` (this function does not move it).
        train_loader: iterable of ``(X, y)`` training batches.
        val_loader: iterable of ``(X, y)`` validation batches.
        num_epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
        grad_clip: max gradient norm passed to ``clip_grad_norm_``.
        device: device the batches are moved to.
        gpu_stats_log: optional list; one ``{'epoch', 'stats'}`` dict per epoch
            is appended to it.

    Returns:
        tuple ``(model, train_metrics, val_metrics)`` where each metrics list
        holds one ``(loss, accuracy)`` pair per epoch and ``model`` carries the
        weights of the epoch with the highest validation accuracy (first such
        epoch on ties).
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # -inf so the first epoch is always recorded, even with 0.0 accuracy.
    best_val_acc = float('-inf')
    best_model_state = None
    train_metrics = []
    val_metrics = []
    gpu_stats = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0
        for X, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            running_loss += loss.item() * X.size(0)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
        # Normalize by the samples actually seen: with drop_last=True the
        # loader yields fewer samples than len(train_loader.dataset).
        train_loss = running_loss / max(total, 1)
        train_acc = correct / max(total, 1)
        train_metrics.append((train_loss, train_acc))

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                outputs = model(X)
                loss = criterion(outputs, y)
                val_loss += loss.item() * X.size(0)
                preds = torch.argmax(outputs, dim=1)
                val_correct += (preds == y).sum().item()
                val_total += y.size(0)
        val_loss = val_loss / max(val_total, 1)
        val_acc = val_correct / max(val_total, 1)
        val_metrics.append((val_loss, val_acc))

        # GPU stats
        stats = get_gpu_stats()
        gpu_stats.append({'epoch': epoch, 'stats': stats})

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live parameters, which
            # later optimizer steps overwrite in place. Clone them so this
            # really is a snapshot of the best epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    if gpu_stats_log is not None:
        gpu_stats_log.extend(gpu_stats)
    # No snapshot exists when num_epochs == 0; leave the model untouched then.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model, train_metrics, val_metrics

In [None]:
# =========================
# Cell 6: Grid Search Implementation
# =========================

from itertools import product

def grid_search(train_ds, val_ds, grid_params, device='cuda', batch_size=128, num_epochs=3):
    """Train one CNN_BiLSTM_Attn per hyper-parameter combination and rank them.

    Args:
        train_ds, val_ds: datasets handed to ``DataLoader``.
        grid_params: dict with the keys ``'lstm_hidden'``, ``'num_heads'`` and
            ``'learning_rate'``, each mapping to a list of candidate values.
        device: device the models are trained on.
        batch_size: batch size for both loaders.
        num_epochs: epochs per combination.

    Returns:
        tuple ``(results_sorted, summary, gpu_stats_all)``:
        ``results_sorted`` is a list of dicts (``params``, ``val_acc``,
        ``model_state`` as CPU tensors, ``gpu_stats``) sorted by ``val_acc``
        descending; ``summary`` is the matching DataFrame with a ``top2`` flag;
        ``gpu_stats_all`` has one list per combination, in grid order (empty
        for skipped combinations).

    Side effects:
        Writes ``best_model_1.pt`` / ``best_model_2.pt`` and
        ``grid_search_results.csv`` to the working directory.

    Raises:
        ValueError: if no combination in the grid is trainable.
    """
    results = []
    gpu_stats_all = []
    param_names = list(grid_params.keys())
    param_combos = list(product(*grid_params.values()))
    # The loaders do not depend on the hyper-parameters, so build them once.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    for idx, combo in enumerate(param_combos):
        params = dict(zip(param_names, combo))
        print(f"\nGrid Search {idx+1}/{len(param_combos)}: {params}")
        # The model asserts that lstm_hidden*2 is divisible by num_heads.
        # Skip such combinations instead of aborting the whole search.
        if (params['lstm_hidden'] * 2) % params['num_heads'] != 0:
            print("  Skipped: lstm_hidden*2 must be divisible by num_heads")
            gpu_stats_all.append([])  # keep positions aligned with the grid order
            continue
        model = CNN_BiLSTM_Attn(
            vocab_size=50000,
            embed_dim=128,
            cnn_out=150,
            lstm_hidden=params['lstm_hidden'],
            num_heads=params['num_heads'],
            num_classes=5,
            dropout=0.3
        ).to(device)
        gpu_stats_log = []
        model, train_metrics, val_metrics = train_model(
            model, train_loader, val_loader,
            num_epochs=num_epochs, lr=params['learning_rate'],
            grad_clip=5.0, device=device, gpu_stats_log=gpu_stats_log
        )
        # NOTE(review): this is the LAST epoch's validation accuracy. If the
        # model returned by train_model carries best-epoch weights, consider
        # ranking by max(acc for _, acc in val_metrics) instead - confirm.
        val_acc = val_metrics[-1][1]
        results.append({
            'params': params,
            'val_acc': val_acc,
            # Keep a CPU copy so finished models do not keep device memory
            # alive for the rest of the search, and so the saved checkpoints
            # load on machines without a GPU.
            'model_state': {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            },
            'gpu_stats': gpu_stats_log
        })
        gpu_stats_all.append(gpu_stats_log)
    if not results:
        raise ValueError("grid_search: no valid hyper-parameter combination to train")
    # Sort and save top 2 models
    results_sorted = sorted(results, key=lambda x: x['val_acc'], reverse=True)
    for i, res in enumerate(results_sorted[:2]):
        torch.save(res['model_state'], f'best_model_{i+1}.pt')
    # Save grid search summary
    summary = pd.DataFrame(
        [{'lstm_hidden': r['params']['lstm_hidden'],
          'num_heads': r['params']['num_heads'],
          'learning_rate': r['params']['learning_rate'],
          'val_acc': r['val_acc']} for r in results_sorted],
        columns=['lstm_hidden', 'num_heads', 'learning_rate', 'val_acc'],
    )
    # Rows are already sorted best-first, so the first two positions are the top 2.
    summary['top2'] = summary.index < 2
    summary.to_csv('grid_search_results.csv', index=False)
    print("\nGrid Search Summary:")
    display(summary)
    return results_sorted, summary, gpu_stats_all

In [None]:
# =========================
# Cell 7: Prediction & Visualization
# =========================

def predict_sentiment(text, model, tokenizer, device='cuda'):
    """Predict the 1-5 rating class for one review and plot the class probabilities.

    The text is cleaned with ``preprocess_text``, encoded with ``tokenizer``,
    padded/truncated to ``MAX_LEN`` and run through ``model`` in eval mode.
    A bar chart of the five probabilities is shown and the prediction printed.

    Returns:
        tuple ``(pred_class, probs)``: the predicted class (1-based) and the
        flat numpy array of softmax probabilities.
    """
    model.eval()
    token_ids = pad_seq(tokenizer.encode(preprocess_text(text)).ids, MAX_LEN)
    # Wrap in a list to form a batch of size one.
    X = torch.tensor([token_ids], dtype=torch.long).to(device)
    with torch.no_grad():
        probs = F.softmax(model(X), dim=1).cpu().numpy().flatten()
    # argmax is 0-based; ratings are 1-based.
    pred_class = np.argmax(probs) + 1
    # Visualization
    plt.figure(figsize=(8,4))
    sns.barplot(x=[1,2,3,4,5], y=probs, palette='viridis')
    plt.title(f"Sentiment Prediction Probabilities\nPredicted Class: {pred_class} (Confidence: {probs[pred_class-1]:.2f})")
    plt.xlabel("Sentiment Class")
    plt.ylabel("Probability")
    plt.ylim(0,1)
    plt.show()
    print(f"Predicted Sentiment: {pred_class} (Confidence: {probs[pred_class-1]:.2f})")
    return pred_class, probs

In [None]:
# =========================
# Cell 8: GPU Utilization Visualization
# =========================

def plot_gpu_stats(gpu_stats_all):
    """Plot per-epoch GPU utilization and memory usage for every run.

    Args:
        gpu_stats_all: list of runs; each run is a list of
            ``{'epoch': int, 'stats': [per-GPU dict, ...]}`` entries.

    Saves ``gpu_utilization.png`` and ``gpu_memory_usage.png`` and shows both
    figures. Prints a message and returns early when there is nothing to plot.
    """
    def _flatten():
        # One row per (run, epoch, gpu) measurement.
        for run_idx, run in enumerate(gpu_stats_all):
            for epoch_stat in run:
                epoch = epoch_stat['epoch']
                for gpu_stat in epoch_stat['stats']:
                    yield {
                        'run': run_idx,
                        'epoch': epoch,
                        'gpu': gpu_stat['gpu'],
                        'utilization': gpu_stat['utilization'],
                        'mem_used': gpu_stat['mem_used'],
                        'mem_total': gpu_stat['mem_total']
                    }

    df_stats = pd.DataFrame(list(_flatten()))
    if df_stats.empty:
        print("No GPU stats to plot.")
        return

    # (column, title, y-axis label, output file) for each figure, in order.
    figures = (
        ('utilization', "GPU Utilization Over Epochs", "Utilization (%)", "gpu_utilization.png"),
        ('mem_used', "GPU Memory Usage Over Epochs", "Memory Used (MiB)", "gpu_memory_usage.png"),
    )
    for column, title, y_label, out_file in figures:
        plt.figure(figsize=(12,6))
        sns.lineplot(data=df_stats, x='epoch', y=column, hue='gpu', style='run', markers=True)
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel("Epoch")
        plt.savefig(out_file)
        plt.show()

In [None]:
# =========================
# Cell 9: Example Usage
# =========================

# Set device and grid search params
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): lstm_hidden=128 with num_heads=6 gives 2*128 % 6 != 0, which
# the model constructor's assert rejects - confirm this combination is
# handled by the search or drop it from the grid.
fast_grid_params = {
    'lstm_hidden': [128, 192],
    'num_heads': [6, 8],
    'learning_rate': [2e-3]
}

# Run grid search (uses 200K records, 3 epochs per run)
results_sorted, summary, gpu_stats_all = grid_search(
    train_ds, val_ds, fast_grid_params, device=device, batch_size=128, num_epochs=3
)

# Load best model for prediction
# The summary is sorted best-first, so row 0 holds the winning hyper-parameters.
best_model = CNN_BiLSTM_Attn(
    vocab_size=50000,
    embed_dim=128,
    cnn_out=150,
    lstm_hidden=int(summary.iloc[0]['lstm_hidden']),
    num_heads=int(summary.iloc[0]['num_heads']),
    num_classes=5,
    dropout=0.3
).to(device)
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
best_model.load_state_dict(torch.load('best_model_1.pt', map_location=device))

# Example prediction
sample_text = "This book was absolutely fantastic! I loved every page."
predict_sentiment(sample_text, best_model, bpe_tokenizer, device=device)

# Plot GPU stats
plot_gpu_stats(gpu_stats_all)