In [1]:
!nvidia-smi

Thu Mar  6 08:56:47 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-32GB           On  | 00000000:86:00.0 Off |                  Off |
| N/A   29C    P0              27W / 150W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           On  | 00000000:D8:00.0 Off |  

# Cross-Modal Web Agent - Interactive Notebook

This notebook demonstrates how to use the cross-modal web agent for interactive experimentation.

In [2]:
# Install the package if needed
!pip install -e scribe_agent

import pickle
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"]="True"
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display, HTML
from transformers import AutoTokenizer
# Import from our package
from scribe_agent.models.cross_modal_model import CrossModalWebAgent
from scribe_agent.utils.visual_processor import VisualProcessor
from scribe_agent.utils.html_processor import process_html

Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///home/gridsan/vdixit/scribe/scribe_agent
  Preparing metadata (setup.py) ... [?25ldone
[0mINFO: pip is looking at multiple versions of scribe-agent to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Could not find a version that satisfies the requirement wandb>=0.15.0 (from scribe-agent) (from versions: none)[0m[31m
[31mERROR: No matching distribution found for wandb>=0.15.0[0m[31m
[0m

2025-03-06 08:57:55.067726: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-06 08:57:55.078997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-06 08:57:55.093343: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-06 08:57:55.097600: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 08:57:55.108457: I tensorflow/core/platform/cpu_feature_guar

## Setup Model and Processors

Initialize the model, tokenizer, and visual processor.

In [3]:
from accelerate import Accelerator
accelerator = Accelerator()

In [4]:
# Set the model path (update this to your trained model path)
MODEL_PATH = "checkpoints/best_model"  # Change to your actual model path

# For testing without a trained model, use the base models directly
TEXT_MODEL = "../Qwen2.5-7B-Instruct"
VISION_MODEL = "clip-vit-base-patch32"

# Check if we have a trained model
has_trained_model = os.path.exists(MODEL_PATH)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize tokenizer
if has_trained_model:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
else:
    tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
    
# Ensure padding token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Using device: cuda


In [5]:
# Initialize model
if has_trained_model:
    model = CrossModalWebAgent.from_pretrained(MODEL_PATH).to(device)
else:
    # For testing only: creates a new model without fine-tuning
    model = CrossModalWebAgent(
        text_model_name=TEXT_MODEL,
        vision_model_name=VISION_MODEL,
        use_lora=False  # Don't use LoRA for testing
    )
    
model.eval()
model.to(accelerator.device)
model.visual_projection.to(accelerator.device)
model.cross_attention.to(accelerator.device)
model.element_localization.to(accelerator.device)
# Initialize visual processor
visual_processor = VisualProcessor(vision_model_name=VISION_MODEL)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [6]:
from scribe_agent.data.mind2web_dataset import create_multimodal_mind2web_dataloader

In [7]:
num_processors = os.cpu_count()
num_processors

80

In [11]:
def custom_collate_fn(batch):
    """Custom collate function to handle complex nested structures."""
    if len(batch) == 0:
        return {}
    result = {}
    # Get all keys from all batch items
    all_keys = set()
    for item in batch:
        all_keys.update(item.keys())
    # Process each key
    for key in all_keys:
        # Skip items that don't have this key
        values = [item[key] for item in batch if key in item]
        if len(values) == 0:
            continue
        # Handle different types of values
        if isinstance(values[0], dict):
            # Create a new dictionary with batched values
            nested_dict = {}
            sub_keys = set()
            for v in values:
                sub_keys.update(v.keys())
            for sub_key in sub_keys:
                # Get values that have this sub_key
                sub_values = [v[sub_key] for v in values if sub_key in v]
                if len(sub_values) > 0:
                    if isinstance(sub_values[0], torch.Tensor):
                        # Stack tensors
                        try:
                            nested_dict[sub_key] = torch.stack(sub_values)
                        except:
                            # If can't stack (different sizes), keep as list
                            nested_dict[sub_key] = sub_values
                    else:
                        # Keep other types as lists
                        nested_dict[sub_key] = sub_values
            result[key] = nested_dict
        elif isinstance(values[0], torch.Tensor):
            # Stack tensors when possible
            try:
                result[key] = torch.stack(values)
            except:
                # If tensors have different sizes, keep as list
                result[key] = values
        else:
            # For other types, just keep as list
            result[key] = values
    return result

In [12]:
import time

In [13]:
# dataloader = create_multimodal_mind2web_dataloader(
#     data_dir='Multimodal-Mind2Web/data',
#     split="train",
#     tokenizer=tokenizer,
#     visual_processor=visual_processor,
#     batch_size=8,
#     num_workers=96,  # Set to number of CPU cores
#     cache_dir="cache",  # Enable caching
#     precompute_visuals=True  # Process visuals during load (slower initial load, faster training)
# )
# startime = time.time()
# dataloader = create_multimodal_mind2web_dataloader(
#     'Multimodal-Mind2Web/data',
#     "train",
#     tokenizer=tokenizer,
#     visual_processor=visual_processor,
# )
# print(time.time() - startime)
with open('dataset_full.pkl', 'rb') as inp:
    dataset = pickle.load(inp)

# Define split sizes
train_ratio = 0.9
val_ratio = 1 - train_ratio
train_size = int(train_ratio * len(dataset))
val_size = len(dataset) - train_size

# Split dataset
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)

# dataloader = DataLoader(
#         dataset,
#         batch_size=1, 
#         shuffle=True, num_workers=4, pin_memory=False, collate_fn=custom_collate_fn
#     )

In [14]:
# model = torch.compile(model)

In [15]:
len(val_loader)

737

In [16]:
# torch.save(dataloader, 'dataloader.pkl')
it = iter(val_loader)
first = next(it)

In [17]:
first.keys()

dict_keys(['visual_features', 'target_node_id', 'input_ids', 'step_id', 'attention_mask', 'task_id', 'labels'])

In [18]:
first["input_ids"].shape

torch.Size([1, 32768])

In [19]:
!nvidia-smi

Thu Mar  6 09:13:12 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-32GB           On  | 00000000:86:00.0 Off |                  Off |
| N/A   31C    P0              39W / 150W |   5678MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           On  | 00000000:D8:00.0 Off |  

## Process Example

Define a function to process an example and generate a prediction.

In [20]:
# from accelerate import Accelerator
# # Initialize accelerator
# # torch.multiprocessing.set_start_method('spawn', True)
# def acc_inf():
#     import bitsandbytes
    
#     model = accelerator.prepare(model)
#     model.eval()
#     with torch.no_grad():
#         outputs = model.forward(
#             input_ids=first["input_ids"].to(accelerator.device),
#             attention_mask=first["attention_mask"].to(accelerator.device),
#             # visual_features=inputs["visual_features"],
#             # max_new_tokens=100
#             )
#     return outputs
from accelerate import notebook_launcher
notebook_launcher(acc_inf, num_processes=2)

In [21]:
 model, first = accelerator.prepare(model, first)

In [22]:
# first['visual_features']['elements'][0]['126']['pixel_values'] = first['visual_features']['elements'][0]['126']['pixel_values'].to(device)
# first['visual_features']['full_image'][0]['pixel_values'] = first['visual_features']['full_image'][0]['pixel_values'].to(device)

In [24]:
# Generate output

starttime = time.time()
with torch.no_grad():
    output = model.forward(first['input_ids'], first['attention_mask'], visual_features = first['visual_features'], labels=first['labels'])
    print(time.time() - starttime)
torch.cuda.empty_cache()


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

In [23]:
# outputs = output['logits'].to('cpu')
first['input_ids'].to('cpu')
first['attention_mask'].to('cpu')
first['visual_features']
first['labels'].to('cpu')
# del output
del first
torch.cuda.empty_cache()

In [20]:
predicted_ids = torch.argmax(outputs, dim=-1)  # Shape: [batch_size, sequence_length]
predicted_ids.shape
decoded_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

In [21]:
predicted_ids.shape

torch.Size([1, 32768])

In [22]:
    
# # Generate full sequence (optional, if you want to extend beyond input)
# generated_ids = first['input_ids']
# for _ in range(10 - first['input_ids'].shape[1]):
#     with torch.no_grad():
#         outputs = model.forward(generated_ids.to(device), first['attention_mask'].to(device))
#     next_logits = outputs.logits[:, -1, :]  # Get logits for last position
#     next_token = torch.argmax(next_logits, dim=-1, keepdim=True)
#     generated_ids = torch.cat([generated_ids, next_token], dim=1)
    
# # Decode to text
decoded_text1 = tokenizer.decode(first['input_ids'][0, :], skip_special_tokens=True)
decoded_text1

'Objective: Add zyrtec to the cart for pickup at the nearest CVS to zip code 90028\nURL: \n\n<html backend_node_id="8519" node="1">\n<div backend_node_id="8595" node="2">\n<div backend_node_id="8598" node="3">\n<div backend_node_id="8605" node="4">\n<div backend_node_id="8608" node="5">\n<text backend_node_id="8609" node="6">Sign In</text>\n</div>\n<div backend_node_id="8613" node="7">\n<text backend_node_id="8614" node="8">Create an Account</text>\n</div>\n</div>\n<div backend_node_id="8615" node="9">\n<div backend_node_id="8619" node="10">\n<text backend_node_id="8620" node="11">Pharmacy</text>\n</div>\n<div backend_node_id="8624" node="12">\n<text backend_node_id="8625" node="13">MinuteClinic</text>\n</div>\n<div backend_node_id="8631" node="14">\n<text backend_node_id="8632" node="15">Shop</text>\n</div>\n<div backend_node_id="8636" node="16">\n<text backend_node_id="8637" node="17">ExtraCare</text>\n</div>\n<div backend_node_id="8643" node="18">\n<text backend_node_id="8644" node=

In [23]:
decoded_text

': To aucchinitec to a list.\n\n a.\n\n the pharmacy pharmacy pharmacy the code 90204.\n\nTo: httpshttps>\n="_id="1444" class_idhttps0  class_node_id="8522" node="2">\n  backend_node="="8596" node="3">\n<div backend_node_id="8601" node="4">\n<div backend_node_id="8608" node="5">\n<divol backend_node_id="8619" node="6">Z In</text>\n</div>\n</ backend_node_id="8611" node="7">\n<text backend_node_id="8614" node="8">Welcome Account account</text>\n</div>\n</div>\n</ backend_node_id="8617" node="9">\n<div backend_node_id="8618" node="10">\n<text backend_node_id="8620" node="11">Welcomearmacy</text>\n</div>\n<div backend_node_id="8623" node="12">\n<text backend_node_id="8625" node="13">FindClinic</text>\n</div>\n<div backend_node_id="8620" node="14">\n<text backend_node_id="8632" node="15">Health</text>\n</div>\n<div backend_node_id="8636" node="16">\n<text backend_node_id="8637" node="17">Pres Savings</text>\n</div>\n<div backend_node_id="8641" node="18">\n<text backend_node_id="8644" node=

In [24]:
decoded_text2 = tokenizer.decode(first['labels'][0, :], skip_special_tokens=True)
decoded_text2

'[img]  Zyrtec 24 Hour Allergy Relief Tablets with 10 mg C... -> CLICK'

In [25]:
import yaml
def load_config(config_path):
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config

In [26]:
config = load_config('scribe_agent/configs/multimodal_config.yaml')

In [29]:
# import wandb
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoConfig, get_scheduler

In [30]:
wandb.init(
                project=config["project_name"],
                name=config["run_name"],
                config=config
            )

NameError: name 'wandb' is not defined

In [33]:
# Set up optimizer
optimizer = AdamW(
    model.parameters(),
    lr=config["training"]["learning_rate"],
    weight_decay=config["training"]["weight_decay"]
)
    
    # Set up learning rate scheduler
num_update_steps_per_epoch = len(train_loader) // config["training"]["gradient_accumulation_steps"]
max_train_steps = config["training"]["num_epochs"] * num_update_steps_per_epoch
    
lr_scheduler = get_scheduler(
    name=config["training"]["lr_scheduler_type"],
    optimizer=optimizer,
    num_warmup_steps=config["training"]["warmup_steps"],
    num_training_steps=max_train_steps
)
    

In [34]:
# Training loop
global_step = 0
best_val_loss = float('inf')    

In [None]:
for epoch in range(config["training"]["num_epochs"]):
    model.train()
    train_loss = 0.0
    
    for step, batch in enumerate(train_dataloader):
        # Forward pass
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            visual_features=batch["visual_features"],
            labels=batch["labels"]
        )
        
        loss = outputs["loss"]
        
        # Scale loss for gradient accumulation
        loss = loss / config["training"]["gradient_accumulation_steps"]
        model.backward(loss)
        
        # Update weights
        if (step + 1) % config["training"]["gradient_accumulation_steps"] == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            
            # Log metrics
            if accelerator.is_main_process and global_step % config["training"]["logging_steps"] == 0:
                train_loss += loss.item() * config["training"]["gradient_accumulation_steps"]
                current_lr = optimizer.param_groups[0]["lr"]
                
                # Log to TensorBoard
                tb_writer.add_scalar("train/loss", train_loss / (step + 1), global_step)
                tb_writer.add_scalar("train/lr", current_lr, global_step)
                
                # Log to wandb
                if config["training"].get("use_wandb", False):
                    wandb.log({
                        "train/loss": train_loss / (step + 1),
                        "train/lr": current_lr,
                        "train/epoch": epoch,
                        "train/global_step": global_step
                    })
                
                print(f"Epoch: {epoch}, Step: {global_step}, Loss: {train_loss / (step + 1):.4f}, LR: {current_lr:.6f}")
        
        # Evaluate
        if global_step % config["training"]["eval_steps"] == 0:
            model.eval()
            val_loss = 0.0
            
            with torch.no_grad():
                for val_step, val_batch in enumerate(val_dataloader):
                    val_outputs = model(
                        input_ids=val_batch["input_ids"],
                        attention_mask=val_batch["attention_mask"],
                        visual_features=val_batch["visual_features"],
                        labels=val_batch["labels"]
                    )
                    
                    val_loss += val_outputs["loss"].item()
            
            val_loss /= len(val_dataloader)
            
            # Log validation metrics
            if accelerator.is_main_process:
                # Log to TensorBoard
                tb_writer.add_scalar("val/loss", val_loss, global_step)
                
                # Log to wandb
                if config["training"].get("use_wandb", False):
                    wandb.log({
                        "val/loss": val_loss,
                        "val/epoch": epoch,
                        "val/global_step": global_step
                    })
                
                print(f"Validation - Epoch: {epoch}, Step: {global_step}, Loss: {val_loss:.4f}")
                
                # Save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    
                    # Save checkpoint
                    accelerator.wait_for_everyone()
                    unwrapped_model = accelerator.unwrap_model(model)
                    
                    # Save model
                    model_path = os.path.join(config["output_dir"], "best_model")
                    os.makedirs(model_path, exist_ok=True)
                    unwrapped_model.save_pretrained(model_path)
                    
                    # Save tokenizer
                    tokenizer.save_pretrained(model_path)
                    
                    print(f"New best model saved with val_loss: {val_loss:.4f}")
            
            model.train()

## Analyze Cross-Modal Attention

You can inspect the cross-modal attention scores to understand how the model connects visual and textual elements.

In [None]:
def get_attention_scores(html_content, screenshot, task_description, url):
    # Process HTML
    processed_html, _ = process_html(html_content)
    
    # Convert screenshot to array if it's a PIL Image
    if isinstance(screenshot, Image.Image):
        screenshot_array = np.array(screenshot)
    else:
        screenshot_array = screenshot
    
    # Extract visual features
    visual_features = visual_processor.extract_visual_features(
        screenshot_array,
        {}  # Empty bounding boxes for demo
    )
    
    # Create prompt
    prompt = f"Objective: {task_description}\nURL: {url}\n"
    input_text = prompt + "\n" + processed_html
    
    # Tokenize input
    inputs = tokenizer(
        input_text,
        max_length=32768,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Move inputs to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Forward pass with attention outputs
    with torch.no_grad():
        # Get element scores if available
        if hasattr(model, 'cross_attention'):
            # Process text input
            text_outputs = model.text_model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                output_hidden_states=True,
                return_dict=True
            )
            
            text_hidden_states = text_outputs.hidden_states[-1]
            
            # Process visual features
            projected_visual_features = model.process_visual_features(visual_features)
            
            # Get attention by calling cross attention directly
            q = model.cross_attention.query(text_hidden_states)
            k = model.cross_attention.key(projected_visual_features)
            v = model.cross_attention.value(projected_visual_features)
            
            # Reshape for multi-head attention
            import einops
            q = einops.rearrange(q, 'b s (h d) -> b h s d', h=model.cross_attention.num_heads)
            k = einops.rearrange(k, 'b s (h d) -> b h s d', h=model.cross_attention.num_heads)
            
            # Compute attention scores
            scores = torch.matmul(q, k.transpose(-1, -2)) / (model.cross_attention.head_size ** 0.5)
            
            # Get attention weights
            import torch.nn.functional as F
            attn_weights = F.softmax(scores, dim=-1)
            
            # Get element localization scores
            enhanced_text_features = model.cross_attention(
                text_features=text_hidden_states,
                visual_features=projected_visual_features,
                attention_mask=inputs["attention_mask"]
            )
            
            element_scores = model.element_localization(
                enhanced_text_features, 
                text_hidden_states
            )
            
            return {
                'attention_weights': attn_weights.cpu().numpy(),
                'element_scores': element_scores.cpu().numpy(),
                'input_tokens': tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            }
        else:
            return {
                'attention_weights': None,
                'element_scores': None,
                'input_tokens': tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            }

# Example usage
# attention_info = get_attention_scores(
#    html_content="<html><body><button id='submit'>Submit</button></body></html>",
#    screenshot=np.zeros((100, 100, 3), dtype=np.uint8),  # Dummy image
#    task_description="Submit the form",
#    url="https://example.com/form"
# )
# 
# # Visualize attention (if available)
# if attention_info['attention_weights'] is not None:
#     # Plot attention heatmap for the first head
#     plt.figure(figsize=(10, 8))
#     plt.imshow(attention_info['attention_weights'][0, 0], cmap='viridis')
#     plt.colorbar()
#     plt.title('Cross-Modal Attention Weights (Head 0)')
#     plt.show()
# else:
#     print("Attention visualization not available in test mode")