In [1]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from uitars import rollout_to_messages
from peft import LoraConfig, get_peft_model
from generate_rollout_batch import deserialize_rollout

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
processor = AutoProcessor.from_pretrained("ByteDance-Seed/UI-TARS-1.5-7B")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "ByteDance-Seed/UI-TARS-1.5-7B", 
    device_map="cuda", 
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = False
model.gradient_checkpointing_enable()

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Loading checkpoint shards: 100%|██████████| 7/7 [00:08<00:00,  1.23s/it]


In [3]:
getattr(model.config, "_attn_implementation", "default (eager or sdpa)")

'flash_attention_2'

In [4]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],   # You may adjust to match the linear/attention modules in Qwen2_5_VL
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [5]:
model.base_model.model.model.visual.patch_embed.proj.weight.requires_grad

False

In [6]:
model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2_5_VLForConditionalGeneration(
      (model): Qwen2_5_VLModel(
        (visual): Qwen2_5_VisionTransformerPretrainedModel(
          (patch_embed): Qwen2_5_VisionPatchEmbed(
            (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
          (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
          (blocks): ModuleList(
            (0-31): 32 x Qwen2_5_VLVisionBlock(
              (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
              (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
              (attn): Qwen2_5_VLVisionAttention(
                (qkv): Linear(in_features=1280, out_features=3840, bias=True)
                (proj): Linear(in_features=1280, out_features=1280, bias=True)
              )
              (mlp): Qwen2_5_VLMLP(
                (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
                (up_proj): Linear(in_features=1280, out_feat

In [8]:
rollout = deserialize_rollout("../runs/20251031_204956/rollout_000/rollout.json")

In [13]:
messages = rollout_to_messages(rollout)
for message in messages:
    if type(message["content"]) != list:
        message["content"] = [{"type": "text", "text": message["content"]}]
    for block in message["content"]:
        if block["type"] == "image_url":
            block["type"] = "image"
            block["image"] = block["image_url"]["url"]
            del block["image_url"]

In [14]:
inputs = processor.apply_chat_template(messages)

In [15]:
inputs = processor.apply_chat_template(
	messages,
	#add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

In [16]:
inputs["labels"] = inputs.input_ids
inputs

{'input_ids': tensor([[151644,   8948,    198,  ..., 151653, 151645,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[1.4340, 1.4340, 1.4340,  ..., 2.1459, 2.1459, 2.1459],
        [1.4340, 1.4340, 1.4340,  ..., 2.1459, 2.1459, 2.1459],
        [1.4340, 1.4340, 1.4340,  ..., 2.1459, 2.1459, 2.1459],
        ...,
        [1.7114, 1.7114, 1.7114,  ..., 2.0464, 2.0464, 2.0464],
        [1.7114, 1.7114, 1.7114,  ..., 2.0464, 2.0464, 2.0464],
        [1.7114, 1.7114, 1.7114,  ..., 2.0464, 2.0464, 2.0464]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92],
        [ 1, 58, 92]], device='cuda:0'), 'labels': tensor([[151644,   8948,    198,  ..., 151653, 151645,    198]],
       device='cuda:0')}

In [17]:
inputs.input_ids.shape

torch.Size([1, 14522])

In [18]:
from torch.profiler import profile, ProfilerActivity

# 1. Enable memory profiling and capture the forward pass
with profile(activities=[
        ProfilerActivity.CPU, 
        ProfilerActivity.CUDA], 
    profile_memory=True, 
    record_shapes=True) as prof:
    
    # Run your problematic forward pass here
    # Ensure your model and input tensors are on the GPU
    #with torch.no_grad():
    output = model(**inputs)

# 2. Print a table summarizing the CUDA memory events
print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=15))

# 3. Optional: Export for a visual timeline (see Step 2)
# prof.export_chrome_trace("trace.json")



-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              aten::mul         0.36%       9.798ms         3.39%      92.147ms     134.718us     187.146ms         9.01%     208.133ms     304.287us     215.94 KB     215.94 KB     111.63 GB     111.63 G

In [20]:
output.loss.backward()

In [13]:
model.print_trainable_parameters()

trainable params: 5,046,272 || all params: 8,297,212,928 || trainable%: 0.0608


In [19]:
import torch
import gc

def get_gpu_tensors_sorted_by_size(device_id=0):
    """
    Retrieves all PyTorch tensors currently in GPU memory, calculates their size,
    and returns a sorted list of their properties.
    """
    print(f"--- Listing Live PyTorch Tensors on GPU {device_id} ---")
    
    tensor_info = []
    
    # 1. Iterate through all objects tracked by the Python garbage collector
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) and obj.is_cuda and obj.get_device() == device_id:
                # Calculate size in bytes: element_size * number_of_elements
                size_in_bytes = obj.element_size() * obj.nelement()
                
                tensor_info.append({
                    'size_MB': size_in_bytes / (1024 * 1024),
                    'shape': tuple(obj.size()),
                    'dtype': obj.dtype,
                    'requires_grad': obj.requires_grad,
                    # Optional: Add traceback/origin for more advanced debugging if needed
                })
        except Exception:
            # Safely skip objects that raise an error during property check
            continue

    # 2. Sort the list by size in descending order
    tensor_info.sort(key=lambda x: x['size_MB'], reverse=True)
    
    # 3. Print the sorted list
    print(f"Total Tensors Found: {len(tensor_info)}")
    print("-----------------------------------------------------------------------")
    print(f"{'Size (MB)':<12} {'Shape':<30} {'Dtype':<12} {'Grad':<5}")
    print("-----------------------------------------------------------------------")
    for info in tensor_info[:100]: # Only list the top 20 largest tensors
        size_str = f"{info['size_MB']:<11.3f}"
        shape_str = f"{str(info['shape']):<30}"
        dtype_str = f"{str(info['dtype']).split('.')[-1]:<12}"
        grad_str = f"{str(info['requires_grad']):<5}"
        print(f"{size_str} {shape_str} {dtype_str} {grad_str}")
    
    print("-----------------------------------------------------------------------")

    # Calculate total size in MB of all tensors
    total_size_mb = sum(info['size_MB'] for info in tensor_info)
    print(f"Total size of all tensors: {total_size_mb:.3f} MB")


    return tensor_info

# Example Usage:
# Run this function at the point of your crash (or near it)
_ = get_gpu_tensors_sorted_by_size(device_id=0)

--- Listing Live PyTorch Tensors on GPU 0 ---


  return isinstance(obj, torch.Tensor)


Total Tensors Found: 920
-----------------------------------------------------------------------
Size (MB)    Shape                          Dtype        Grad 
-----------------------------------------------------------------------
4211.947    (1, 14522, 152064)             bfloat16     True 
1039.500    (152064, 3584)                 bfloat16     False
1039.500    (152064, 3584)                 bfloat16     False
239.377     (53360, 1176)                  float32      False
198.543     (1, 14522, 3584)               float32      True 
143.626     (32016, 1176)                  float32      False
129.500     (18944, 3584)                  bfloat16     False
129.500     (18944, 3584)                  bfloat16     False
129.500     (3584, 18944)                  bfloat16     False
129.500     (18944, 3584)                  bfloat16     False
129.500     (18944, 3584)                  bfloat16     False
129.500     (3584, 18944)                  bfloat16     False
129.500     (18944, 3584