## Import Test

In [6]:

import logging
import os
import warnings
import psutil
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional, Tuple, Type


import torch
import torch.distributed
from torch.distributed.device_mesh import init_device_mesh
import verl.utils.torch_functional as verl_F
from omegaconf import DictConfig, open_dict
from verl import DataProto
from verl.single_controller.base import Worker
from verl.single_controller.base.decorator import register, Dispatch
from verl.utils import hf_tokenizer, hf_processor
from verl.utils.debug import log_gpu_memory_usage
from verl.utils.fs import copy_to_local
from verl.utils.fsdp_utils import get_fsdp_wrap_policy, init_fn, get_init_weight_context_manager
from verl.utils.fsdp_utils import offload_fsdp_optimizer, offload_fsdp_model_to_cpu, load_fsdp_optimizer, \
    load_fsdp_model_to_gpu
from verl.utils.import_utils import import_external_libs
from verl.utils.model import compute_position_id_with_mask
from verl.utils.flops_counter import FlopsCounter
from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager
from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager

from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
from prismatic.models.action_heads import DiffusionActionHead_V1
from prismatic.models.backbones.llm.prompting import PurePromptBuilder
from prismatic.models.projectors import (
    NoisyActionProjector,
    ProprioProjector,
)
from prismatic.util.data_utils import PaddedCollatorForActionPrediction
from prismatic.vla.action_tokenizer import ActionTokenizer
from prismatic.models import load, load_vla

from codetiming import Timer

## Load Test

In [None]:
# Checkpoint directory that the model, processor and component weights are loaded from below.
ckpt_path='/202431205128/baseline/minivla-oft/openvla-oft/outputs/8.10/minivla+libero_4_task_suites_no_noops+b16+lr-0.0001+lora-r64+dropout-0.0--image_aug--v1--minivla--lora64a128--token_64--4_task--2025-08-10_11-15-13--50000_chkpt'
# Config JSON handed to AutoConfig.from_pretrained further down.
cfg_path='/202431205128/baseline/minivla-oft/pretrained_models/minivla/config.json'
# FSDP settings; read later through fsdp_config[...] / fsdp_config.get(...).
fsdp_config = {
    "wrap_policy": {
        # "transformer_layer_cls_to_wrap": None,
        "min_num_params": 0
    },
    "param_offload": False,
    "optimizer_offload": False,
    # A negative fsdp_size makes create_device_mesh build a 1-D mesh over all ranks.
    "fsdp_size": -1
}
# Optimizer / LoRA settings; read later through optim_config[...] / optim_config.get(...).
optim_config = {
    "lr": 1e-6,
    # A negative value means the warmup step count is derived from lr_warmup_steps_ratio.
    "lr_warmup_steps": -1,
    "lr_warmup_steps_ratio": 0.0,
    "min_lr_ratio": None,
    "warmup_style": "constant",
    "total_training_steps": -1,
    "weight_decay": 0.01,
    # lora_rank > 0 triggers the peft LoRA wrapping of the actor module.
    "lora_rank": 64,
    "lora_dropout": 0.0
}
# Forwarded to vision_backbone.set_num_images_in_input after the model is loaded.
num_images_in_input = 1
enable_gradient_checkpointing = False
trust_remote_code = False
use_liger = False
# 'actor' disables CPU offload for FSDP and enables optimizer/scheduler creation below.
role = 'actor'
from verl.utils.model import print_model_size
from verl.utils.torch_dtypes import PrecisionType
from transformers import AutoModelForCausalLM, AutoConfig, AutoModelForVision2Seq, AutoImageProcessor, AutoProcessor
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision, CPUOffload
from torch import optim
from experiments.robot.openvla_utils import update_auto_map, check_model_logic_mismatch, _load_dataset_stats, find_checkpoint_file, load_component_state_dict
# Register the OpenVLA config, image processor, processor and model classes with the
# transformers Auto* factories under the "openvla" model type / OpenVLAConfig.
AutoConfig.register("openvla", OpenVLAConfig)
AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)

import torch.distributed
# Work around distributed-initialization failures by supplying single-node defaults
if not torch.distributed.is_initialized():
    # Single-node environment variables; values already present for these three keys are kept
    os.environ.setdefault('RANK', '0')
    os.environ.setdefault('WORLD_SIZE', '1')
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    import random
    # Unlike the keys above, MASTER_PORT is always overwritten with a random port.
    # NOTE(review): the port is not checked for availability, so a collision is possible — confirm acceptable
    port = random.randint(12000, 65000)
    os.environ['MASTER_PORT'] = str(port)


    
    # Choose the backend: NCCL when CUDA is available, Gloo otherwise
    if torch.cuda.is_available():
        backend = 'nccl'
    else:
        backend = 'gloo'
        
    try:
        # rank/world_size are passed explicitly as a single-process group
        torch.distributed.init_process_group(backend=backend, rank=0, world_size=1)
        print(f"Initialized distributed with backend: {backend}")
    except Exception as e:
        # Best effort: the failure is reported and the script keeps going.
        # NOTE(review): later cells call torch.distributed.barrier() unguarded — verify they tolerate this path
        print(f"Failed to initialize distributed: {e}")
        print("Continuing without distributed training...")

def create_device_mesh(world_size, fsdp_size):
    if fsdp_size < 0 or fsdp_size >= world_size:
        device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
    else:
        device_mesh = init_device_mesh('cuda',
                                       mesh_shape=(world_size // fsdp_size, fsdp_size),
                                       mesh_dim_names=['ddp', 'fsdp'])
    return device_mesh

# build device mesh for FSDP
if torch.distributed.is_initialized():
    world_size = torch.distributed.get_world_size()
else:
    world_size = 1  # Fall back to a single process when no process group is initialized

print(f"World size: {world_size}")
# TODO(sgm): support FSDP hybrid shard for larger model
# fsdp_config is a plain dict here, so it is indexed by key rather than by attribute
device_mesh = create_device_mesh(world_size=world_size, fsdp_size=fsdp_config['fsdp_size'])

# NOTE(review): the world size is printed a second time here — presumably leftover debugging output
print(f"World size: {world_size}")
print(f"Device mesh: {device_mesh}")

# Project helpers run against the checkpoint directory before loading.
# Per the captured cell output, update_auto_map backs up and rewrites config.json in ckpt_path.
update_auto_map(ckpt_path)
check_model_logic_mismatch(ckpt_path)

# Resolve the model dtype: bfloat16 unless fsdp_config provides 'model_dtype'.
torch_dtype = fsdp_config.get('model_dtype', None)
if torch_dtype is None:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = PrecisionType.to_dtype(torch_dtype)

# override model kwargs
# The config is read from cfg_path, while the weights are later loaded from ckpt_path.
actor_model_config = AutoConfig.from_pretrained(cfg_path, trust_remote_code=trust_remote_code)
actor_model_config.attn_implementation='flash_attention_2'

print(f'Model config after override: {actor_model_config}')

# NOTE(fix me): tie_word_embedding causes meta_tensor init to hang
# Meta-tensor init is therefore requested only when the config does not tie word embeddings.
init_context = get_init_weight_context_manager(use_meta_tensor=not actor_model_config.tie_word_embeddings,
                                                mesh=device_mesh)

# Load the actor model under the weight-init context chosen above, with Python
# warnings suppressed for the duration of the load.
with init_context(), warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Only config types registered with AutoModelForVision2Seq are supported;
    # anything else is rejected up front.
    if type(actor_model_config) not in AutoModelForVision2Seq._model_mapping.keys():
        raise ValueError(f'{type(actor_model_config)} is not supported')
    actor_module_class = AutoModelForVision2Seq

    actor_module = actor_module_class.from_pretrained(pretrained_model_name_or_path=ckpt_path,
                                                      torch_dtype=torch_dtype,
                                                      attn_implementation='flash_attention_2',
                                                      low_cpu_mem_usage=False,
                                                      trust_remote_code=trust_remote_code)

    actor_module.vision_backbone.set_num_images_in_input(num_images_in_input)
    _load_dataset_stats(actor_module, ckpt_path)

    # Apply Liger kernel to the model if use_liger is set to True
    if use_liger:
        from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance
        _apply_liger_kernel_to_instance(model=actor_module)

    # optim_config is a plain dict here, so it is read with [] / .get()
    if optim_config is not None and optim_config.get('lora_rank', 0) > 0:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=optim_config['lora_rank'],
            lora_alpha=2 * optim_config['lora_rank'],
            lora_dropout=optim_config.get('lora_dropout', 0.0),
            target_modules="all-linear",
            init_lora_weights="gaussian",
        )
        actor_module = get_peft_model(actor_module, lora_config)
        # Re-enable gradients on every parameter whose name contains
        # "action_queries" after the peft wrapping.
        for name, param in actor_module.named_parameters():
            if "action_queries" in name:
                param.requires_grad = True
        actor_module.print_trainable_parameters()

    # Cast the (possibly LoRA-wrapped) module to the resolved dtype.
    actor_module.to(torch_dtype)

    if enable_gradient_checkpointing:
        actor_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})

# Synchronize ranks after model loading when a process group exists.
if torch.distributed.is_initialized():
    torch.distributed.barrier()

# Build the FSDP auto-wrap policy; is_lora mirrors the lora_rank > 0 check used when wrapping with peft.
# NOTE(review): optim_config is dereferenced without the `is not None` guard used elsewhere — confirm it is never None here
actor_auto_wrap_policy = get_fsdp_wrap_policy(module=actor_module, config=fsdp_config.get('wrap_policy', None), is_lora=optim_config.get('lora_rank', 0) > 0)

print(f'actor_wrap_policy: {actor_auto_wrap_policy}')

def get_sharding_strategy(device_mesh):
    """Pick the FSDP sharding strategy that matches a device mesh.

    Args:
        device_mesh: Object exposing an ``ndim`` attribute.

    Returns:
        ``ShardingStrategy.FULL_SHARD`` for a 1-D mesh and
        ``ShardingStrategy.HYBRID_SHARD`` for a 2-D mesh.

    Raises:
        NotImplementedError: If ``device_mesh.ndim`` is neither 1 nor 2.
    """
    from torch.distributed.fsdp import ShardingStrategy

    # Lookup table from mesh dimensionality to strategy.
    strategy_by_ndim = {
        1: ShardingStrategy.FULL_SHARD,
        2: ShardingStrategy.HYBRID_SHARD,
    }
    mesh_ndim = device_mesh.ndim
    if mesh_ndim not in strategy_by_ndim:
        raise NotImplementedError(f"Get device mesh ndim={device_mesh.ndim}, but only support 1 or 2")
    return strategy_by_ndim[mesh_ndim]

# The FSDP mesh is the device mesh built earlier; its dimensionality selects the sharding strategy.
fsdp_mesh = device_mesh
sharding_strategy = get_sharding_strategy(fsdp_mesh)

# Dimensions / step count passed to the projector and action-head constructors below.
ACTION_DIM = 7
PROPRIO_DIM = 8
NUM_DIFFUSION_STEPS = 50
from torch.nn.parallel import DistributedDataParallel as DDP
# note that we have to create model in fp32. Otherwise, the optimizer is in bf16, which is incorrect
# NOTE(review): the components below are nevertheless created in torch.bfloat16 — confirm which is intended
# TODO(zhangchi.usc1992): 1. support create from random initialized model. 2. Support init with FSDP directly
processor = AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code)

device = torch.cuda.current_device()
llm_dim = actor_module.llm_dim
print(f'actor_module.llm_dim_origin: {actor_module.llm_dim}')
# Proprioception projector: built, loaded from the checkpoint directory, then wrapped in DDP.
proprio_projector = ProprioProjector(
    llm_dim=llm_dim,
    proprio_dim=PROPRIO_DIM,
    ).to(device=device, dtype=torch.bfloat16)
proprio_projector_path = find_checkpoint_file(ckpt_path, "proprio_projector")
proprio_state_dict = load_component_state_dict(proprio_projector_path)
proprio_projector.load_state_dict(proprio_state_dict)
proprio_projector = DDP(proprio_projector, device_ids=[device], gradient_as_bucket_view=True, device_mesh=fsdp_mesh)

# Diffusion action head: built and loaded here; it is wrapped in FSDP (not DDP) further down.
action_head = DiffusionActionHead_V1(
        input_dim=llm_dim, hidden_dim=llm_dim, action_dim=ACTION_DIM, num_diffusion_steps=NUM_DIFFUSION_STEPS
    ).to(device=device, dtype=torch.bfloat16)
action_head_path = find_checkpoint_file(ckpt_path, "action_head")
action_head_state_dict = load_component_state_dict(action_head_path)
action_head.load_state_dict(action_head_state_dict)

# Noisy-action projector: built, loaded from the checkpoint directory, then wrapped in DDP.
noisy_action_projector = NoisyActionProjector(
    llm_dim=llm_dim).to(device=device, dtype=torch.bfloat16)
noisy_action_projector_path = find_checkpoint_file(ckpt_path, "noisy_action_projector")
noisy_action_projector_state_dict = load_component_state_dict(noisy_action_projector_path)
noisy_action_projector.load_state_dict(noisy_action_projector_state_dict)
noisy_action_projector = DDP(noisy_action_projector, device_ids=[device], gradient_as_bucket_view=True, device_mesh=fsdp_mesh)

# TODO: add transformer policy
# We force reference policy to use CPUOffload to save memory.
# We force turn off CPUOffload for actor because it causes incorrect results when using grad accumulation
cpu_offload = None if role == 'actor' else CPUOffload(offload_params=True)
# Wrap the actor model in FSDP using the auto-wrap policy and sharding strategy computed above.
actor_module_fsdp = FSDP(
    actor_module,
    cpu_offload=cpu_offload,
    param_init_fn=init_fn,
    use_orig_params=True,
    auto_wrap_policy=actor_auto_wrap_policy,
    device_id=torch.cuda.current_device(),
    sharding_strategy=sharding_strategy,  # zero3
    sync_module_states=True,
    device_mesh=device_mesh,
    forward_prefetch=False)

# Wrap the action head in FSDP with the same settings, but without an auto-wrap policy.
action_head_fsdp = FSDP(
    action_head,
    cpu_offload=cpu_offload,
    param_init_fn=init_fn,
    use_orig_params=True,
    device_id=torch.cuda.current_device(),
    sharding_strategy=sharding_strategy,  # zero3
    sync_module_states=True,
    device_mesh=device_mesh,
    forward_prefetch=False)
# Rebind action_head to the module held inside the FSDP wrapper.
action_head = action_head_fsdp._fsdp_wrapped_module

# TODO: add more optimizer args into config
# The optimizer and LR scheduler are only built for the actor role with an optimizer config.
if role == 'actor' and optim_config is not None:
    from verl.utils.torch_functional import get_constant_schedule_with_warmup
    # Gather every parameter that requires grad from the actor, action head and both projectors.
    trainable_params = [param for param in actor_module_fsdp.parameters() if param.requires_grad]
    trainable_params += [param for param in action_head_fsdp.parameters() if param.requires_grad]
    trainable_params += [param for param in noisy_action_projector.parameters() if param.requires_grad]
    trainable_params += [param for param in proprio_projector.parameters() if param.requires_grad]
    print(f"# total trainable params: {sum(p.numel() for p in trainable_params)}")
    
    # optim_config is a plain dict here, so it is read with [] / .get()
    actor_optimizer = optim.AdamW(trainable_params,
                                    lr=optim_config['lr'],
                                    betas=optim_config.get('betas', (0.9, 0.999)),
                                    weight_decay=optim_config.get('weight_decay', 1e-2))

    total_steps = optim_config.get('total_training_steps', 0)
    num_warmup_steps = int(optim_config.get('lr_warmup_steps', -1))
    # A negative warmup step count means: derive it from the warmup ratio and total steps.
    # NOTE(review): with a negative total_steps and a positive ratio this yields a negative warmup — confirm inputs
    if num_warmup_steps < 0:
        num_warmup_steps_ratio = optim_config.get('lr_warmup_steps_ratio', 0.)
        num_warmup_steps = int(num_warmup_steps_ratio * total_steps)

    print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}')

    actor_lr_scheduler = get_constant_schedule_with_warmup(optimizer=actor_optimizer,
                                                            num_warmup_steps=num_warmup_steps)
else:
    actor_optimizer = None
    actor_lr_scheduler = None

# Rebind actor_module to the module held inside the FSDP wrapper.
actor_module = actor_module_fsdp._fsdp_wrapped_module

# Switch every component to eval mode.
actor_module_fsdp.eval()
action_head_fsdp.eval()
proprio_projector.eval()
noisy_action_projector.eval()


# print(f"actor_module_fsdp: {actor_module_fsdp}, actor_module_optimizer: {actor_optimizer}, \
#       actor_lr_scheduler: {actor_lr_scheduler}, actor_module_config: {actor_model_config}")
print("Model and optimizer setup completed successfully!")




World size: 1
World size: 1
Device mesh: DeviceMesh([0], mesh_dim_names=('fsdp',))
Created backup of original config at: /202431205128/baseline/minivla-oft/openvla-oft/outputs/8.10/minivla+libero_4_task_suites_no_noops+b16+lr-0.0001+lora-r64+dropout-0.0--image_aug--v1--minivla--lora64a128--token_64--4_task--2025-08-10_11-15-13--50000_chkpt/config.json.back.20250815_063453
Updated config.json at: /202431205128/baseline/minivla-oft/openvla-oft/outputs/8.10/minivla+libero_4_task_suites_no_noops+b16+lr-0.0001+lora-r64+dropout-0.0--image_aug--v1--minivla--lora64a128--token_64--4_task--2025-08-10_11-15-13--50000_chkpt/config.json
Changes made:
  - Set AutoConfig to "configuration_prismatic.OpenVLAConfig"
  - Set AutoModelForVision2Seq to "modeling_prismatic.OpenVLAForActionPrediction"
Model config after override: OpenVLAConfig {
  "arch_specifier": "no-align+fused-gelu-mlp",
  "architectures": [
    "OpenVLAForActionPrediction"
  ],
  "attn_implementation": "flash_attention_2",
  "auto_map":

## Data Test

In [None]:
from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset, RLDSBatchTransform_V1
from torch.utils.data import DataLoader
# Build the action tokenizer and batch transform from the processor loaded earlier.
action_tokenizer = ActionTokenizer(processor.tokenizer)
batch_transform = RLDSBatchTransform_V1(
    action_tokenizer,
    processor.tokenizer,
    image_transform=processor.image_processor.apply_transform,
    prompt_builder_fn=PurePromptBuilder,
    use_wrist_image=False,
    use_proprio=True,
    use_minivla=True
)
# RLDS dataset; images are resized to the image_sizes stored on the wrapped model's config.
train_dataset = RLDSDataset(
    '/202431205128/baseline/minivla-oft/data/modified_libero_rlds',
    'libero_4_task_suites_no_noops',
    batch_transform,
    resize_resolution=tuple(actor_module_fsdp._fsdp_wrapped_module.config.image_sizes),
    shuffle_buffer_size=100_000,
    image_aug=True,
)
# Collator configured with the tokenizer's max length and pad token id, padding on the right.
collator = PaddedCollatorForActionPrediction(
    processor.tokenizer.model_max_length, processor.tokenizer.pad_token_id, padding_side="right"
)
dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    sampler=None,
    collate_fn=collator,
    num_workers=0,  # Important: Set to 0 if using RLDS, which uses its own parallelism
    )
print('all done!!!!!!!!!!')

2025-08-15 06:34:39.563679: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization


2025-08-15 06:34:39.710844: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2025-08-15 06:34:39.936328: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization


2025-08-15 06:34:40.072147: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2025-08-15 06:34:40.245824: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization


2025-08-15 06:34:40.373507: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization
2025-08-15 06:34:40.541930: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization


2025-08-15 06:34:40.669757: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization



######################################################################################
# Loading the following 4 datasets (incl. sampling weight):                         #
######################################################################################



2025-08-15 06:34:40.843573: I tensorflow/core/grappler/optimizers/data/replicate_on_split.cc:32] Running replicate on split optimization


Cause: could not parse the source code of <function apply_trajectory_transforms.<locals>.<lambda> at 0x7feb6017ab00>: no matching AST found among candidates:
# coding=utf-8
lambda x: tf.math.reduce_any(x['task']['language_instruction'] != '')
# coding=utf-8
lambda x: tf.math.reduce_all(tf.math.abs(x['action']) <= max_action)
# coding=utf-8
lambda x: tf.math.reduce_all(tf.math.abs(x['observation']['proprio']) <= max_proprio)


TypeError: in user code:


    TypeError: outer_factory.<locals>.inner_factory.<locals>.tf__subsample() missing 1 required positional argument: 'subsample_length'


## Save Test


In [6]:
from torch.distributed.fsdp import FullStateDictConfig, StateDictType
from peft import get_peft_model_state_dict, PeftModel

# Root directory that the checkpoint files of this save test are written to.
local_path = Path('/202431205128/baseline/COPY/MARVEL/checkpoints/test/actor')
# record the previous global step
global_step = 0

# Suffix shared by the per-component checkpoint file names written below.
ckpt_name_suffix = f'{global_step}_checkpoint.pt'
def local_mkdir(path):
    """Create a directory (and its parents) while holding a per-path file lock.

    Args:
        path: Directory to create, as ``str`` or ``os.PathLike``. A relative
            path is joined onto the current working directory.

    Returns:
        The path that was created. An absolute input is returned unchanged; a
        relative input is returned as the joined absolute path.

    Raises:
        OSError: If the directory cannot be created even in the unlocked
            fallback attempt.
    """
    import hashlib
    import tempfile

    if not os.path.isabs(path):
        path = os.path.join(os.getcwd(), path)

    # Derive the lock file name from a stable digest of the path. The builtin
    # hash() is salted per interpreter for strings, so separate processes would
    # each compute a different lock file and never actually exclude each other.
    path_digest = hashlib.sha256(os.fspath(path).encode("utf-8")).hexdigest()[:8]
    lock_path = os.path.join(tempfile.gettempdir(), f"ckpt_{path_digest}.lock")

    try:
        # Imported inside the try so that a missing filelock package degrades to
        # the same unlocked fallback as a failed lock acquisition.
        from filelock import FileLock
        with FileLock(lock_path, timeout=60):
            os.makedirs(path, exist_ok=True)
    except Exception as e:
        print(f"Warning: Failed to acquire lock for {path}: {e}")
        # Even if the lock is not acquired, try to create the directory
        os.makedirs(path, exist_ok=True)

    return path

# Create the output directories, then wait for all ranks before gathering state.
local_path = local_mkdir(local_path)
adapter_path = local_mkdir(local_path / 'lora_adapter')
torch.distributed.barrier()

# Full (unsharded) state dicts, configured with offload_to_cpu=True and rank0_only=True.
state_dict_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    with FSDP.state_dict_type(actor_module_fsdp, StateDictType.FULL_STATE_DICT, state_dict_cfg):
        model_state_dict = actor_module_fsdp._fsdp_wrapped_module.state_dict()
    # This rebinds action_head_state_dict, which earlier held the weights loaded from the checkpoint.
    with FSDP.state_dict_type(action_head_fsdp, StateDictType.FULL_STATE_DICT, state_dict_cfg):
        action_head_state_dict = action_head_fsdp._fsdp_wrapped_module.state_dict()
    
    # Only rank 0 writes files.
    if torch.distributed.get_rank() == 0:
        actor_module_fsdp._fsdp_wrapped_module.save_pretrained(adapter_path, state_dict=model_state_dict)
        # torch.save(model_state_dict, adapter_path / f'model--{ckpt_name_suffix}')

        torch.save(action_head_state_dict, local_path / f'action_head--{ckpt_name_suffix}')
        # NOTE(review): both projectors are DDP-wrapped, so these state dicts presumably carry a "module." key prefix — verify against the loader
        torch.save(noisy_action_projector.state_dict(), local_path / f'noisy_action_projector--{ckpt_name_suffix}')
        torch.save(proprio_projector.state_dict(), local_path / f'proprio_projector--{ckpt_name_suffix}')

# Keep the other ranks waiting until rank 0 has finished writing.
torch.distributed.barrier()


In [5]:
import torch
from pathlib import Path
import numpy as np
# Index of the current CUDA device; read as a module-level global by the comparison code below.
device = torch.cuda.current_device()

def _load_adapter_state_dict(model_path):
    """Load a state dict from an adapter directory or a single checkpoint file.

    For a directory the candidates are tried in order:
    ``adapter_model.safetensors``, ``pytorch_model.bin``, then the first
    ``*.pt`` file in sorted order. Any other path is passed to ``torch.load``.

    Raises:
        FileNotFoundError: If a directory contains none of the candidates.
    """
    root = Path(model_path)
    if not root.is_dir():
        # NOTE: torch.load unpickles its input; only point this at trusted checkpoints.
        return torch.load(model_path, map_location='cpu')

    safetensors_file = root / "adapter_model.safetensors"
    if safetensors_file.is_file():
        from safetensors.torch import load_file
        return load_file(safetensors_file)

    bin_file = root / "pytorch_model.bin"
    if bin_file.is_file():
        return torch.load(bin_file, map_location='cpu')

    # Sorted so that the chosen file does not depend on directory listing order.
    pt_files = sorted(root.glob("*.pt"))
    if pt_files:
        return torch.load(pt_files[0], map_location='cpu')
    raise FileNotFoundError(f"No valid model file found in {model_path}")


def compare_adapter_models(model1_path, model2_path, device=None):
    """Compare two adapter checkpoints and print a difference report.

    Args:
        model1_path: Adapter directory or checkpoint file of the first model.
        model2_path: Adapter directory or checkpoint file of the second model.
        device: Device the tensors are moved to for the comparison. Defaults to
            the current CUDA device when CUDA is available, otherwise the CPU.

    Returns:
        A dict mapping each key present in both checkpoints (with matching
        shapes) to a dict of metrics: ``max_abs_diff``, ``mean_abs_diff``,
        ``mse``, ``cosine_sim``, ``l2_norm_diff`` and ``relative_diff``.

    Raises:
        FileNotFoundError: If a checkpoint cannot be located.
    """
    if device is None:
        device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

    # Load both state dicts
    state_dict1 = _load_adapter_state_dict(model1_path)
    state_dict2 = _load_adapter_state_dict(model2_path)

    print(f"Model 1 keys: {len(state_dict1.keys())}")
    print(f"Model 2 keys: {len(state_dict2.keys())}")

    # Compare the key sets
    keys1 = set(state_dict1.keys())
    keys2 = set(state_dict2.keys())

    common_keys = keys1 & keys2
    only_in_1 = keys1 - keys2
    only_in_2 = keys2 - keys1

    print(f"\n=== 键的比较 ===")
    print(f"共同的键: {len(common_keys)}")
    print(f"只在模型1中的键: {len(only_in_1)}")
    print(f"只在模型2中的键: {len(only_in_2)}")

    if only_in_1:
        print("只在模型1中的键:", list(only_in_1)[:5])  # show the first 5
    if only_in_2:
        print("只在模型2中的键:", list(only_in_2)[:5])  # show the first 5

    # Compare the values of the shared keys
    differences = {}
    print(f"\n=== 数值差异分析 ===")
    diff_key_num = 0

    for key in sorted(common_keys):
        # Compare in float32 so low-precision storage dtypes (e.g. bfloat16)
        # do not add rounding error to the difference itself.
        tensor1 = state_dict1[key].to(device=device, dtype=torch.float32)
        tensor2 = state_dict2[key].to(device=device, dtype=torch.float32)

        if tensor1.shape != tensor2.shape:
            print(f"形状不匹配 {key}: {tensor1.shape} vs {tensor2.shape}")
            diff_key_num += 1
            continue

        diff = tensor1 - tensor2
        abs_diff = torch.abs(diff)

        # Count the key as different when any element differs by more than 1e-5
        max_abs_diff = abs_diff.max().item()
        if max_abs_diff > 1e-5:
            diff_key_num += 1

        # Norms are computed once and reused for both the L2 and relative metrics.
        diff_norm = torch.norm(diff).item()
        ref_norm = torch.norm(tensor1).item()

        differences[key] = {
            'max_abs_diff': max_abs_diff,
            'mean_abs_diff': abs_diff.mean().item(),
            'mse': torch.mean(diff ** 2).item(),
            'cosine_sim': torch.nn.functional.cosine_similarity(
                tensor1.flatten(), tensor2.flatten(), dim=0
            ).item(),
            'l2_norm_diff': diff_norm,
            'relative_diff': diff_norm / ref_norm if ref_norm > 0 else float('inf')
        }

    # Show the parameters with the largest differences
    sorted_by_max_diff = sorted(differences.items(),
                                key=lambda x: x[1]['max_abs_diff'],
                                reverse=True)

    print(f'差异的key个数：{diff_key_num}')
    print("差异最大的参数 (按最大绝对差异排序):")
    for i, (key, metrics) in enumerate(sorted_by_max_diff[:10]):  # show the top 10
        print(f"{i+1:2d}. {key}")
        print(f"    最大绝对差异: {metrics['max_abs_diff']:.6f}")
        print(f"    平均绝对差异: {metrics['mean_abs_diff']:.6f}")
        print(f"    余弦相似度: {metrics['cosine_sim']:.6f}")
        print(f"    相对差异: {metrics['relative_diff']:.6f}")
        print()

    # Overall statistics across all compared parameters
    if differences:
        all_max_diffs = [d['max_abs_diff'] for d in differences.values()]
        all_mean_diffs = [d['mean_abs_diff'] for d in differences.values()]
        all_cosine_sims = [d['cosine_sim'] for d in differences.values()]

        print(f"\n=== 总体统计 ===")
        print(f"最大绝对差异范围: {min(all_max_diffs):.6f} - {max(all_max_diffs):.6f}")
        print(f"平均绝对差异范围: {min(all_mean_diffs):.6f} - {max(all_mean_diffs):.6f}")
        print(f"余弦相似度范围: {min(all_cosine_sims):.6f} - {max(all_cosine_sims):.6f}")
        print(f"几乎相同的参数 (max_diff < 1e-6): {sum(1 for d in all_max_diffs if d < 1e-6)}")
        print(f"有小差异的参数 (1e-6 <= max_diff < 1e-3): {sum(1 for d in all_max_diffs if 1e-6 <= d < 1e-3)}")
        print(f"有大差异的参数 (max_diff >= 1e-3): {sum(1 for d in all_max_diffs if d >= 1e-3)}")

    return differences

# Usage example - adjust the paths as needed
model1_path = '/202431205128/baseline/COPY/MARVEL/checkpoints/test/actor/lora_adapter'
model2_path = "/202431205128/baseline/minivla-oft/openvla-oft/outputs/8.12/minivla+libero_4_task_suites_no_noops+b8+lr-0.0001+lora-r64+dropout-0.0--image_aug--v1--minivla--lora64a128--token_64--4_task--2025-08-12_15-17-58--20_chkpt/lora_adapter"

# Check that the paths exist before comparing
if Path(model1_path).exists():
    print(f"Model 1 文件存在: {model1_path}")
else:
    print(f"Model 1 文件不存在: {model1_path}")
    # List the contents of the parent directory to help locate the checkpoint
    parent_dir = Path(model1_path).parent
    if parent_dir.exists():
        print(f"目录中的文件: {list(parent_dir.glob('*'))}")

if Path(model2_path).exists():
    print(f"Model 2 路径存在: {model2_path}")
else:
    print(f"Model 2 路径不存在: {model2_path}")

# Run the comparison; any failure is reported instead of raised
try:
    differences = compare_adapter_models(model1_path, model2_path)
except Exception as e:
    print(f"比较过程中出错: {e}")
    print(f"请检查文件路径是否正确")

Model 1 文件存在: /202431205128/baseline/COPY/MARVEL/checkpoints/test/actor/lora_adapter
Model 2 路径存在: /202431205128/baseline/minivla-oft/openvla-oft/outputs/8.12/minivla+libero_4_task_suites_no_noops+b8+lr-0.0001+lora-r64+dropout-0.0--image_aug--v1--minivla--lora64a128--token_64--4_task--2025-08-12_15-17-58--20_chkpt/lora_adapter
Model 1 keys: 760
Model 2 keys: 760

=== 键的比较 ===
共同的键: 760
只在模型1中的键: 0
只在模型2中的键: 0

=== 数值差异分析 ===
差异的key个数：747
差异最大的参数 (按最大绝对差异排序):
 1. base_model.model.vision_backbone.fused_featurizer.blocks.24.mlp.fc2.lora_A.weight
    最大绝对差异: 0.127676
    平均绝对差异: 0.017630
    余弦相似度: -0.000157
    相对差异: 1.417156

 2. base_model.model.vision_backbone.fused_featurizer.blocks.4.mlp.fc2.lora_A.weight
    最大绝对差异: 0.119504
    平均绝对差异: 0.017627
    余弦相似度: -0.000922
    相对差异: 1.416366

 3. base_model.model.vision_backbone.featurizer.blocks.21.mlp.fc1.lora_A.weight
    最大绝对差异: 0.117411
    平均绝对差异: 0.017623
    余弦相似度: 0.004316
    相对差异: 1.415434

 4. base_model.model.language_model.mo

In [None]:
# 添加到比较代码之前
def analyze_model_source(model_path):
    """Print LoRA statistics that hint at whether an adapter has been trained.

    Statistics of the first three ``lora_A`` tensors are printed, then the
    first five ``lora_B`` tensors are checked for being all-zero. When that
    check is inconclusive (some zero, some not), up to ten further ``lora_B``
    tensors are sampled at random to estimate the zero fraction.

    Args:
        model_path: Adapter directory (containing ``adapter_model.safetensors``
            or ``pytorch_model.bin``) or a single checkpoint file.

    Returns:
        The loaded state dict.
    """
    root = Path(model_path)
    if root.is_dir():
        safetensors_file = root / "adapter_model.safetensors"
        if safetensors_file.is_file():
            from safetensors.torch import load_file
            state_dict = load_file(safetensors_file)
        else:
            # NOTE: torch.load unpickles its input; only use on trusted checkpoints.
            state_dict = torch.load(root / "pytorch_model.bin", map_location='cpu')
    else:
        state_dict = torch.load(model_path, map_location='cpu')

    def _is_all_zero(tensor):
        # True when every element is within 1e-8 of zero.
        return torch.allclose(tensor, torch.zeros_like(tensor), atol=1e-8)

    print(f"\n=== 分析模型: {model_path} ===")

    # Split the keys into LoRA A and LoRA B parameters
    lora_A_params = [k for k in state_dict.keys() if 'lora_A' in k]
    lora_B_params = [k for k in state_dict.keys() if 'lora_B' in k]

    print(f"LoRA A 参数数量: {len(lora_A_params)}")
    print(f"LoRA B 参数数量: {len(lora_B_params)}")

    # Statistics of the first few LoRA A parameters
    for i, key in enumerate(lora_A_params[:3]):
        tensor = state_dict[key]
        print(f"LoRA A {i+1}: {key}")
        print(f"  形状: {tensor.shape}")
        print(f"  均值: {tensor.mean().item():.6f}")
        print(f"  标准差: {tensor.std().item():.6f}")
        print(f"  最大值: {tensor.max().item():.6f}")
        print(f"  最小值: {tensor.min().item():.6f}")
        print()

    # Quick check of LoRA B: inspect only the first few, sample more if needed
    print("检查 LoRA B 参数初始化状态...")

    zero_b_count = 0
    checked_count = 0

    # 1. Check the first five LoRA B parameters
    for i, key in enumerate(lora_B_params[:5]):
        tensor = state_dict[key]
        is_zero = _is_all_zero(tensor)
        if is_zero:
            zero_b_count += 1
        checked_count += 1

        print(f"LoRA B {i+1}: {key}")
        print(f"  形状: {tensor.shape}")
        print(f"  是否为零: {is_zero}")
        print(f"  最大绝对值: {tensor.abs().max().item():.8f}")
        print()

    # 2. All zero -> likely freshly initialized; none zero -> likely trained
    if zero_b_count == checked_count:
        print(f"✓ 前{checked_count}个 LoRA B 参数都是零 → 可能是新初始化的模型")
        estimated_zero_total = len(lora_B_params)
    elif zero_b_count == 0:
        print(f"✓ 前{checked_count}个 LoRA B 参数都不是零 → 可能是训练后的模型")
        estimated_zero_total = 0
    else:
        # 3. Mixed result: sample more of the remaining parameters at random
        print(f"混合情况，抽样检查更多参数...")
        import random
        remaining_keys = lora_B_params[5:]
        # Size the sample from the remaining keys; with five or fewer LoRA B
        # tensors there is nothing left to sample and the count must not go
        # negative.
        sample_keys = random.sample(remaining_keys, min(10, len(remaining_keys)))

        for key in sample_keys:
            if _is_all_zero(state_dict[key]):
                zero_b_count += 1
            checked_count += 1

        estimated_zero_total = int((zero_b_count / checked_count) * len(lora_B_params))

    print(f"估计的零初始化 LoRA B 参数: ~{estimated_zero_total}/{len(lora_B_params)}")

    return state_dict

# Analyze both adapters; the returned state dicts are not kept
print("分析模型1 (新保存的):")
analyze_model_source(model1_path)

print("分析模型2 (训练后的):")
analyze_model_source(model2_path)

分析模型1 (新保存的):

=== 分析模型: /202431205128/baseline/COPY/MARVEL/checkpoints/test/actor/lora_adapter ===
LoRA A 参数数量: 380
LoRA B 参数数量: 380
LoRA A 1: base_model.model.language_model.model.layers.0.mlp.down_proj.lora_A.weight
  形状: torch.Size([64, 4864])
  均值: 0.000020
  标准差: 0.015625
  最大值: 0.069824
  最小值: -0.068848

LoRA A 2: base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_A.weight
  形状: torch.Size([64, 896])
  均值: -0.000020
  标准差: 0.015564
  最大值: 0.064453
  最小值: -0.063477

LoRA A 3: base_model.model.language_model.model.layers.0.mlp.up_proj.lora_A.weight
  形状: torch.Size([64, 896])
  均值: 0.000043
  标准差: 0.015625
  最大值: 0.068848
  最小值: -0.070312

检查 LoRA B 参数初始化状态...
LoRA B 1: base_model.model.language_model.model.layers.0.mlp.down_proj.lora_B.weight
  形状: torch.Size([896, 64])
  是否为零: True
  最大绝对值: 0.00000000

LoRA B 2: base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_B.weight
  形状: torch.Size([4864, 64])
  是否为零: True
  最大绝对值: 0.00000000

LoRA B 3: base_m

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fd2d7133d90>>
Traceback (most recent call last):
  File "/202431205128/baseline/COPY/MARVEL/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
