In [1]:
import os
import sys
sys.path.append('/home/wly/szl_all_code/triper-project')


In [2]:
from triper.model import from_pretrained_components
audio_config = {
    'mm_audio_encoder': 'whisper_vq',
    'audio_hidden_size': 1280,  # Whisper输出维度
    'audio_model_path': '/sda1/glm-4-voice-tokenizer',
    'audio_projector_type': 'mlp2x_gelu',
    'audio_projector_hidden_dim': 2048,
    'dropout': 0.1
}
tokenizer, triper_model, image_processor, context_len, audio_encoder = from_pretrained_components(
    llava_model_path="/sda1/llava-v1.5-13b",
    audio_encoder_path="/sda1/glm-4-voice-tokenizer",
    audio_projector_path=None,
    audio_config=audio_config,
    freeze_llava=True,
    device_map="cuda:2"
)
triper_model.get_parameter_stats()


  from .autonotebook import tqdm as notebook_tqdm
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


🔄 Building Triper model from components...
   LLaVA model: /sda1/llava-v1.5-13b
   Audio encoder: /sda1/glm-4-voice-tokenizer
   Audio projector: Built from config
   Freeze LLaVA: True
🔄 Loading LLaVA model...


Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.11s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


🔄 Configuring image tokens...
添加图像token: <image>


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


更新IMAGE_TOKEN_INDEX为: 32000
🔄 Building audio encoder...
✅ WhisperVQEncoder loaded from /sda1/glm-4-voice-tokenizer
🔄 Moving audio encoder to device: cuda:2
🔒 Audio encoder parameters frozen
✅ Audio encoder built and moved to cuda:2: WhisperVQEncoder
🔄 Creating Triper model...
🔄 Building audio projector...
🔧 AudioProjector config:
  audio_hidden_size: 1280
  hidden_size: 5120
  projector_type: mlp2x_gelu
✅ AudioProjector created successfully
✅ Audio projector built: AudioProjector
✅ TriperModel initialized with config: triper
🔄 Moving Triper model to device: cuda:2
✅ LLaVA model attached: LlavaLlamaForCausalLM
🔒 LLaVA model parameters frozen
🎵 Audio encoder attached: WhisperVQEncoder
📦 Components set: tokenizer(LlamaTokenizer), processor(CLIPImageProcessor), context_len(2048)
✅ Triper model created successfully!

🏗️  Triper Model Summary
📦 Components:
  🦙 LLaVA: ✅ (LlavaLlamaForCausalLM)
  🎵 Audio Encoder: ✅ (WhisperVQEncoder) 🔒 External (Frozen)
  🔗 Audio Projector: ✅ (AudioProjector) 

{'total_params': 13383638016,
 'trainable_params': 32788480,
 'components': {'llava': {'total': 13350849536, 'trainable': 0},
  'audio_projector': {'total': 32788480, 'trainable': 32788480}}}

In [3]:
from triper.data import TriperDataset, TriperDataCollator
from torch.utils.data import DataLoader
# 创建数据集
dataset = TriperDataset(
    json_path='/home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json',
    media_root_path='/home/wly/szl_all_code/triper-project/data',
    mode="raw"
)

# 创建collator（在这里传入正确的model_cfg）
collator = TriperDataCollator(
    tokenizer=tokenizer,
    image_processor=image_processor,
    audio_processor=audio_encoder,
    model_cfg=triper_model.llava_model.config
)

# 创建DataLoader
dataloader = DataLoader(
    dataset, 
    batch_size=4, 
    collate_fn=collator,
)

正在从以下路径加载数据集描述文件: /home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json
发现 20 个数据样本。
数据集模式: raw
音频文件夹: /home/wly/szl_all_code/triper-project/data/audio
视频文件夹: /home/wly/szl_all_code/triper-project/data/video
图像文件夹: /home/wly/szl_all_code/triper-project/data/images


In [None]:
import torch

# 更安全的测试代码
print("=== 测试数据流程 ===")

# 在推理前监控显存
import torch
print("推理前显存使用:")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.memory_allocated(i)/1024**3:.2f}GB / {torch.cuda.memory_reserved(i)/1024**3:.2f}GB")

# 清理缓存
torch.cuda.empty_cache()

# 测试单个样本
single_sample = dataset[0]
print("样本结构:", single_sample.keys())

# 测试collator
try:
    batch_result = collator([single_sample])
    
    print("\n✅ Collator处理成功！")
    print("Batch结果:")
    for key, value in batch_result.items():
        if isinstance(value, torch.Tensor):
            print(f"  {key}: {value.shape} (dtype: {value.dtype})")
        else:
            print(f"  {key}: {type(value)}")
    
    # 验证张量格式
    print("\n=== 验证张量 ===")
    for key, value in batch_result.items():
        if isinstance(value, torch.Tensor):
            print(f"✅ {key}: 是有效的torch.Tensor")
        else:
            print(f"❌ {key}: 不是torch.Tensor，类型为 {type(value)}")
            
except Exception as e:
    print(f"❌ Collator失败: {e}")
    import traceback
    traceback.print_exc()

=== 测试数据流程 ===
推理前显存使用:
GPU 0: 0.00GB / 0.00GB
GPU 1: 0.00GB / 0.00GB
GPU 2: 0.00GB / 0.00GB
GPU 3: 26.44GB / 27.84GB
GPU 4: 0.00GB / 0.00GB
样本结构: dict_keys(['id', 'audio_path', 'image_path', 'conversation', 'scene_description', 'metadata', 'has_audio', 'has_image'])
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (anger): There's nothing to tell! He's just some guy I work with!...
📝 批量tokenization完成: input_ids shape: torch.Size([1, 29])
🖼️ LLaVA process_images 处理成功，shape: torch.Size([1, 3, 336, 336])
🎵 Final audio batch shape: torch.Size([1, 64, 1280])
📏 序列长度: 文本=29, 图像=576, 音频=64, 总计=669
📏 Labels shape: torch.Size([1, 29])

✅ Collator处理成功！
Batch结果:
  input_ids: torch.Size([1, 29]) (dtype: torch.int64)
  attention_mask: torch.Size([1, 29]) (dtype: torch.int64)
  labels: torch.Size([1, 29]) (dtype: torch.int64)
  images: torch.Size([1, 3, 336, 336]) (dtype: torch.float32)
  audio_features: torch.Size([1, 64, 1280]) (dtype: torch.float32)

=== 验证张量 ===
✅ input_ids: 是有效的torch.Tensor
✅ attent

In [6]:
import torch
# 在推理代码中使用
print("\n=== 测试单个样本推理 ===")
single_sample = dataset[0]
batch_result = collator([single_sample])
print("单个样本结构:", single_sample.keys())

with torch.no_grad():
    batch_result = {k: v.to(triper_model.device) for k, v in batch_result.items()}
    
    try:
        output = triper_model(
            input_ids=batch_result['input_ids'],
            attention_mask=batch_result['attention_mask'],
            images=batch_result['images'],
            audio_features=batch_result['audio_features']
        )
        print("单个样本推理成功！输出结构:", output.keys())
        print("输出logits形状:", output['logits'].shape)
    except RuntimeError as e:
        print(f"推理失败: {e}")
        if "device" in str(e).lower():
            print("这可能是多GPU模型的设备分布问题")


=== 测试单个样本推理 ===
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (anger): There's nothing to tell! He's just some guy I work with!...
📝 批量tokenization完成: input_ids shape: torch.Size([1, 29])
🖼️ LLaVA process_images 处理成功，shape: torch.Size([1, 3, 336, 336])
🎵 Final audio batch shape: torch.Size([1, 64, 1280])
📏 序列长度: 文本=29, 图像=576, 音频=64, 总计=669
📏 Labels shape: torch.Size([1, 29])
单个样本结构: dict_keys(['id', 'audio_path', 'image_path', 'conversation', 'scene_description', 'metadata', 'has_audio', 'has_image'])
初始input ids形状: torch.Size([1, 29])
images shape: torch.Size([1, 3, 336, 336])
处理图像后, embedding形状: torch.Size([1, 29, 5120])
  🔄 Converting encoded audio to torch.bfloat16
🔄 AudioProjector forward called with input shape: torch.Size([1, 64, 1280])
  Inputs embeds shape: torch.Size([1, 29, 5120])
  Inputs embeds dtype: torch.bfloat16
  🔄 Converting audio embeds to match inputs_embeds dtype
  Final result shape: torch.Size([1, 93, 5120])
  Final result dtype: torch.bfloat16
插入音频后, 嵌入形状: tor

In [None]:
# 测试 collator
print("\n=== 测试批量数据推理 ===")
for batch in dataloader:
    try:
        batch_result = {k: v.to(triper_model.device) for k, v in batch.items()}
        
        output = triper_model(
            input_ids=batch_result['input_ids'],
            images=batch_result['images'],
            audio_features=batch_result['audio_features'],
            attention_mask=batch_result['attention_mask'],
        )
        
        print("批量推理成功！输出结构:", output.keys())
        print("输出logits形状:", output['logits'].shape)
        
        break
    
    except RuntimeError as e:
        print(f"批量推理失败: {e}")
        if "device" in str(e).lower():
            print("这可能是多GPU模型的设备分布问题")


=== 测试批量数据推理 ===
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (anger): There's nothing to tell! He's just some guy I work with!...
📝 构建的对话文本（包含图像token）: <image>
Joey Tribbiani (surprise): C'mon, you're going out with the guy! There's gotta be something ...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): So does he have a hump? A hump and a hairpiece?...
📝 批量tokenization完成: input_ids shape: torch.Size([4, 38])
🖼️ LLaVA process_images 处理成功，shape: torch.Size([4, 3, 336, 336])
🎵 Final audio batch shape: torch.Size([4, 64, 1280])
📏 序列长度: 文本=38, 图像=576, 音频=64, 总计=678
📏 Labels shape: torch.Size([4, 38])
初始input ids形状: torch.Size([4, 38])
images shape: torch.Size([4, 3, 336, 336])
处理图像后, embedding形状: torch.Size([4, 38, 5120])
  🔄 Converting encoded audio to torch.bfloat16
🔄 AudioProjector forward called with input shape: torch.Size([4, 64, 1280])
  Inputs embeds shape: torch.Size([4, 38, 5120])
  Inputs embeds dtype: torch.bflo

In [7]:
# %%
# 🔧 设备状态诊断和修复
import torch
print("🔍 当前设备状态:")
print(f"CUDA设备数量: {torch.cuda.device_count()}")
print(f"当前设备: {torch.cuda.current_device()}")

# 检查模型实际所在设备
triper_device = next(triper_model.parameters()).device
llava_device = next(triper_model.llava_model.parameters()).device
print(f"Triper模型设备: {triper_device}")
print(f"LLaVA模型设备: {llava_device}")

# 🚨 如果设备不一致，强制移动到cuda:0
if triper_device != torch.device('cuda:0'):
    print(f"⚠️ 检测到设备不一致，正在移动模型到cuda:0...")
    triper_model = triper_model.to('cuda:0')
    print(f"✅ 模型已移动到: {next(triper_model.parameters()).device}")

# 验证设备一致性
print(f"最终设备状态:")
print(f"  Triper: {next(triper_model.parameters()).device}")
print(f"  LLaVA: {next(triper_model.llava_model.parameters()).device}")
print(f"  Audio: {next(triper_model.audio_encoder.parameters()).device}")

🔍 当前设备状态:
CUDA设备数量: 5
当前设备: 0
Triper模型设备: cuda:2
LLaVA模型设备: cuda:2
⚠️ 检测到设备不一致，正在移动模型到cuda:0...


OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 62.56 MiB is free. Including non-PyTorch memory, this process has 23.56 GiB memory in use. Of the allocated memory 23.19 GiB is allocated by PyTorch, and 2.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)