In [2]:
import os
import sys
sys.path.append('/home/wly/szl_all_code/triper-project')


In [3]:
!export CUDA_VISIBLE_DEVICES=1,3,4

In [4]:
from triper.model import from_pretrained_components
audio_config = {
    'mm_audio_encoder': 'whisper_vq',
    'audio_hidden_size': 1280,  # Whisper输出维度
    'audio_model_path': '/sda1/glm-4-voice-tokenizer',
    'audio_projector_type': 'mlp2x_gelu',
    'audio_projector_hidden_dim': 2048,
    'dropout': 0.1
}
tokenizer, triper_model, image_processor, context_len, audio_encoder = from_pretrained_components(
    llava_model_path="/sda1/llava-v1.5-13b",
    audio_encoder_path="/sda1/glm-4-voice-tokenizer",
    audio_projector_path=None,
    audio_config=audio_config,
    freeze_llava=True,
    device_map="auto"
)
triper_model.get_parameter_stats()


  from .autonotebook import tqdm as notebook_tqdm
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


🔄 Building Triper model from components...
   LLaVA model: /sda1/llava-v1.5-13b
   Audio encoder: /sda1/glm-4-voice-tokenizer
   Audio projector: Built from config
   Freeze LLaVA: True
🔄 Loading LLaVA model...


Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.20s/it]


✅ LLaVA model loaded: LlavaLlamaForCausalLM
🔄 Building audio encoder...
🔄 Building audio encoder: whisper_vq
✅ WhisperVQEncoder loaded from /sda1/glm-4-voice-tokenizer
   Actual hidden size: 1280
🔄 Moving audio encoder to device: cuda:0
🔒 Audio encoder parameters frozen
✅ Audio encoder built and moved to cuda:0: WhisperVQEncoder
🔄 Creating Triper model...
🔄 Building audio projector...
🔧 AudioProjector config:
  audio_hidden_size: 1280
  hidden_size: 5120
  projector_type: mlp2x_gelu
✅ AudioProjector created successfully
✅ Audio projector built: AudioProjector
✅ TriperModel initialized with config: triper
🔄 Moving Triper model to device: cuda:0
✅ LLaVA model attached: LlavaLlamaForCausalLM
🔒 LLaVA model parameters frozen
🎵 Audio encoder attached: WhisperVQEncoder
📦 Components set: tokenizer(LlamaTokenizer), processor(CLIPImageProcessor), context_len(2048)
✅ Triper model created successfully!

🏗️  Triper Model Summary
📦 Components:
  🦙 LLaVA: ✅ (LlavaLlamaForCausalLM)
  🎵 Audio Encoder: 

{'total_params': 13383627776,
 'trainable_params': 32788480,
 'components': {'llava': {'total': 13350839296, 'trainable': 0},
  'audio_projector': {'total': 32788480, 'trainable': 32788480}}}

In [5]:
from triper.data import TriperDataset, TriperDataCollator
from torch.utils.data import DataLoader
# 创建数据集（raw模式）
dataset = TriperDataset(
    json_path='/home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json',
    media_root_path='/home/wly/szl_all_code/triper-project/data',
    mode="raw"
)

# 创建collator（在这里传入processors）
collator = TriperDataCollator(
    tokenizer=tokenizer,
    image_processor=image_processor,
    audio_processor=audio_encoder
)

# 创建DataLoader
dataloader = DataLoader(
    dataset, 
    batch_size=1, 
    collate_fn=collator,
)

正在从以下路径加载数据集描述文件: /home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json
发现 20 个数据样本。
数据集模式: raw
音频文件夹: /home/wly/szl_all_code/triper-project/data/audio
视频文件夹: /home/wly/szl_all_code/triper-project/data/video
图像文件夹: /home/wly/szl_all_code/triper-project/data/images


In [None]:
# 测试单个样本
print("=== 测试单个样本 ===")
single_sample = dataset[0]
print("样本结构:", single_sample.keys())
print("有音频:", single_sample['has_audio'])
print("有图像:", single_sample['has_image'])

# 测试collator
print("\n=== 测试Collator ===")
samples = [dataset[i] for i in range(min(3, len(dataset)))]  # 取前3个样本
batch_result = collator(samples)

print("Batch结果:")
for key, value in batch_result.items():
    if hasattr(value, 'shape'):
        print(f"  {key}: {value.shape}")
    else:
        print(f"  {key}: {type(value)} (长度: {len(value) if hasattr(value, '__len__') else 'N/A'})")

# 测试DataLoader
print("\n=== 测试DataLoader ===")
for i, batch in enumerate(dataloader):
    print(f"Batch {i}:")
    for key, value in batch.items():
        if hasattr(value, 'shape'):
            print(f"  {key}: {value.shape}")
        else:
            print(f"  {key}: {type(value)}")
    triper_model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        labels=batch['labels'],
        images=batch['images'],
        audio_features=batch['audio_features']
    )  # 测试模型前向传播
    print("Batch前向传播成功！")
    if i >= 1:  # 只测试前2个batch
        break

In [6]:
import torch
# 测试单个样本的推理
print("\n=== 测试单个样本推理 ===")
single_sample = dataset[0]
batch_result = collator([single_sample])
print("单个样本结构:", single_sample.keys())
with torch.cuda.amp.autocast():  # 使用混合精度
    output = triper_model(
        input_ids=batch_result['input_ids'],
        attention_mask=batch_result['attention_mask'],
        labels=batch_result['labels'],
        images=batch_result['images'],
        audio_features=batch_result['audio_features']
    )
print("单个样本推理成功！输出结构:", output.keys())
print("输出logits形状:", output['logits'].shape)


=== 测试单个样本推理 ===
❌ 图像处理失败: 


Traceback (most recent call last):
  File "/home/wly/.conda/envs/triper/lib/python3.10/site-packages/transformers/feature_extraction_utils.py", line 92, in __getattr__
    return self.data[item]
KeyError: 'shape'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/wly/szl_all_code/triper-project/triper/data/triper_collator.py", line 75, in __call__
    print(f"🖼️ Image batch shape: {batch['images'].shape}")
  File "/home/wly/.conda/envs/triper/lib/python3.10/site-packages/transformers/feature_extraction_utils.py", line 94, in __getattr__
    raise AttributeError
AttributeError


🎵 Audio feature shape: torch.Size([64, 1280])
🎵 Final audio batch shape: torch.Size([1, 64, 1280])
单个样本结构: dict_keys(['id', 'audio_path', 'image_path', 'conversation', 'scene_description', 'metadata', 'has_audio', 'has_image'])


  with torch.cuda.amp.autocast():  # 使用混合精度


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 2 has a total capacity of 47.53 GiB of which 10.88 MiB is free. Process 170426 has 44.23 GiB memory in use. Including non-PyTorch memory, this process has 3.27 GiB memory in use. Of the allocated memory 3.01 GiB is allocated by PyTorch, and 3.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)