In [1]:
import torch
import gc
import os
import sys

# 1. 清理GPU缓存
torch.cuda.empty_cache()
gc.collect()

# 2. 重新设置环境
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# 3. 重新导入模块
sys.path = [p for p in sys.path if 'triper-project' not in p]
sys.path.append('/home/wly/szl_all_code/triper-project')

# 4. 重新加载模型
from triper.model import from_pretrained_components
from triper.data import TriperDataset, TriperDataCollator

audio_config = {
    'mm_audio_encoder': 'whisper_vq',
    'audio_hidden_size': 1280,
    'audio_model_path': '/sda1/glm-4-voice-tokenizer',
    'audio_projector_type': 'mlp2x_gelu',
    'audio_projector_hidden_dim': 2048,
    'dropout': 0.1
}

tokenizer, triper_model, image_processor, context_len, audio_encoder = from_pretrained_components(
    llava_model_path="/sda1/llava-v1.5-13b",
    audio_encoder_path="/sda1/glm-4-voice-tokenizer",
    audio_projector_path=None,
    audio_config=audio_config,
    freeze_llava=True,
    device_map="cuda:0"  # 注意：使用cuda:0而不是cuda:3
)

# 5. 重新创建数据集和collator
dataset = TriperDataset(
    json_path='/home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json',
    media_root_path='/home/wly/szl_all_code/triper-project/data',
    mode="raw"
)

collator = TriperDataCollator(
    tokenizer=tokenizer,
    image_processor=image_processor,
    audio_processor=audio_encoder,
    model_cfg=triper_model.llava_model.config
)

# 6. 测试
single_sample = dataset[0]
batch_result = collator([single_sample])
batch_result = {k: v.to(triper_model.device) for k, v in batch_result.items()}

with torch.no_grad():
    output = triper_model(
        input_ids=batch_result['input_ids'],
        images=batch_result['images'],
        audio_features=batch_result['audio_features']
    )
    print("✅ 修复成功！")
    print(f"输出logits形状: {output['logits'].shape}")

  from .autonotebook import tqdm as notebook_tqdm
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


🔄 Building Triper model from components...
   LLaVA model: /sda1/llava-v1.5-13b
   Audio encoder: /sda1/glm-4-voice-tokenizer
   Audio projector: Built from config
   Freeze LLaVA: True
🔄 Loading LLaVA model...


Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.08s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


🔄 Configuring image tokens...
添加图像token: <image>


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


更新IMAGE_TOKEN_INDEX为: 32000
🔄 Building audio encoder...
✅ WhisperVQEncoder loaded from /sda1/glm-4-voice-tokenizer
🔄 Moving audio encoder to device: cuda:0
🔒 Audio encoder parameters frozen
✅ Audio encoder built and moved to cuda:0: WhisperVQEncoder
🔄 Creating Triper model...
🔄 Building audio projector...
🔧 AudioProjector config:
  audio_hidden_size: 1280
  hidden_size: 5120
  projector_type: mlp2x_gelu
✅ AudioProjector created successfully
✅ Audio projector built: AudioProjector
✅ TriperModel initialized with config: triper
🔄 Moving Triper model to device: cuda:0
✅ LLaVA model attached: LlavaLlamaForCausalLM
🔒 LLaVA model parameters frozen
🎵 Audio encoder attached: WhisperVQEncoder
📦 Components set: tokenizer(LlamaTokenizer), processor(CLIPImageProcessor), context_len(2048)
✅ Triper model created successfully!

🏗️  Triper Model Summary
📦 Components:
  🦙 LLaVA: ✅ (LlavaLlamaForCausalLM)
  🎵 Audio Encoder: ✅ (WhisperVQEncoder) 🔒 External (Frozen)
  🔗 Audio Projector: ✅ (AudioProjector) 

In [6]:
# 测试批量推理
batch_size = 16
batch_samples = [dataset[i] for i in range(batch_size)]
batch_result = collator(batch_samples)
batch_result = {k: v.to(triper_model.device) for k, v in batch_result.items()}
with torch.no_grad():
    output = triper_model(
        input_ids=batch_result['input_ids'],
        images=batch_result['images'],
        audio_features=batch_result['audio_features'],
        attention_mask=batch_result['attention_mask'],
        labels=batch_result.get('labels', None)
    )
    print("✅ 批量推理成功！")
    print(f"输出logits形状: {output['logits'].shape}")

📝 构建的对话文本（包含图像token）: <image>
Monica Geller (anger): There's nothing to tell! He's just some guy I work with!...
📝 构建的对话文本（包含图像token）: <image>
Joey Tribbiani (surprise): C'mon, you're going out with the guy! There's gotta be something ...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): So does he have a hump? A hump and a hairpiece?...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Phoebe Buffay (neutral): Wait, does he eat chalk?...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Phoebe Buffay (neutral): Just, 'cause, I don't want her to go through what I went through wi...
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (neutral): Okay, everybody relax. ...
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (neutral): This is not even a date. It's just two people going out to dinner a...
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): Sou

OutOfMemoryError: CUDA out of memory. Tried to allocate 264.00 MiB. GPU 0 has a total capacity of 47.53 GiB of which 95.88 MiB is free. Including non-PyTorch memory, this process has 47.42 GiB memory in use. Of the allocated memory 42.34 GiB is allocated by PyTorch, and 4.72 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)