In [1]:
import torch
import gc
import os
import sys

sys.path = [p for p in sys.path if 'triper-project' not in p]
sys.path.append('/home/wly/szl_all_code/triper-project')

# 清理缓存
torch.cuda.empty_cache()
gc.collect()

# 加载模型
from triper.model import from_pretrained_components
from triper.data import TriperDataset, TriperDataCollator

audio_config = {
    'mm_audio_encoder': 'whisper_vq',
    'audio_hidden_size': 1280,
    'audio_model_path': '/sda1/glm-4-voice-tokenizer',
    'audio_projector_type': 'mlp2x_gelu',
    'audio_projector_hidden_dim': 2048,
    'dropout': 0.1
}

tokenizer, triper_model, image_processor, context_len, audio_encoder = from_pretrained_components(
    llava_model_path="/sda1/llava-v1.5-13b",
    audio_encoder_path="/sda1/glm-4-voice-tokenizer",
    audio_projector_path=None,
    audio_config=audio_config,
    freeze_llava=True,
    device_map="cuda:3"
)

  from .autonotebook import tqdm as notebook_tqdm
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


🔄 Building Triper model from components...
   LLaVA model: /sda1/llava-v1.5-13b
   Audio encoder: /sda1/glm-4-voice-tokenizer
   Audio projector: Built from config
   Freeze LLaVA: True
🔄 Loading LLaVA model...


Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.99s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


🔄 Configuring image tokens...
添加图像token: <image>


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


更新IMAGE_TOKEN_INDEX为: 32000
🔄 Building audio encoder...
✅ WhisperVQEncoder loaded from /sda1/glm-4-voice-tokenizer
🔄 Moving audio encoder to device: cuda:3


TriperModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


🔒 Audio encoder parameters frozen
✅ Audio encoder built and moved to cuda:3: WhisperVQEncoder
🔄 Creating Triper model...
🔄 Building audio projector...
🔧 AudioProjector config:
  audio_hidden_size: 1280
  hidden_size: 5120
  projector_type: mlp2x_gelu
✅ AudioProjector created successfully
✅ Audio projector built: AudioProjector
✅ TriperModel initialized with config: triper
🔄 Moving Triper model to device: cuda:3
✅ LLaVA model attached: LlavaLlamaForCausalLM
🔒 LLaVA model parameters frozen
🎵 Audio encoder attached: WhisperVQEncoder
📦 Components set: tokenizer(LlamaTokenizer), processor(CLIPImageProcessor), context_len(2048)
✅ Triper model created successfully!

🏗️  Triper Model Summary
📦 Components:
  🦙 LLaVA: ✅ (LlavaLlamaForCausalLM)
  🎵 Audio Encoder: ✅ (WhisperVQEncoder) 🔒 External (Frozen)
  🔗 Audio Projector: ✅ (AudioProjector) 🔓 Trainable
  📝 Tokenizer: ✅ (LlamaTokenizer) 🔒 External
  🖼️ Image Processor: ✅ (CLIPImageProcessor) 🔒 External

📊 Trainable Parameters:
  Total: 13,383,63

In [2]:
dataset = TriperDataset(
    json_path='/home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json',
    media_root_path='/home/wly/szl_all_code/triper-project/data',
)

collator = TriperDataCollator(
    tokenizer=tokenizer,
    image_processor=image_processor,
    audio_processor=audio_encoder,
    model_cfg=triper_model.llava_model.config
)



正在从以下路径加载数据集描述文件: /home/wly/szl_all_code/triper-project/data/simple_data_20_samples.json
发现 20 个数据样本。
数据集模式: raw
音频文件夹: /home/wly/szl_all_code/triper-project/data/audio
视频文件夹: /home/wly/szl_all_code/triper-project/data/video
图像文件夹: /home/wly/szl_all_code/triper-project/data/images


In [3]:
# 测试批量推理
batch_size = 16
batch_samples = [dataset[i] for i in range(batch_size)]
batch_result = collator(batch_samples)

# 🔧 安全移动到设备
device_batch = {}
for k, v in batch_result.items():
    if hasattr(v, 'to'):
        device_batch[k] = v.to(triper_model.device)
    else:
        print(f"保持原值: {k} 类型={type(v)}")
        device_batch[k] = v

with torch.no_grad():
    output = triper_model(
        input_ids=device_batch['input_ids'],
        images=device_batch['images'],
        audio_features=device_batch['audio_features']
        # 🔧 不传递attention_mask和labels，让模型自己处理
    )
    print("✅ 批量推理成功！")
    print(f"输出logits形状: {output['logits'].shape}")

📝 构建的对话文本（包含图像token）: <image>
Monica Geller (anger): There's nothing to tell! He's just some guy I work with!...
📝 构建的对话文本（包含图像token）: <image>
Joey Tribbiani (surprise): C'mon, you're going out with the guy! There's gotta be something ...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): So does he have a hump? A hump and a hairpiece?...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Phoebe Buffay (neutral): Wait, does he eat chalk?...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Phoebe Buffay (neutral): Just, 'cause, I don't want her to go through what I went through wi...
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (neutral): Okay, everybody relax. ...
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (neutral): This is not even a date. It's just two people going out to dinner a...
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): Sou

In [4]:
# 测试批量推理
batch_size = 16
batch_samples = [dataset[i] for i in range(batch_size)]
batch_result = collator(batch_samples)

# 🔧 安全移动到设备
device_batch = {}
for k, v in batch_result.items():
    if hasattr(v, 'to'):
        device_batch[k] = v.to(triper_model.device)
    else:
        print(f"保持原值: {k} 类型={type(v)}")
        device_batch[k] = v
        
# 在您的notebook中修改这部分
response = triper_model.generate(
    input_ids=device_batch['input_ids'],
    attention_mask=device_batch['attention_mask'],  # 现在可以直接传入
    images=device_batch['images'],
    audio_features=device_batch['audio_features'],
    max_new_tokens=50,
    temperature=0.1,
    do_sample=True,
    top_p=0.8,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

print(f"生成结果形状: {response.shape}")

# 🔧 修复：由于generate_with_embeds只返回新生成的tokens
# 我们需要根据实际情况来解码
if response.shape[1] == 50:  # 只是新生成的tokens
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    print(f"生成的文本: '{generated_text}'")
else:  # 完整序列
    original_len = device_batch['input_ids'].shape[1]
    generated_part = response[0, original_len:]
    generated_text = tokenizer.decode(generated_part, skip_special_tokens=True)
    print(f"生成的文本: '{generated_text}'")

📝 构建的对话文本（包含图像token）: <image>
Monica Geller (anger): There's nothing to tell! He's just some guy I work with!...
📝 构建的对话文本（包含图像token）: <image>
Joey Tribbiani (surprise): C'mon, you're going out with the guy! There's gotta be something ...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): So does he have a hump? A hump and a hairpiece?...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Phoebe Buffay (neutral): Wait, does he eat chalk?...
📝 构建的对话文本（包含图像token）: <image>
No conversation available....
📝 构建的对话文本（包含图像token）: <image>
Phoebe Buffay (neutral): Just, 'cause, I don't want her to go through what I went through wi...
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (neutral): Okay, everybody relax. ...
📝 构建的对话文本（包含图像token）: <image>
Monica Geller (neutral): This is not even a date. It's just two people going out to dinner a...
📝 构建的对话文本（包含图像token）: <image>
Chandler Bing (neutral): Sou

OutOfMemoryError: CUDA out of memory. Tried to allocate 290.00 MiB. GPU 3 has a total capacity of 47.53 GiB of which 225.88 MiB is free. Including non-PyTorch memory, this process has 47.29 GiB memory in use. Of the allocated memory 41.63 GiB is allocated by PyTorch, and 5.30 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
# 🔧 先测试单个样本，避免批量问题
print("🧪 测试单个样本生成...")

# 取第一个样本
single_sample = [dataset[10]]
single_batch = collator(single_sample)

# 移动到设备
single_device_batch = {}
for k, v in single_batch.items():
    if hasattr(v, 'to'):
        single_device_batch[k] = v.to(triper_model.device)
    else:
        single_device_batch[k] = v

print(f"单样本数据形状:")
print(f"  input_ids: {single_device_batch['input_ids'].shape}")
print(f"  attention_mask: {single_device_batch['attention_mask'].shape}")
print(f"  images: {single_device_batch['images'].shape}")
print(f"  audio_features: {single_device_batch['audio_features'].shape}")

# 🔧 修复attention_mask长度问题
text_len = single_device_batch['input_ids'].shape[1]
text_attention_mask = single_device_batch['attention_mask'][:, :text_len]

print(f"修复后attention_mask: {text_attention_mask.shape}")

# 测试生成
response = triper_model.generate(
    input_ids=single_device_batch['input_ids'],
    attention_mask=text_attention_mask,  # 使用修复后的attention_mask
    images=single_device_batch['images'],
    audio_features=single_device_batch['audio_features'],
    max_new_tokens=512,  # 先用少量token测试
    temperature=0.1,
    do_sample=False,  # 先用贪心搜索
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

print(f"生成结果形状: {response.shape}")

# 解码结果
original_len = single_device_batch['input_ids'].shape[1]
if response.shape[1] > original_len:
    # 返回完整序列，截取新生成的部分
    generated_part = response[0, original_len:]
    generated_text = tokenizer.decode(generated_part, skip_special_tokens=True)
    print(f"新生成的文本: '{generated_text}'")
else:
    # 只返回新生成的tokens
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    print(f"生成的文本: '{generated_text}'")

# 显示完整对话
full_text = tokenizer.decode(response[0], skip_special_tokens=True)
print(f"完整对话: '{full_text}'")

🧪 测试单个样本生成...
📝 对话预测格式:
<image>
USER: Based on what you see and hear in this scene, what would Chandler Bing say?
ASSISTANT:
📝 原始文本长度范围: 31 - 31
✅ 批量tokenization完成: input_ids shape: torch.Size([1, 31])
✅ 所有样本文本长度统一为: 31
✅ 图像处理成功: torch.Size([1, 3, 336, 336])
✅ 音频批量处理完成: torch.Size([1, 64, 1280])
单样本数据形状:
  input_ids: torch.Size([1, 31])
  attention_mask: torch.Size([1, 31])
  images: torch.Size([1, 3, 336, 336])
  audio_features: torch.Size([1, 64, 1280])
修复后attention_mask: torch.Size([1, 31])
🚀 TriperModel.generate called:
  input_ids: torch.Size([1, 31])
  images: torch.Size([1, 3, 336, 336])
  audio_features: torch.Size([1, 64, 1280])
🎵 检测到音频输入，准备多模态embeddings...
📸 LLaVA处理图像...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLaVA处理后embeds: torch.Size([1, 606, 5120])
🎵 集成音频特征...
🔄 AudioProjector forward called with input shape: torch.Size([1, 64, 1280])
🎵 音频特征插入完成:
  原始embeds: torch.Size([1, 606, 5120])
  音频embeds: torch.Size([1, 64, 5120])
  合并后embeds: torch.Size([1, 670, 5120])
  合并后attention_mask: torch.Size([1, 670])
最终embeds: torch.Size([1, 670, 5120])
最终attention_mask: torch.Size([1, 670])
🚀 调用LLaVA.generate with inputs_embeds...
生成结果形状: torch.Size([1, 1])
生成的文本: ''
完整对话: ''


In [5]:
pure_llava_response = triper_model.generate(
    input_ids=single_device_batch['input_ids'],
    attention_mask=text_attention_mask,
    images=single_device_batch['images'],
    audio_features=None,  # 不传音频
    max_new_tokens=50,
    min_length=single_device_batch['input_ids'].shape[1] + 10,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    early_stopping=False,
)

print(f"纯LLaVA生成结果: {pure_llava_response.shape}")

if pure_llava_response.shape[1] > original_len:
    pure_text = tokenizer.decode(pure_llava_response[0, original_len:], skip_special_tokens=True)
    print(f"纯LLaVA生成文本: '{pure_text}'")

NameError: name 'single_device_batch' is not defined

In [8]:
# 手动构建一个简单的prompt
simple_prompt = "USER: What do you see in this image?\nASSISTANT:"
simple_input_ids = tokenizer.encode(simple_prompt, return_tensors="pt").to(triper_model.device)

print(f"简单prompt: '{simple_prompt}'")
print(f"简单input_ids: {simple_input_ids.shape}")

# 测试极简生成
simple_response = triper_model.llava_model.generate(
    inputs=simple_input_ids,
    images=single_device_batch['images'],
    max_new_tokens=512,
    do_sample=False,  # 贪心
    temperature=None,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

print(f"单独图文生成结果: {simple_response.shape}")
if simple_response.shape[1] > simple_input_ids.shape[1]:
    simple_text = tokenizer.decode(simple_response[0, simple_input_ids.shape[1]:], skip_special_tokens=True)
    print(f"生成文本: '{simple_text}'")
else:
    print("❌ 生成也失败了")

简单prompt: 'USER: What do you see in this image?
ASSISTANT:'
简单input_ids: torch.Size([1, 17])
单独图文生成结果: torch.Size([1, 82])
生成文本: '. The person appears to be engaged in a conversation or using the phone for some purpose. The room has a cozy atmosphere, with a couch located in the background and a chair placed nearby. A potted plant can be seen in the corner of the room, adding a touch of greenery to the space.'


In [7]:
from triper.constants import DEFAULT_IMAGE_TOKEN
# 测试新的对话预测格式
print("🧪 测试对话预测格式...")

# 手动构建对话预测prompt
sample_data = dataset[10]
print(f"原始对话: {sample_data.get('conversation', [])}")

# 模拟新格式
if sample_data.get('conversation'):
    first_turn = sample_data['conversation'][0]
    speaker = first_turn.get('speaker', 'Person') 
    text = first_turn.get('text', '')
    
    prediction_prompt = f"{DEFAULT_IMAGE_TOKEN}\nUSER: In this scene, {speaker} says: '{text}'. What would be a natural response?\nASSISTANT:"
else:
    prediction_prompt = f"{DEFAULT_IMAGE_TOKEN}\nUSER: What conversation would happen in this scene?\nASSISTANT:"

prediction_ids = tokenizer.encode(prediction_prompt, return_tensors="pt").to(triper_model.device)

print(f"对话预测prompt: '{prediction_prompt}'")

# 测试生成
prediction_response = triper_model.llava_model.generate(
    inputs=prediction_ids,
    images=single_device_batch['images'],
    max_new_tokens=100,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

if prediction_response.shape[1] > prediction_ids.shape[1]:
    pred_text = tokenizer.decode(prediction_response[0, prediction_ids.shape[1]:], skip_special_tokens=True)
    print(f"预测的对话: '{pred_text}'")

🧪 测试对话预测格式...
原始对话: [{'speaker': 'Chandler Bing', 'text': 'Sounds like a date to me.', 'emotion': 'neutral'}]
对话预测prompt: '<image>
USER: In this scene, Chandler Bing says: 'Sounds like a date to me.'. What would be a natural response?
ASSISTANT:'
预测的对话: 'is ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais ais'
