In [2]:
import os
import sys

# Make the local triper checkout importable from this notebook.
# Guarded so that re-running the cell does not keep appending the same
# directory to sys.path.
PROJECT_ROOT = '/home/wly/szl_all_code/triper-project'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from triper.model.builder import from_pretrained_components

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# `!export ...` runs in a temporary subshell, so the variable is discarded as
# soon as that shell exits and never reaches the kernel process. Set it on the
# kernel's own environment instead.
# NOTE(review): this only takes effect if it runs before CUDA is first
# initialised in this kernel — confirm nothing touches the GPU earlier.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,4"

In [4]:
# The same checkpoint directory is used both inside the audio config and as
# the audio encoder path, so keep it in one place.
glm_voice_tokenizer_dir = '/sda1/glm-4-voice-tokenizer'

# Settings for the audio branch (encoder type and projector layout).
audio_config = {
    'mm_audio_encoder': 'whisper_vq',
    'audio_hidden_size': 1280,  # Whisper output dimension
    'audio_model_path': glm_voice_tokenizer_dir,
    'audio_projector_type': 'mlp2x_gelu',
    'audio_projector_hidden_dim': 2048,
    'dropout': 0.1,
}

# Assemble the model from its pretrained parts. No projector checkpoint is
# given (audio_projector_path=None); the recorded log reports the projector as
# "Built from config". LLaVA is requested frozen.
components = from_pretrained_components(
    llava_model_path="/sda1/llava-v1.5-13b",
    audio_encoder_path=glm_voice_tokenizer_dir,
    audio_projector_path=None,
    audio_config=audio_config,
    freeze_llava=True,
    device_map="auto",
)
tokenizer, triper_model, image_processor, context_len, audio_encoder = components

# Last expression of the cell: display total / trainable parameter counts.
triper_model.get_parameter_stats()


🔄 Building Triper model from components...
   LLaVA model: /sda1/llava-v1.5-13b
   Audio encoder: /sda1/glm-4-voice-tokenizer
   Audio projector: Built from config
   Freeze LLaVA: True
🔄 Loading LLaVA model...


You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 3/3 [00:56<00:00, 18.70s/it]


✅ LLaVA model loaded: LlavaLlamaForCausalLM
🔄 Building audio encoder...
🔄 Building audio encoder: whisper_vq
✅ WhisperVQEncoder loaded from /sda1/glm-4-voice-tokenizer
   Actual hidden size: 1280
🔄 Moving audio encoder to device: cuda:0
🔒 Audio encoder parameters frozen
✅ Audio encoder built and moved to cuda:0: WhisperVQEncoder
🔄 Creating Triper model...
🔄 Building audio projector...
🔧 AudioProjector config:
  audio_hidden_size: 1280
  hidden_size: 5120
  projector_type: mlp2x_gelu
✅ AudioProjector created successfully
✅ Audio projector built: AudioProjector
✅ TriperModel initialized with config: triper
🔄 Moving Triper model to device: cuda:0
✅ LLaVA model attached: LlavaLlamaForCausalLM
🔒 LLaVA model parameters frozen
🎵 Audio encoder attached: WhisperVQEncoder
📦 Components set: tokenizer(LlamaTokenizer), processor(CLIPImageProcessor), context_len(2048)
✅ Triper model created successfully!

🏗️  Triper Model Summary
📦 Components:
  🦙 LLaVA: ✅ (LlavaLlamaForCausalLM)
  🎵 Audio Encoder: 

{'total_params': 13383627776,
 'trainable_params': 32788480,
 'components': {'llava': {'total': 13350839296, 'trainable': 0},
  'audio_projector': {'total': 32788480, 'trainable': 32788480}}}

In [3]:
# Run the audio encoder on a sample WAV file; the result is passed to the
# model as `audio_features` in a later cell.
audio_path = '/home/wly/szl_all_code/triper-project/tests/audio.wav'
audio_input = audio_encoder(audio_path)
# The recorded run printed torch.Size([1, 375, 1280]) for this clip.
print(f"Audio input shape: {audio_input.shape}")

🎵 Audio features device after extraction: cuda:0
Audio input shape: torch.Size([1, 375, 1280])


In [4]:
from PIL import Image
# Load the image
def load_image(image_path, size=(336, 336)):
    """Load an image from disk, convert it to RGB and resize it.

    Args:
        image_path: Path of the image file to open.
        size: Target ``(width, height)`` handed to ``Image.resize``.
            Defaults to ``(336, 336)``, the value that was previously
            hard-coded, so existing callers behave the same.

    Returns:
        A new RGB ``PIL.Image.Image`` resized to ``size``.
    """
    # Open through a context manager so the underlying file handle is
    # released deterministically; convert()/resize() return new images that
    # stay valid after the source file is closed.
    with Image.open(image_path) as source:
        return source.convert('RGB').resize(size)

# Load the sample picture used for the smoke test; `image` is consumed by the
# preprocessing cell further down.
image_path = '/home/wly/szl_all_code/triper-project/tests/cat.jpg'
image = load_image(image_path)
# PIL reports size as (width, height); the recorded run printed (336, 336).
print(f"Image size: {image.size}")

Image size: (336, 336)


In [5]:
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.mm_utils import get_model_name_from_path, tokenizer_image_token, process_images

# Model config, or None when the model object does not expose one.
model_cfg = getattr(triper_model, "config", None)

# Preprocess the single image, take the first result, then add a leading
# batch dimension.
image_tensor = process_images([image], image_processor, model_cfg=model_cfg)[0].unsqueeze(0)

prompt = "USER: <image>\n What's the content of the image? ASSISTANT:"

# Tokenize the prompt with the image token index and move the ids to the
# model's device.
input_ids = tokenizer_image_token(
    prompt,
    tokenizer=tokenizer,
    image_token_index=IMAGE_TOKEN_INDEX,
    return_tensors='pt',
)
input_ids = input_ids.to(triper_model.device)

# Add a batch dimension when the tokenizer returned a 1-D tensor.
if input_ids.ndim == 1:
    input_ids = input_ids.unsqueeze(0)
  

In [11]:
import torch

# Forward-pass smoke test over text + image + audio. No labels are passed and
# only the logits shape is inspected, so run without autograd: the audio
# projector has trainable parameters, and with gradients enabled the forward
# pass would record a graph that `result` then keeps alive.
with torch.no_grad():
    result = triper_model(
        input_ids=input_ids,
        images=image_tensor,
        audio_features=audio_input
    )

# Last expression of the cell: display the logits shape.
result['logits'].shape

🎵 Processing audio features:
  Input audio shape: torch.Size([1, 375, 1280])
  Input audio dtype: torch.float32
  Input audio device: cuda:0
🎵 Audio features device (passthrough): cuda:0
  Target dtype: torch.bfloat16
  Encoded audio dtype: torch.float32
  🔄 Converting encoded audio to torch.bfloat16
🎵 AudioProjector forward:
  Input shape: torch.Size([1, 375, 1280])
  Input dtype: torch.bfloat16
  Input device: cuda:0
  Model dtype: torch.float32
  🔄 Converting input from torch.bfloat16 to torch.float32
  Output shape: torch.Size([1, 375, 5120])
  Output dtype: torch.float32
  Output device: cuda:0
  Audio embeds shape: torch.Size([1, 375, 5120])
  Audio embeds dtype: torch.float32
  Inputs embeds shape: torch.Size([1, 597, 5120])
  Inputs embeds dtype: torch.bfloat16
  🔄 Converting audio embeds to match inputs_embeds dtype
  Final result shape: torch.Size([1, 972, 5120])
  Final result dtype: torch.bfloat16


torch.Size([1, 972, 32000])