In [1]:
import os
import sys
sys.path.append('/home/wly/szl_all_code/triper-project')
from llava.mm_utils import get_model_name_from_path, tokenizer_image_token, process_images
from llava.model.builder import load_pretrained_model
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from tqdm import tqdm
import torch
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory holding the LLaVA checkpoint to load.
model_path = '/sda1/llava-v1.5-13b'

# One call returns the tokenizer, the model, the image processor and the
# context length. The model name is derived from the checkpoint path, no
# separate base model is supplied, and device placement is left to 'auto'.
(tokenizer, model, image_processor, context_len) = load_pretrained_model(
    model_path=model_path,
    model_name=get_model_name_from_path(model_path),
    model_base=None,
    device_map='auto',
)

You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.25s/it]


In [3]:
# Load the image
def load_image(image_path, size=(336, 336)):
    """Open an image file and return it as a resized RGB PIL image.

    Args:
        image_path: Anything ``Image.open`` accepts (typically a file path).
        size: ``(width, height)`` handed to ``resize``. Defaults to
            ``(336, 336)``, the value this notebook previously hard-coded.
            Pass ``None`` to keep the image at its original resolution.

    Returns:
        The converted (and, unless ``size`` is ``None``, resized) image.
    """
    # Image.open is lazy and keeps the underlying file open. The context
    # manager releases the handle as soon as convert() has produced its own
    # copy of the pixel data, instead of leaving that to garbage collection.
    with Image.open(image_path) as source:
        rgb_image = source.convert('RGB')

    if size is None:
        return rgb_image
    return rgb_image.resize(size)

# Sample picture used for every later cell in this notebook.
image_path = '/home/wly/szl_all_code/triper-project/tests/cat.jpg'

image = load_image(image_path=image_path)
print(f"Image size: {image.size}")

Image size: (336, 336)


In [4]:
# Model configuration, or None when the model exposes no ``config`` attribute.
model_cfg = getattr(model, "config", None)

# Preprocess the single image, keep the first result, and put a leading
# batch axis back on it so later cells receive a batched tensor.
image_tensor = process_images(
    [image],
    image_processor,
    model_cfg=model_cfg,
)[0].unsqueeze(0)



In [5]:
# Single-turn prompt containing the "<image>" placeholder.
prompt = "USER: <image>\n What's the content of the image? ASSISTANT:"

# Tokenize the text with the image-aware tokenizer helper, then move the
# resulting ids onto the model's device.
input_ids = tokenizer_image_token(
    prompt,
    tokenizer=tokenizer,
    image_token_index=IMAGE_TOKEN_INDEX,
    return_tensors='pt',
)
input_ids = input_ids.to(model.device)

# A 1-D result has no batch axis yet; add one in front.
if input_ids.dim() == 1:
    input_ids = input_ids.unsqueeze(0)

In [6]:
# 3. Prepare the multimodal inputs
# Gradients are not needed here, so the call runs under no_grad. The first
# element of the returned 6-tuple is discarded; the rest are kept as globals.
with torch.no_grad():
    _, position_ids, attention_mask, past_key_values, inputs_embeds, labels = (
        model.prepare_inputs_labels_for_multimodal(
            images=image_tensor,
            input_ids=input_ids,
            labels=None,
            attention_mask=None,
            position_ids=None,
            past_key_values=None,
        )
    )

# Last expression of the cell: display the shape of the fused embeddings.
inputs_embeds.shape

torch.Size([1, 597, 5120])

In [7]:
def get_multimodal_features(model, tokenizer, image_tensor, input_ids):
    """Fuse image and text inputs and collect the language model's hidden states.

    Args:
        model: Object exposing ``prepare_inputs_labels_for_multimodal`` and a
            callable ``model`` attribute (the inner language model).
        tokenizer: Unused; kept so existing callers keep working.
        image_tensor: Preprocessed image, either unbatched (3-D) or already
            batched (4-D). A batch axis is added only in the 3-D case.
        input_ids: Token ids, forwarded unchanged to the model.

    Returns:
        dict with keys ``'inputs_embeds'``, ``'hidden_states'`` (every layer's
        hidden states as returned by the inner model), ``'last_hidden_state'``,
        ``'attention_mask'`` and ``'position_ids'``.
    """
    # The notebook already batches image_tensor before calling this function,
    # so an unconditional unsqueeze(0) would hand the model a 5-D tensor.
    # Only add the batch axis when it is actually missing.
    if image_tensor.dim() == 3:
        images = image_tensor.unsqueeze(0)
    else:
        images = image_tensor

    with torch.no_grad():
        # 1. Build the fused embeddings. Only the position ids, the attention
        #    mask and the embeddings of the returned 6-tuple are needed here.
        (
            _,
            position_ids,
            attention_mask,
            _,
            inputs_embeds,
            _,
        ) = model.prepare_inputs_labels_for_multimodal(
            input_ids=input_ids,
            position_ids=None,
            attention_mask=None,
            past_key_values=None,
            labels=None,
            images=images,
        )

        # 2. Run the inner model on the embeddings and request all hidden states.
        outputs = model.model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_hidden_states=True,
            return_dict=True,
        )

    return {
        'inputs_embeds': inputs_embeds,
        'hidden_states': outputs.hidden_states,  # hidden states of all layers
        'last_hidden_state': outputs.last_hidden_state,
        'attention_mask': attention_mask,
        'position_ids': position_ids,
    }

# Extract the multimodal features
multimodal_features = get_multimodal_features(
    model=model,
    tokenizer=tokenizer,
    image_tensor=image_tensor,
    input_ids=input_ids,
)

# Report the embedding shape, the final hidden-state shape, the number of
# returned hidden-state tensors and the dtype, one item per line.
print(
    f"✅ 多模态特征提取完成",
    f"输入 embeddings shape: {multimodal_features['inputs_embeds'].shape}",
    f"最终隐藏状态 shape: {multimodal_features['last_hidden_state'].shape}",
    f"隐藏层数量: {len(multimodal_features['hidden_states'])}",
    f"数据类型: {multimodal_features['last_hidden_state'].dtype}",
    sep="\n",
)

✅ 多模态特征提取完成
输入 embeddings shape: torch.Size([1, 597, 5120])
最终隐藏状态 shape: torch.Size([1, 597, 5120])
隐藏层数量: 41
数据类型: torch.bfloat16


In [8]:
def generate_llava_response_correct(model, tokenizer, image_tensor, input_ids,
                                    max_new_tokens=200, image_sizes=None):
    """Run greedy generation using the argument names the model's ``generate`` accepts.

    Args:
        model: Object whose ``generate`` takes ``inputs``, ``images`` and
            ``image_sizes`` keyword arguments.
        tokenizer: Supplies ``pad_token_id`` (falling back to ``eos_token_id``
            when the pad id is ``None``).
        image_tensor: Preprocessed image tensor passed as ``images``.
        input_ids: Prompt token ids, passed as ``inputs`` (not ``input_ids``).
        max_new_tokens: Upper bound on the number of generated tokens.
        image_sizes: Optional list with one size entry per image. When omitted,
            the previous behaviour is kept: a single entry taken from the last
            two dimensions of ``image_tensor``.
            NOTE(review): that fallback is the size of the *preprocessed*
            tensor; upstream LLaVA examples appear to pass the original PIL
            ``image.size`` instead - confirm before using models that depend
            on the original resolution.

    Returns:
        Whatever ``model.generate`` returns (token ids to be decoded by the caller).
    """
    if image_sizes is None:
        image_sizes = [image_tensor.shape[-2:]]

    # An explicit ``is None`` test keeps a legitimate pad id of 0.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        # NOTE(review): use_cache=False is kept from the original; it may have
        # been chosen deliberately, so it is not changed here.
        return model.generate(
            inputs=input_ids,
            images=image_tensor,
            image_sizes=image_sizes,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=False,
            pad_token_id=pad_token_id,
        )

# Generate with the argument format the model expects
print("🤖 正在生成LLaVA响应（使用正确参数格式）...")
print(f"输入ID形状: {input_ids.shape}, 图像张量形状: {image_tensor.shape}")

generated_outputs = generate_llava_response_correct(
    model,
    tokenizer,
    image_tensor,
    input_ids,
)
print("✅ 正确参数格式成功！")

# Decode the generated ids back to text, dropping special tokens.
generated_text = tokenizer.batch_decode(
    generated_outputs,
    skip_special_tokens=True,
)

print("=== LLaVA 生成结果 ===")
for i, text in enumerate(generated_text):
    # Header, full decoded text and a separator, one per line.
    print(f"样本 {i+1}:", text, "-" * 50, sep="\n")

    # Only when the decoded text still contains the prompt is the part after
    # the last "ASSISTANT:" marker printed separately.
    if "ASSISTANT:" not in text:
        continue
    assistant_response = text.rpartition("ASSISTANT:")[2].strip()
    print(f"Assistant回答: {assistant_response}", "-" * 50, sep="\n")
            

 

🤖 正在生成LLaVA响应（使用正确参数格式）...
输入ID形状: torch.Size([1, 22]), 图像张量形状: torch.Size([1, 3, 336, 336])
✅ 正确参数格式成功！
=== LLaVA 生成结果 ===
样本 1:
The image features a cat standing on a wooden floor, eating food from a bowl.
--------------------------------------------------
