In [None]:
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images


# Checkpoint identifier handed to both `from_pretrained` calls below.
model_path = "deepseek-ai/deepseek-vl-7b-chat"

# The chat processor turns conversations + images into model inputs; its
# tokenizer is also exposed under its own name for decoding generated ids.
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# `trust_remote_code=True` permits transformers to run model code that is
# distributed with the checkpoint.
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
)
# Cast to bfloat16, move to CUDA device index 1, then switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16)
vl_gpt = vl_gpt.to('cuda:1')
vl_gpt = vl_gpt.eval()


In [None]:
import os
import torch
import json
from glob import glob


# --- Configuration -----------------------------------------------------------
adv_dir = "..."                          # directory containing the adversarial *.png images
vqa_json_path = "coco300_vqa_main.json"  # list of {"image": <name>, "vqa": [<question>, ...]}
output_json = "deepseek_eval_res.json"   # where the collected responses are written
max_images = 100                         # evaluate at most this many images
questions_per_image = 3                  # VQA questions asked per image (padded/truncated)
caption_prompt = "Describe this image in one sentence."


def _ask_model(image_path, prompt_text, max_new_tokens=512):
    """Ask the model one single-turn question about one image.

    Builds a User/Assistant conversation whose user turn is
    ``<image_placeholder>`` followed by ``prompt_text``, runs greedy decoding
    and returns the decoded text.

    Args:
        image_path: Path of the image attached to the user turn.
        prompt_text: Text placed after the image placeholder (may be empty).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded response string with special tokens stripped.
    """
    conversation = [
        {
            "role": "User",
            "content": f"<image_placeholder>{prompt_text}",
            "images": [image_path],
        },
        {
            "role": "Assistant",
            "content": "",
        },
    ]

    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
    ).to(vl_gpt.device)

    # Evaluation only: no gradients are needed, so do not let autograd track
    # the embedding / generation computations.
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
            use_cache=True,
        )

    return tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)


# --- Load the VQA questions --------------------------------------------------
# Read as UTF-8 explicitly so the input is decoded the same way the output
# file below is encoded, independent of the platform locale.
with open(vqa_json_path, 'r', encoding='utf-8') as f:
    vqa_data = json.load(f)

image_to_questions = {item['image']: item['vqa'] for item in vqa_data}

# Sorted so the evaluated subset and the result order are reproducible.
adv_files = sorted(glob(os.path.join(adv_dir, "*.png")))

# --- Run the evaluation ------------------------------------------------------
results = []

for adv_path in adv_files[:max_images]:
    filename = os.path.basename(adv_path)

    # The lookup uses the exact basename (including the ".png" extension).
    # NOTE(review): if the JSON keys use another extension (e.g. ".jpg"),
    # every lookup misses and only empty questions are asked - confirm.
    if filename not in image_to_questions:
        print(f"WARNING: no VQA entry for {filename}; using empty questions")
    questions = image_to_questions.get(filename, [""] * questions_per_image)

    # Pad with empty strings / truncate so there are exactly
    # `questions_per_image` questions.
    questions = (questions + [""] * questions_per_image)[:questions_per_image]

    # Prompt 1 is the fixed caption request; prompts 2..4 are the VQA questions.
    prompts = [caption_prompt] + questions

    adv_responses = []
    for q_idx, prompt_text in enumerate(prompts, start=1):
        adv_response = _ask_model(adv_path, prompt_text)
        print(f'ADV [{filename}] Q{q_idx}: {adv_response}')
        adv_responses.append(adv_response)

    # One record per image: "filename" followed by adversarial_response_1..N.
    record = {"filename": filename}
    for q_idx, adv_response in enumerate(adv_responses, start=1):
        record[f"adversarial_response_{q_idx}"] = adv_response
    results.append(record)

# --- Persist the results -----------------------------------------------------
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print(f"Results saved to {output_json}")