In [None]:
import os

os.environ["HF_TOKEN"]  = "-----"

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import re
import ast

def parse_output(raw_text: str):
    text = raw_text.strip()

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1:
        return "", ""

    json_block = text[start:end+1]

    ans_match = re.search(r'"answer"\s*:\s*"([^"]*)"', json_block)
    answer = ans_match.group(1) if ans_match else ""

    exp_match = re.search(r'"explanation"\s*:\s*"([^"]*)"', json_block, flags=re.S)
    if not exp_match:
        return answer, ""

    exp_raw = exp_match.group(1).strip()

    explanation = exp_raw
    if exp_raw.startswith("[") and exp_raw.endswith("]"):
        try:
            exp_list = ast.literal_eval(exp_raw)
            if isinstance(exp_list, list) and len(exp_list) > 0:
                explanation = exp_list[0]   
        except Exception:
            explanation = exp_raw  

    return answer, explanation


In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from PIL import Image
from peft import PeftModel
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

# ==== 1️⃣ 路径设置 ====
BASE_MODEL = "Qwen/Qwen3-VL-2B-Instruct"
LORA_PATH = "/home/org/JAIST/advanced_machine_learning/qwen3-vl-2b-instruct-trl-sft-CLEVR/0001"          
IMAGE_ROOT = "/home/org/JAIST/advanced_machine_learning/data/custom_dataset/custom_dataset/test"      
TEST_CSV = "/home/org/JAIST/advanced_machine_learning/data/custom_dataset/custom_dataset/test_non_labels.csv"                     
OUTPUT_CSV = "./results.csv"               

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(BASE_MODEL)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16
)
model = PeftModel.from_pretrained(model, LORA_PATH)
model.to(device)
model.eval()

test_df = pd.read_csv(TEST_CSV)

test_df["file"] = test_df["file"].apply(lambda x: os.path.join(IMAGE_ROOT, x))

# ==== 4️⃣ 推理 ====
results = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    image_path = row["file"]
    image = Image.open(image_path).convert("RGB")
    question = row["question"]
    messages = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": (
                        """You are Qwen, an expert vision-language reasoning assistant.
                        Your goal is to achieve the highest accuracy in visual reasoning tasks.
                        Follow this rigid process:
                        1. PERCEPTION: Identify specific objects, colors, shapes, materials, and their exact spatial relations.
                        2. REASONING: Connect the visual evidence logically to the question.
                        3. CONCLUSION: Formulate the final answer.

                        Output format (JSON):
                        {
                        "explanation": "<one short sentence, less than 30 words>",
                        "answer": "<one word only>"
                        }
                        """
                    ),
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)


    # 模型生成
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0,    
            do_sample=False
        )

    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]

    output_text = processor.decode(generated_ids, skip_special_tokens=True)

    answer, explanation = parse_output(output_text)

    results.append({
        "id": row["id"],
        "answer": answer,
        "explanation": explanation
    })

# ==== 5️⃣ 保存 ====
pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"✅ 推理完成，结果已保存到 {OUTPUT_CSV}")


  from .autonotebook import tqdm as notebook_tqdm


OSError: Qwen/Qwen3-VL-2B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [6]:
import pandas as pd

def fill_empty_answers(df: pd.DataFrame) -> pd.DataFrame:
    """
    将 DataFrame 中 answer 为空或仅空格的项替换为 "-1"。
    """
    df["answer"] = df["answer"].apply(
        lambda x: "-1" if (not isinstance(x, str) or x.strip() == "") else x.strip()
    )
    df["explanation"] = df["explanation"].apply(
        lambda x: "-1" if (not isinstance(x, str) or x.strip() == "") else x.strip()
    )
    return df


In [7]:
df = pd.read_csv("results.csv")
df = fill_empty_answers(df)
df.to_csv("results_filled.csv", index=False)
print("✅ 已将空 answer 填充为 -1，保存到 results_filled.csv")


✅ 已将空 answer 填充为 -1，保存到 results_filled.csv
