In [14]:
import json
import random
from collections import defaultdict
import os

# ======= 用户自定义配置区域 =======
json_path = 'arxivqa.jsonl'                # 输入数据路径（支持 .json or .jsonl）
train_cats = ['physics', 'cs']             # 训练和验证用的两个分类
zeroshot_cat = 'physics'                  # Zero-Shot 测试的分类
output_dir = 'split_result'        # 输出目录
seed = 42                                  # 随机种子
max_samples_per_class = 1000               # 每个训练/验证类别最大样本数
max_zeroshot_samples = 200                # Zero-shot 分类最大样本数
# ===================================

def extract_category(example):
    return example['id'].split('-')[0]

def load_json_or_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        first_line = f.readline()
        f.seek(0)
        if first_line.strip().startswith('{'):
            return [json.loads(line.strip()) for line in f if line.strip()]
        else:
            return json.load(f)

def split_data(data, train_cats, zeroshot_cat, seed=42, max_per_class=None, max_zero=None):
    random.seed(seed)
    categorized = defaultdict(list)

    for item in data:
        cat = extract_category(item)
        categorized[cat].append(item)

    train_data = []
    val_data = []
    test_data = []

    for cat in train_cats:
        items = categorized[cat]
        random.shuffle(items)
        if max_per_class is not None:
            items = items[:max_per_class]
        n = len(items)
        train_end = int(0.8 * n)
        val_end = int(0.9 * n)
        train_data.extend(items[:train_end])
        val_data.extend(items[train_end:val_end])

    # Zero-shot 测试数据（也限制最大数量）
    test_items = categorized[zeroshot_cat]
    random.shuffle(test_items)
    if max_zero is not None:
        test_items = test_items[:max_zero]
    test_data = test_items

    return train_data, val_data, test_data

def save_as_jsonl(data, path):
    with open(path, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

def main():
    data = load_json_or_jsonl(json_path)
    train_data, val_data, test_data = split_data(
        data,
        train_cats,
        zeroshot_cat,
        seed=seed,
        max_per_class=max_samples_per_class,
        max_zero=max_zeroshot_samples
    )

    os.makedirs(output_dir, exist_ok=True)

    save_as_jsonl(train_data, os.path.join(output_dir, 'train.jsonl'))
    save_as_jsonl(val_data, os.path.join(output_dir, 'val.jsonl'))
    save_as_jsonl(test_data, os.path.join(output_dir, 'test_zeroshot.jsonl'))

    print(f"✅ 数据划分完成，输出目录：{output_dir}")
    print(f"训练集大小: {len(train_data)}")
    print(f"验证集大小: {len(val_data)}")
    print(f"Zero-Shot 测试集大小: {len(test_data)}")

if __name__ == '__main__':
    main()


✅ 数据划分完成，输出目录：split_result
训练集大小: 1600
验证集大小: 200
Zero-Shot 测试集大小: 200


In [4]:
import json
from collections import Counter


def extract_category(example):
    return example['id'].split('-')[0]

def count_categories(jsonl_path):
    counter = Counter()
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                item = json.loads(line)
                category = extract_category(item)
                counter[category] += 1
    return counter

if __name__ == "__main__":
    jsonl_path1 = "/root/IC_MLLM_VQA/QwenScience/ArxivQA/split_result/test_zeroshot.jsonl"  # 替换成你的文件路径
    counts1 = count_categories(jsonl_path1)
    print("每个zero shot:\n")
    for cat, count in counts1.items():
        print(f"{cat:15s}: {count}\n")
    
    jsonl_path2 = '/root/IC_MLLM_VQA/QwenScience/ArxivQA/split_result/train.jsonl'
    counts2 = count_categories(jsonl_path2)
    print("每个train:\n")
    for cat, count in counts2.items():
        print(f"{cat:15s}: {count}")
        
    jsonl_path3 = '/root/IC_MLLM_VQA/QwenScience/ArxivQA/split_result/val.jsonl'
    counts3 = count_categories(jsonl_path3)
    print("每个val:\n")
    for cat, count in counts3.items():
        print(f"{cat:15s}: {count}")
        
        

每个zero shot:

physics        : 200

每个train:

physics        : 800
cs             : 800
每个val:

physics        : 100
cs             : 100


In [5]:
with open("/root/IC_MLLM_VQA/QwenScience/ArxivQA/split_result/test_zeroshot.jsonl", 'r') as fr:
  arxiv_qa = [ json.loads(line.strip()) for line in fr]
len(arxiv_qa)

200

In [7]:
with open("/root/IC_MLLM_VQA/QwenScience/ArxivQA/split_result/test_zeroshot.jsonl", 'r') as fr:
  arxiv_qa = [ json.loads(line.strip()) for line in fr]
  
test_dataset = []
for i in range(len(arxiv_qa)):
    sample = arxiv_qa[i]
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        test_dataset.append({
            'id':sample['id'],
            "image": sample['image'], 
            "question": sample["question"],
            "choices": sample["options"],
            "answer": sample["label"],
            "solution": sample["rationale"]
        })
    except Exception as e:
        print(f"跳过第 {i} 个样本，错误：{e}")
        continue

In [11]:
test_dataset[0]

{'id': 'physics-13608',
 'image': 'images/1304.6375_1.jpg',
 'question': 'Based on the trends observed in the p=0.05 and p=0.10 graphs, what can be inferred about the relationship between the parameter m and the properties ρ_S, ρ_M, and ρ_N at higher interaction strengths (I=1.5)?',
 'choices': ['A) ρ_S increases with m, while ρ_M and ρ_N decrease.',
  'B) ρ_S and ρ_M decrease with m, while ρ_N increases.',
  'C) ρ_S, ρ_M, and ρ_N all increase with m.',
  'D) ρ_S, ρ_M, and ρ_N all decrease with m.'],
 'answer': 'D',
 'solution': 'At I=1.5, for both values of p, as m increases, there is a general downward trend in all three properties ρ_S, ρ_M, and ρ_N, indicating that they all decrease with an increase in m.'}

In [1]:

import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import json
import os
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm
2025-04-18 23:00:44.699319: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-18 23:00:44.742687: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-18 23:00:44.742724: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-18 23:00:44.743889: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-18 23:00:44.7

In [2]:

# 设置设备：如果有 GPU 就用 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 加载模型到 GPU
model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    device_map={"": device},      # 明确放到 cuda 或 cpu
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32  # GPU 使用半精度更快
)

# 加载处理器
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.17s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
with open("/root/IC_MLLM_VQA/QwenScience/ArxivQA/split_result/test_zeroshot.jsonl", 'r') as fr:
  arxiv_qa = [ json.loads(line.strip()) for line in fr]
  
test_dataset = []
for i in range(len(arxiv_qa)):
    sample = arxiv_qa[i]
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        test_dataset.append({
            'id':sample['id'],
            "image": sample['image'], 
            "question": sample["question"],
            "choices": sample["options"],
            "answer": sample["label"],
            "solution": sample["rationale"]
        })
    except Exception as e:
        print(f"跳过第 {i} 个样本，错误：{e}")
        continue

In [4]:
def build_message(sample):
    content = []
    if sample['image'] is not None:
        content.append({"type": "image", "image": "/root/IC_MLLM_VQA/QwenScience/ArxivQA/"+sample['image']})
    
    question_text = f"Question: {sample['question']}\nChoices:\n"
    for idx, choice in enumerate(sample['choices']):
        question_text += f"{chr(65 + idx)}. {choice}\n"
    
    if sample.get("hint"):
        question_text += f"\nHint: {sample['hint']}\n"
    
    question_text += "Read the image carfelly and please select the correct answer and explain why."
    content.append({"type": "text", "text": question_text})
    
    return [{"role": "user", "content": content}]

In [5]:
from torchvision import transforms
for sample in test_dataset:
    messages = build_message(sample)
    image_path = '/root/IC_MLLM_VQA/QwenScience/ArxivQA/'+sample["image"]
    image_inputs = Image.open(image_path).convert("RGB")
    break

In [10]:
import re
import pandas as pd
from tqdm import tqdm
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

# 映射选项
answer_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

# 提取选项字母的函数（支持 C、c、C.、C) 等）
def extract_choice_letter(ans):
    if not isinstance(ans, str):
        return None
    ans = ans.strip()
    match = re.match(r"([A-Da-d])[\.\)]?", ans)
    return match.group(1).upper() if match else None

# 构建模型输入消息
def build_message(sample):
    content = []
    if sample['image'] is not None:
        content.append({"type": "image", "image": sample['image']})
    question_text = f"Question: {sample['question']}\nChoices:\n"
    for idx, choice in enumerate(sample['choices']):
        question_text += f"{chr(65 + idx)}. {choice}\n"
    question_text += "Please select the correct answer and explain why."
    content.append({"type": "text", "text": question_text})
    return [{"role": "user", "content": content}]

# 解析模型输出（返回编号+解释）
def parse_output(output):
    output = output.strip()
    answer_match = re.search(r"\b([A-Da-d])[\.\:\)]?", output)
    answer_letter = answer_match.group(1).upper() if answer_match else None
    answer = answer_mapping.get(answer_letter, -1)
    explanation = ""
    if answer != -1:
        idx = output.find(answer_match.group(0))
        if idx != -1:
            explanation = output[idx + len(answer_match.group(0)):].strip()
    return answer, explanation

# 初始化评估工具
rouge = Rouge()
smoothie = SmoothingFunction().method1
vectorizer = CountVectorizer(stop_words="english").fit([s["solution"] for s in test_dataset])
keywords = set(vectorizer.get_feature_names_out())

# 记录结果
all_records = []

for idx, sample in enumerate(tqdm(test_dataset, desc="Evaluating", ncols=100, leave=False)):
    if idx >= 5:  # 如果你想跑所有样本，删掉这一行
        break

    # 构建输入
    messages = build_message(sample)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_path = '/root/IC_MLLM_VQA/QwenScience/ArxivQA/' + sample["image"]
    image_inputs = Image.open(image_path).convert("RGB")

    inputs = processor(text=[text], images=image_inputs, return_tensors="pt", padding=True).to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    output = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)[0]

    # 解析模型输出
    pred_answer_id, pred_explanation = parse_output(output)

    # 计算 BLEU
    if pred_explanation:
        bleu = sentence_bleu([sample["solution"].split()], pred_explanation.split(), smoothing_function=smoothie)
    else:
        bleu = 0.0

    # 计算 ROUGE-L
    try:
        rouge_score = rouge.get_scores(pred_explanation, sample["solution"])[0]["rouge-l"]["f"]
    except:
        rouge_score = 0.0

    # 计算关键词重叠
    gt_tokens = set(sample["solution"].lower().split())
    pred_tokens = set(pred_explanation.lower().split())
    overlap = len(gt_tokens & pred_tokens & keywords)
    keyword_score = overlap / max(len(gt_tokens & keywords), 1)

    # 提取真实答案
    true_letter = extract_choice_letter(sample["answer"])
    true_answer_id = answer_mapping.get(true_letter, -1)

    # 存入结果
    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": sample["answer"],
        "True Answer ID": true_answer_id,
        "Predicted Answer ID": pred_answer_id,
        "Correct": int(true_answer_id == pred_answer_id),
        "Model Output": output,
        "Model Explanation": pred_explanation,
        "Reference Solution": sample["solution"],
        "BLEU": bleu,
        "ROUGE-L": rouge_score,
        "Keyword Overlap": keyword_score
    })

# 构建结果表格
results_df = pd.DataFrame(all_records)
results_df.to_csv("model_evaluation_results.csv", index=False, encoding="utf-8-sig")

# 汇总指标（基于编号比较准确率）
acc = accuracy_score(results_df["True Answer ID"], results_df["Predicted Answer ID"])
f1 = f1_score(results_df["True Answer ID"], results_df["Predicted Answer ID"], average='macro')
avg_bleu = results_df["BLEU"].mean()
avg_rouge = results_df["ROUGE-L"].mean()
avg_keyword_overlap = results_df["Keyword Overlap"].mean()

# 打印汇总结果
print("\n📊 Summary Metrics:")
print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ F1 Score: {f1:.4f}")
print(f"✅ Avg BLEU: {avg_bleu:.4f}")
print(f"✅ Avg ROUGE-L: {avg_rouge:.4f}")
print(f"✅ Avg Keyword Overlap: {avg_keyword_overlap:.4f}")
print("\n📁 结果已保存为 model_evaluation_results.csv（UTF-8 with BOM）")



Evaluating:   0%|                                                           | 0/200 [00:00<?, ?it/s]

                                                                                                    


📊 Summary Metrics:
✅ Accuracy: 0.4000
✅ F1 Score: 0.1667
✅ Avg BLEU: 0.1129
✅ Avg ROUGE-L: 0.3291
✅ Avg Keyword Overlap: 0.4450

📁 结果已保存为 model_evaluation_results.csv（UTF-8 with BOM）


