In [1]:
#安装依赖
!pip install -q transformers accelerate pillow torch

from google.colab import files
import os

#创建目录
!mkdir -p images

print("请上传你的7张图片...")
uploaded = files.upload()

#保存图片
for filename in uploaded.keys():
    with open(f'images/{filename}', 'wb') as f:
        f.write(uploaded[filename])
    print(f' {filename} 已上传')

print(f'\n共上传 {len(uploaded)} 张图片')

请上传你的7张图片...


Saving books.png to books.png
Saving bottle.png to bottle.png
Saving campus.png to campus.png
Saving classroom.png to classroom.png
Saving desk.png to desk.png
Saving notebook.png to notebook.png
Saving screen.png to screen.png
✓ books.png 已上传
✓ bottle.png 已上传
✓ campus.png 已上传
✓ classroom.png 已上传
✓ desk.png 已上传
✓ notebook.png 已上传
✓ screen.png 已上传

共上传 7 张图片


In [2]:
import json

# English test questions (for BLIP-VQA).
# The table below pairs each image file name with the questions asked about it;
# ids are assigned from the position in the table, starting at 1, and every
# image path is rooted at the images/ directory.
test_questions = [
    {"id": number, "image": f"images/{file_name}", "questions": prompts}
    for number, (file_name, prompts) in enumerate(
        [
            ("books.png", [
                "What items can you see in this image?",
                "How many books are on the shelf?",
                "What text is on the white book on the right?",
            ]),
            ("desk.png", [
                "What electronic devices are on the desk?",
                "How many game controllers can you see?",
                "What color is the game controller?",
            ]),
            ("bottle.png", [
                "What is this item?",
                "What brand name is printed on the bottle?",
                "What color is this water bottle?",
            ]),
            ("notebook.png", [
                "What is this item?",
                "What is the largest English word on the notebook cover?",
                "What color is the side of the notebook?",
            ]),
            ("screen.png", [
                "What type of content is displayed on the screen?",
                "What can you see in the document title?",
            ]),
            ("campus.png", [
                "What main elements can you see in this image?",
                "What type of place is this?",
                "How does the weather look?",
            ]),
            ("classroom.png", [
                "What scene is this? What main items are in the picture?",
                "How many students can you clearly see in the foreground?",
                "Where is the projection screen in the classroom?",
            ]),
        ],
        start=1,
    )
]

# Serialise the question set to test_questions.json (UTF-8, non-ASCII kept
# as-is, 2-space indent), then report how many images and questions it holds.
serialized_questions = json.dumps(test_questions, ensure_ascii=False, indent=2)
with open('test_questions.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_questions)

question_count = sum(map(len, (entry['questions'] for entry in test_questions)))
print(" 英文版测试问题已准备好")
print(f" 共 {len(test_questions)} 张图片，{question_count} 个问题")

 英文版测试问题已准备好
 共 7 张图片，20 个问题


In [3]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import json
from datetime import datetime
import time

print("=" * 60)
print("Loading BLIP-VQA Model (English VQA)")
print("=" * 60)

# Load the BLIP-VQA processor and model.
# Use the GPU when the runtime has one, otherwise fall back to the CPU so the
# cell does not crash on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

print(" Model loaded successfully!\n")

# Read the question set written by the previous cell and set up the
# accumulators used by the inference loop:
#   results - one record per answered (or failed) question
#   total   - number of questions across all images
#   current - running index of the question being processed
with open('test_questions.json', 'r', encoding='utf-8') as question_file:
    test_data = json.loads(question_file.read())

results = []
current = 0
total = sum(map(len, (entry['questions'] for entry in test_data)))

print(f"Starting test: {len(test_data)} images, {total} questions\n")

# Ask every question about every image and append exactly one record per
# question to `results`. Successful and failed records share the same keys
# (image_id, image_path, question, answer, inference_time, timestamp); a
# failure is marked by an answer starting with "ERROR" and inference_time 0.
for item in test_data:
    image_path = item['image']
    print(f"\n{'='*60}")
    print(f"Image {item['id']}/{len(test_data)}: {image_path}")
    print(f"{'='*60}")

    try:
        # The context manager closes the underlying file handle; convert()
        # returns a new, fully loaded image that remains usable afterwards.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
    except Exception as e:
        print(f"Cannot load image: {e}")
        # Record a failure for each question of this image so the progress
        # counter stays in step with `total` and the questions are not
        # silently missing from results.json.
        for question in item['questions']:
            current += 1
            results.append({
                'image_id': item['id'],
                'image_path': image_path,
                'question': question,
                'answer': f"ERROR: {str(e)}",
                'inference_time': 0,
                'timestamp': datetime.now().isoformat()
            })
        continue

    for question in item['questions']:
        current += 1
        print(f"\n[{current}/{total}] Q: {question}")

        try:
            # Preprocess inside the try block so one bad input is recorded as
            # an error instead of aborting the whole run. Tensors are moved to
            # whatever device the model actually lives on.
            inputs = processor(image, question, return_tensors="pt").to(model.device)

            # Generate the answer. perf_counter() is a monotonic clock meant
            # for measuring intervals; only generation is timed.
            start_time = time.perf_counter()
            with torch.no_grad():
                out = model.generate(**inputs, max_length=50, num_beams=5)
            inference_time = time.perf_counter() - start_time

            # Decode the answer
            answer = processor.decode(out[0], skip_special_tokens=True)

            print(f"          A: {answer}")
            print(f"           {inference_time:.2f}s")

            results.append({
                'image_id': item['id'],
                'image_path': image_path,
                'question': question,
                'answer': answer,
                'inference_time': inference_time,
                'timestamp': datetime.now().isoformat()
            })

            # Release cached GPU memory between questions
            torch.cuda.empty_cache()

        except Exception as e:
            print(f"           Error: {e}")
            results.append({
                'image_id': item['id'],
                'image_path': image_path,
                'question': question,
                'answer': f"ERROR: {str(e)}",
                'inference_time': 0,
                'timestamp': datetime.now().isoformat()
            })

divider = "=" * 60
print(f"\n{divider}")
print(" Experiment Complete!")
print(divider)

# Write all collected records to results.json (UTF-8, 2-space indent).
with open('results.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(results, ensure_ascii=False, indent=2))

# Summary: a record counts as successful unless its answer starts with "ERROR".
successful = []
for record in results:
    if record['answer'].startswith('ERROR'):
        continue
    successful.append(record)

print(f" Total questions: {len(results)}")
print(f" Successful: {len(successful)}")
if len(successful) > 0:
    # Mean inference time over the successful questions only.
    avg_time = sum(record['inference_time'] for record in successful) / len(successful)
    print(f" Average inference time: {avg_time:.2f}s")

print(" Results saved to results.json")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading BLIP-VQA Model (English VQA)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

 Model loaded successfully!

Starting test: 7 images, 20 questions


Image 1/7: images/books.png

[1/20] Q: What items can you see in this image?
          A: books
           1.99s

[2/20] Q: How many books are on the shelf?
          A: 13
           0.12s

[3/20] Q: What text is on the white book on the right?
          A: can ' t read it
           0.21s

Image 2/7: images/desk.png

[4/20] Q: What electronic devices are on the desk?
          A: laptop and mouse
           0.15s

[5/20] Q: How many game controllers can you see?
          A: one
           0.11s

[6/20] Q: What color is the game controller?
          A: white
           0.16s

Image 3/7: images/bottle.png

[7/20] Q: What is this item?
          A: bottle
           0.11s

[8/20] Q: What brand name is printed on the bottle?
          A: coca cola
           0.11s

[9/20] Q: What color is this water bottle?
          A: blue
           0.14s

Image 4/7: images/notebook.png

[10/20] Q: What is this item?
          A: s

In [4]:
import json
from google.colab import files

# Load the saved results
with open('results.json', 'r', encoding='utf-8') as results_file:
    results = json.loads(results_file.read())

header_rule = "=" * 60
print(header_rule)
print("实验结果预览")
print(header_rule)

# Group the records by image id, preserving their order within each image
from collections import defaultdict
by_image = defaultdict(list)
for record in results:
    by_image[record['image_id']].append(record)

# Print one section per image (ascending id) listing each question/answer pair.
# Image ids are unique keys, so sorting the items orders them by id alone.
for img_id, qa_records in sorted(by_image.items()):
    print(f"\n 图片 {img_id}: {qa_records[0]['image_path']}")
    print("-" * 60)
    for qa in qa_records:
        print(f"   Q: {qa['question']}")
        print(f"   A: {qa['answer']}")
        print()

# Statistics
print("\n" + "=" * 60)
print("统计信息")
print("=" * 60)
print(f"总问题数: {len(results)}")
# Failed questions are stored with an answer starting with "ERROR" and an
# inference_time of 0. Average over the successful answers only, so failures
# do not drag the mean down and the figure matches the one printed by the
# experiment cell. The guard also avoids a ZeroDivisionError when there is
# nothing to average.
timed_results = [r for r in results if not r['answer'].startswith('ERROR')]
if timed_results:
    avg_time = sum(r['inference_time'] for r in timed_results) / len(timed_results)
    print(f"平均推理时间: {avg_time:.2f}秒")

# Download the results file through the browser
print("\n正在下载结果文件...")
files.download('results.json')
print(" 下载完成！")

实验结果预览

 图片 1: images/books.png
------------------------------------------------------------
   Q: What items can you see in this image?
   A: books

   Q: How many books are on the shelf?
   A: 13

   Q: What text is on the white book on the right?
   A: can ' t read it


 图片 2: images/desk.png
------------------------------------------------------------
   Q: What electronic devices are on the desk?
   A: laptop and mouse

   Q: How many game controllers can you see?
   A: one

   Q: What color is the game controller?
   A: white


 图片 3: images/bottle.png
------------------------------------------------------------
   Q: What is this item?
   A: bottle

   Q: What brand name is printed on the bottle?
   A: coca cola

   Q: What color is this water bottle?
   A: blue


 图片 4: images/notebook.png
------------------------------------------------------------
   Q: What is this item?
   A: styrofoam container

   Q: What is the largest English word on the notebook cover?
   A: scrummy

 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 下载完成！
