In [None]:
!pip uninstall -y salesforce-lavis
!pip install -U "transformers>=4.33" accelerate sentencepiece pillow
!pip install -q transformers accelerate torchvision
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

In [None]:
"""loading pre-train processor and model"""
# Hugging Face Hub checkpoint shared by the processor and the model.
MODEL_ID = "Salesforce/blip2-opt-2.7b"

# Device that input tensors are moved to; falls back to CPU when CUDA is absent.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# force_download=True asks from_pretrained to fetch the processor files again
# rather than reuse a cached copy.
processor = Blip2Processor.from_pretrained(MODEL_ID, force_download=True)

# Weights are loaded in half precision; device_map="auto" leaves their
# placement to the loader.
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
"""loading pre-train processor and model done"""

In [None]:
def analyze_illegal_parking(image_path):
    """Ask BLIP-2 whether the image at ``image_path`` shows illegal parking and print the answer.

    Uses the notebook-level ``processor``, ``model`` and ``device`` objects.
    Generation samples (``do_sample=True``), so repeated calls may print
    different text for the same image.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        None. The generated answer is printed.
    """
    # Open inside a context manager so the file handle is released as soon as
    # convert() has produced its own in-memory RGB copy.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    prompt = (
        "Describe whether the image shows illegal parking. Answer yes or no, and explain the reason in one sentence."
    )

    # Passing a dtype casts only the floating-point inputs (the pixel values),
    # so they match the dtype of the model weights; token ids stay integers.
    inputs = processor(images=image, text=[prompt], return_tensors="pt").to(device, model.dtype)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.9,
        eos_token_id=processor.tokenizer.eos_token_id,
        repetition_penalty=1.3,
    )
    output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    # The decoded text may start with the prompt itself; drop that echo.
    if output.startswith(prompt):
        output = output[len(prompt):].strip()

    print(output)

# Run the check on the sample image; the function prints its answer and returns nothing.
analyze_illegal_parking("./image.png")

In [None]:
import torch

# Load the sample image once as RGB. The context manager closes the underlying
# file as soon as convert() has produced its in-memory copy.
with Image.open("./image.png") as _img:
    image = _img.convert("RGB")
def vqa_illegal_check(model, processor, rawImg, device="cpu"):
    """Run a two-step visual query on ``rawImg``: a Yes/No verdict, then a short reason.

    Step 1 decodes greedily and is reduced to ``Yes`` or ``No`` (``No`` when
    neither word can be found). Step 2 samples, so the reason may differ
    between calls. The reason is returned lowercased because the clean-up of
    prompt fragments works on a lowercased copy.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        processor: Callable that builds model inputs from ``images=`` and
            ``text=`` and exposes ``decode``.
        rawImg: Image handed to the processor.
        device: Device the input tensors are moved to.

    Returns:
        A two-line string: ``Illegal: <verdict>`` followed by ``Reason: <text>``.
    """
    import re  # local import keeps this cell self-contained

    # ---- Step 1: Yes/No verdict only ----
    q1 = "In the first image, is any visible car parked illegally? Answer only Yes or No."
    inp1 = processor(images=rawImg, text=q1, return_tensors="pt")
    inp1 = {k: v.to(device) for k, v in inp1.items()}

    with torch.no_grad():
        out1 = model.generate(
            **inp1,
            max_new_tokens=3,
            do_sample=False
        )

    # Preferred path: look only at the tokens that follow the prompt.
    in_len1 = inp1["input_ids"].shape[1]
    ans1 = processor.decode(out1[0, in_len1:], skip_special_tokens=True).strip()

    first = ans1.split()[0].strip(",. ").capitalize() if ans1 else ""
    if first not in {"Yes", "No"}:
        # Fallback: scan the whole decoded output. The question itself contains
        # the words "Yes or No", so an echoed copy of it is removed first;
        # otherwise the verdict would always come out as "Yes". A whole-word
        # match also avoids hits inside words such as "eyes".
        full1 = processor.decode(out1[0], skip_special_tokens=True)
        answer_text = full1.replace(q1, " ")
        first = "Yes" if re.search(r"\byes\b", answer_text, flags=re.IGNORECASE) else "No"

    illegal = first

    # ---- Step 2: reason only ----
    q2 = ("Give one short reason based only on visible evidence "
          "(e.g., red lines, blocking crosswalk/driveway, no-parking signs). "
          "Answer format:\nReason: ")
    inp2 = processor(images=rawImg, text=q2, return_tensors="pt")
    inp2 = {k: v.to(device) for k, v in inp2.items()}

    with torch.no_grad():
        out2 = model.generate(
            **inp2,
            max_new_tokens=40,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

    # Keep whatever follows the first "Reason:" marker (the whole text when the
    # marker is absent).
    dec2_full = processor.decode(out2[0], skip_special_tokens=True)
    reason = dec2_full.split("Reason:", 1)[-1].strip()

    # Prompt fragments that should not survive in the reason text.
    junk = [
        "Briefly state the reason for your previous answer",
        "based only on visible evidence",
        "(e.g., red lines, blocking crosswalk, blocking driveway, no-parking signs)",
        "Answer format"
    ]
    low = reason.lower()
    for j in junk:
        low = low.replace(j.lower(), "")
    reason = low.strip(" :.-\"").strip()

    if not reason:
        # Readable fallback; the previous literal was a run of mis-encoded characters.
        reason = "No clear reason could be determined from the image."

    return f"Illegal: {illegal}\nReason: {reason}"

# ===== Usage =====
print(vqa_illegal_check(model, processor, image, device))


In [None]:
"""model visual question answering"""
# Image captioning: no text prompt is passed, only the picture.
# `image` is the RGB picture loaded at notebook level; `rawImg` exists only as
# a parameter name inside vqa_illegal_check and is undefined in this scope.
inputs_captioning = processor(image, return_tensors="pt").to(device, torch.float16)
generated_ids_captioning = model.generate(**inputs_captioning)
generated_text_captioning = processor.batch_decode(generated_ids_captioning, skip_special_tokens=True)[0].strip()
print("Image Captioning:", generated_text_captioning)

In [None]:
# Q1
question = "Question: How many cars are in the first sub-image? Answer:"
# `image` is the RGB picture loaded at notebook level; `rawImg` exists only as
# a parameter name inside vqa_illegal_check and is undefined in this scope.
inputs_vqa = processor(image, text=question, return_tensors="pt").to(device, torch.float16)
# Beam search without sampling; `temperature` is only used in sampling modes,
# so it is not passed.
generated_ids_vqa = model.generate(**inputs_vqa, max_new_tokens=15, num_beams=3, do_sample=False)
generated_text_vqa = processor.batch_decode(generated_ids_vqa, skip_special_tokens=True)[0].strip()
print("VQA (first sub-image):", generated_text_vqa)


In [None]:
# Q2
question2 = "Question: How many cars are in the second sub-image? Answer:"
# `image` is the RGB picture loaded at notebook level; `rawImg` exists only as
# a parameter name inside vqa_illegal_check and is undefined in this scope.
inputs_vqa2 = processor(image, text=question2, return_tensors="pt").to(device, torch.float16)
# Beam search without sampling; `temperature` is only used in sampling modes,
# so it is not passed.
generated_ids_vqa2 = model.generate(**inputs_vqa2, max_new_tokens=15, num_beams=3, do_sample=False)
generated_text_vqa2 = processor.batch_decode(generated_ids_vqa2, skip_special_tokens=True)[0].strip()
print("VQA (second sub-image):", generated_text_vqa2)