In [1]:
TECHNIQUE_OPTIONS = [
    "Scanning electron microscopy (SEM)",
    "Transmission electron microscopy (TEM)",
    "Atomic force microscopy (AFM)",
    "Reflected light microscopy"
]

CATEGORY_OPTIONS = [
    "Metal or alloy",
    "Ceramic",
    "Polymer",
    "Composite",
    "Fracture"
]

In [2]:
def technique_mcq_prompt():
    return (
        "Look carefully at the microscopy image and identify the imaging technique.\n\n"
        "A. Scanning electron microscopy (SEM)\n"
        "B. Transmission electron microscopy (TEM)\n"
        "C. Atomic force microscopy (AFM)\n"
        "D. Reflected light microscopy\n\n"
        "Answer with only one letter."
    )

def category_mcq_prompt():
    return (
        "Which material category best describes the image?\n\n"
        "A. Metal or alloy\n"
        "B. Ceramic\n"
        "C. Polymer\n"
        "D. Composite\n"
        "E. Fracture\n\n"
        "Answer with only one letter."
    )

In [3]:
def parse_answer(text, valid):
    text = text.upper()
    for v in valid:
        if v in text:
            return v
    return None

def gt_to_letter(gt, options):
    try:
        return chr(ord("A") + options.index(gt))
    except:
        return None

In [4]:
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

def load_qwen_model(model_path, device="cuda"):
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto"
    )

    processor = AutoProcessor.from_pretrained(model_path)

    model.eval()
    return model, processor


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "/mnt/d/Subham/model/Qwen3-VL-8B-Instruct"

model, processor = load_qwen_model(MODEL_PATH, device)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
def run_qwen_vlm(model, processor, image, prompt):

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )

    inputs = inputs.to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False
        )

    # IMPORTANT: trim prompt tokens
    generated_ids_trimmed = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)
    ]

    output = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return output.strip()


In [7]:

import pandas as pd
from PIL import Image
from tqdm import tqdm
import time

CSV_PATH = "qa.csv"
df = pd.read_csv(CSV_PATH)

start_time = time.time()
y_true_tech = []
y_pred_tech = []

y_true_cat = []
y_pred_cat = []

for _, row in tqdm(df.iterrows(), total=len(df)):

    # --- Safe image loading ---
    try:
        image = Image.open(row["image_local_path"]).convert("RGB")
    except Exception:
        continue

    # -------- Technique MCQ --------
    tech_out = run_qwen_vlm(
        model,
        processor,
        image,
        technique_mcq_prompt()
    )

    tech_pred = parse_answer(tech_out, ["A", "B", "C", "D"])
    tech_gt   = gt_to_letter(row["technique"], TECHNIQUE_OPTIONS)

    if tech_pred is not None and tech_gt is not None:
        y_pred_tech.append(tech_pred)
        y_true_tech.append(tech_gt)

    # -------- Category MCQ --------
    cat_out = run_qwen_vlm(
        model,
        processor,
        image,
        category_mcq_prompt()
    )

    cat_pred = parse_answer(cat_out, ["A", "B", "C", "D", "E"])
    cat_gt   = gt_to_letter(row["categories"], CATEGORY_OPTIONS)

    if cat_pred is not None and cat_gt is not None:
        y_pred_cat.append(cat_pred)
        y_true_cat.append(cat_gt)
# Calculate total time
total_time = time.time() - start_time
print(f"\nTotal processing time: {total_time:.2f} seconds")
print(f"Average time per image: {total_time/len(df):.2f} seconds")

  0%|                                                                                | 0/51 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|███████████████████████████████████████████████████████████████████████| 51/51 [53:40<00:00, 63.15s/it]


Total processing time: 3220.61 seconds
Average time per image: 63.15 seconds





In [10]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix
)

def print_metrics(y_true, y_pred, name):
    acc = accuracy_score(y_true, y_pred)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred,
        average="macro",
        zero_division=0
    )

    print(f"\n=== {name} ===")
    print(f"Accuracy           : {acc:.3f}")
    print(f"Balanced Accuracy  : {bal_acc:.3f}")
    print(f"Macro Precision    : {prec:.3f}")
    print(f"Macro Recall       : {rec:.3f}")
    print(f"Macro F1-score     : {f1:.3f}")

In [11]:
print_metrics(y_true_tech, y_pred_tech, "Technique Classification")
print_metrics(y_true_cat,  y_pred_cat,  "Category Classification")


=== Technique Classification ===
Accuracy           : 0.913
Balanced Accuracy  : 0.862
Macro Precision    : 0.933
Macro Recall       : 0.862
Macro F1-score     : 0.888

=== Category Classification ===
Accuracy           : 0.378
Balanced Accuracy  : 0.382
Macro Precision    : 0.529
Macro Recall       : 0.305
Macro F1-score     : 0.329


