encoding images and resizing

In [100]:
from PIL import Image


"""def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")"""


from PIL import Image
import base64
from io import BytesIO


def encode_image_to_base64(image_path):
    global first_flag
    # Open and preprocess the image
    image = Image.open(image_path).convert("RGB")
    w, h = image.size
    crop_margin_w = w // 15
    crop_margin_h = h // 15

    # Crop 1/20 from each side
    image = image.crop((
        crop_margin_w,                  # left
        crop_margin_h,                  # top
        w - crop_margin_w,              # right
        h - crop_margin_h               # bottom
    ))

    # Resize to 1/2 of original dimensions
    new_size = (w // 8, h // 8)
    image = image.resize(new_size, Image.LANCZOS)
    
    if first_flag:
        print(f"new image size: {new_size}")
        first_flag = False
        
    # Convert to base64
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


Step 1: Analyze Normal Images

In [101]:
import openai
from PIL import Image
from IPython.display import display
import base64

# Set your OpenAI API key
client = openai.OpenAI(api_key="..................................") # Replace it with yours please!

def describe_image_gpt(image_path):
    image_analysis_prompt = (
        f"This is a normal {sub_class}. Analyze the image and describe the normal {sub_class} in detail, "
        "including type, color, size (length, width), material, composition, quantity, "
        "relative location."
    )

    base64_image = encode_image_to_base64(image_path)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": image_analysis_prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        max_completion_tokens=500
    )
    return response.choices[0].message.content.strip()




In [102]:
# Run step 1
def step1(normal_image_paths):
  normal_descriptions = []
  for path in normal_image_paths:
      display(Image.open(path))
      desc = describe_image_gpt(path)
      print(desc, "\n" + "="*80)
      normal_descriptions.append(desc)
  return normal_descriptions

Step 2: Summarizing the Normal Image Context

In [103]:
def step2(normal_descriptions):
  # Build the prompt using collected descriptions
  summary_prompt = (
      f"[ Normal {sub_class} Description 1 ]\n{normal_descriptions[0]}\n"
      f"[ Normal {sub_class} Description 2 ]\n{normal_descriptions[1]}\n"
      f"[ Normal {sub_class} Description 3 ]\n{normal_descriptions[2]}\n"
      "Combine the three descriptions into one by extracting only the \"common\" features.\n"
      "Create a concise summary that reflects the shared characteristics while removing any "
      "redundant or unique details."
  )

  # Send to GPT-4.1 for summarization
  response = client.chat.completions.create(
      model=model,
      messages=[{"role": "user", "content": summary_prompt}],
      max_completion_tokens=500
  )

  summary_description = response.choices[0].message.content.strip()

  # Output summary
  print(f"Summary of Normal {sub_class}\n")
  print(summary_description)
  return summary_description

Step 3: Generating Main Questions

In [104]:
import re

def step3(summary_description):
  # Create question generation prompt from the summary
  question_prompt = (
      f"[ Description of {sub_class} ]\n{summary_description}\n\n"
      f"Using the [ Description of {sub_class} ], create several\n"
      f"but essential, simple and important questions to determine whether the {sub_class} in the image is\n"
      "normal or abnormal. Ensure the questions are only based on visible characteristics, excluding\n"
      "any aspects that cannot be determined from the image. Also, simplify any difficult terms into\n"
      "easy-to-understand questions.\n"
      "(Q1) : ..."
  )

  # Send to GPT-4.1 to generate main questions
  response = client.chat.completions.create(
      model=model,
      messages=[{"role": "user", "content": question_prompt}],
      max_completion_tokens=500
  )

  # Extract questions
  main_question_block = response.choices[0].message.content.strip()
  print("Main Questions\n")
  print(main_question_block)

  print()

  # split them into a list
  main_questions = re.findall(r'\(Q\d+\)\s*:\s*(.*)', main_question_block)
  print(main_questions)
  return main_questions

In [105]:
# Ask GPT-4.1 with question and image
def evaluate_image_question(image_path, question):
    base64_image = encode_image_to_base64(image_path)
    prompt = (
        f"Question : {question}\n"
        f"At first, describe the {sub_class} and then answer the question. \n"
        "Your response must end with ‘- Result: Yes‘ or ‘- Result: No‘.\n"
        "Let’s think step by step."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ],
        max_completion_tokens=500
    )
    return response.choices[0].message.content.strip()

Question Filtering:

In [106]:
def filter_questions(main_questions, normal_image_dir, max_images=10, rejection_threshold=0.2):
    # Load image paths
    image_paths = [os.path.join(normal_image_dir, f) for f in os.listdir(normal_image_dir)
                  if f.lower().endswith((".jpg", ".jpeg", ".png"))]
    image_paths = image_paths[:max_images]

    # Evaluate each question and track rejection rate
    filtered_questions = []

    for question in main_questions:
        no_count = 0

        for image_path in image_paths:
            response_text = evaluate_image_question(image_path, question)
            last_line = response_text.splitlines()[-1].lower()
            if "result: no" in last_line:
                no_count += 1
            elif "result: yes" not in last_line:
                print("Wrong answer, check model response: " + response_text.strip())
                exit()

        rejection_ratio = no_count / len(image_paths)
        print(f"Question: {question[:60]}... → {rejection_ratio*100:.1f}% No")

        if rejection_ratio <= rejection_threshold:
            filtered_questions.append(question)


    print("\n✔️ Final Kept Questions:")
    for q in filtered_questions:
        print("- ", q)
    return filtered_questions

Step 4: Generating 5 Variations of Each Main Question

In [107]:
def step4(main_questions):
    # Store all variations
    question_variations = {}

    # Loop through each main question and generate 5 variations
    for i, q in enumerate(main_questions):
        variation_prompt = (
            f"Generate five variations of the following question while keeping the semantic meaning.\n"
            f"Input : {q}\n"
            "Output1:\nOutput2:\nOutput3:\nOutput4:\nOutput5:"
        )

        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": variation_prompt}],
            max_completion_tokens=500
        )

        # Parse and clean the outputs
        outputs = re.findall(r'Output\d+\s*:\s*(.*)', response.choices[0].message.content.strip())
        question_variations[q] = outputs

    # Show result
    for main_q, variations in question_variations.items():
        print(f"\n🔸 Original: {main_q}")
        for i, v in enumerate(variations, 1):
            print(f"  Var {i}: {v}")
    return question_variations

Step 5: Evaluating Test Images with the Questions

In [114]:
import glob
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.utils import shuffle

def step5(test_image_dir, question_variations, ground_truth, id, where_left=0, checkpoint_every=5):
    global results, preds, gts, scores

    good_paths = glob.glob(os.path.join(test_image_dir, "good", "*.[jp][pn]g"))
    anomaly_paths = glob.glob(os.path.join(test_image_dir, "logical_anomalies", "*.[jp][pn]g"))

    test_image_paths = good_paths + anomaly_paths
    test_image_paths = shuffle(test_image_paths, random_state=42)

    # Resume from checkpointed state or start fresh
    if where_left == 0:
        results = {}
        preds = []
        gts = []
        scores = []
    else:
        try:
            state = np.load(f"logicqa_state_{id}.npz")
            preds = state["preds"].tolist()
            gts = state["gts"].tolist()
            scores = state["scores"].tolist()
        except Exception as e:
            print("⚠️ Could not load preds/gts/scores:", e)
            preds, gts, scores = [], [], []

        try:
            with open(f"logicqa_results_{id}.json", "r") as f:
                results = json.load(f)
        except:
            results = {}

    # Slice image list to resume from where_left
    test_image_paths = test_image_paths[where_left:]

    for i, image_path in enumerate(test_image_paths, start=where_left):
        print(f"    🔄 Eval Progress: {i}/{where_left + len(test_image_paths)} - {os.path.basename(image_path)}", end="\r", flush=True)
        results[image_path] = {}

        vote_results = []

        for main_q, variations in question_variations.items():
            all_qs = [main_q] + variations
            no_votes = 0
            yes_votes = 0
            responses = []

            for q in all_qs:
                answer = evaluate_image_question(image_path, q)
                last_line = answer.strip().splitlines()[-1].lower()
                is_no = "result: no" in last_line
                no_votes += is_no
                yes_votes += not is_no
                responses.append({"question": q, "response": answer})

            majority_is_no = no_votes > yes_votes
            vote_results.append(majority_is_no)

            results[image_path][main_q] = {
                "responses": responses,
                "majority_vote": "No" if majority_is_no else "Yes"
            }

        final_pred = int(any(vote_results))
        preds.append(final_pred)
        gts.append(ground_truth.get(image_path, 0))

        per_question_scores = [
            sum("result: no" in r["response"].lower() for r in results[image_path][q]["responses"]) / len(results[image_path][q]["responses"])
            for q in results[image_path] if results[image_path][q]["responses"]
        ]
        image_score = max(per_question_scores) if per_question_scores else 0
        scores.append(image_score)

        # ✅ Save checkpoint every N images
        if (i + 1) % checkpoint_every == 0 or (i + 1 == where_left + len(test_image_paths)):
            print(f"\n💾 Saving checkpoint at image {i+1}")
            with open(f"logicqa_results_{id}.json", "w") as f:
                json.dump(results, f, indent=2)
            np.savez(f"logicqa_state_{id}.npz", preds=np.array(preds), gts=np.array(gts), scores=np.array(scores))

    # Final save
    with open(f"logicqa_results_{id}.json", "w") as f:
        json.dump(results, f, indent=2)
    np.savez(f"logicqa_state_{id}.npz", preds=np.array(preds), gts=np.array(gts), scores=np.array(scores))

    # Metrics
    acc = accuracy_score(gts, preds)
    try:
        auroc = roc_auc_score(gts, scores)
    except:
        auroc = "AUROC: undefined (only one class present)"
    f1s = [f1_score(gts, [1 if s >= t else 0 for s in scores]) for t in np.linspace(0, 1, 100)]
    f1_max = max(f1s)

    print("\n🔍 Evaluation Metrics:")
    print(f"✔️ Accuracy: {acc:.3f}")
    print(f"✔️ AUROC: {auroc if isinstance(auroc, str) else round(auroc, 3)}")
    print(f"✔️ F1-Max: {f1_max:.3f}")
    return acc, auroc, f1_max


A function to include all steps together and extracting image features

In [115]:
import os
import random
import numpy as np
from sklearn.cluster import KMeans
from PIL import Image
import torchvision.models as models
import torchvision.transforms as transforms
import torch

def extract_image_features(image_paths):
    """Extract features using a pretrained model (ResNet18)"""
    model = models.resnet18(pretrained=True)
    model.fc = torch.nn.Identity()  # remove classification head
    model.eval()

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    features = []
    with torch.no_grad():
        for path in image_paths:
            img = Image.open(path).convert("RGB")
            x = transform(img).unsqueeze(0)  # add batch dim
            feat = model(x).squeeze(0).numpy()
            features.append(feat)

    return np.array(features)

def all_steps(normal_image_dir, test_image_dir, few_shot_mode, id):
    global normal_descriptions, summary_description, main_questions, main_questions_filtered, question_variations
    
    all_normal_paths = [
        os.path.join(normal_image_dir, f)
        for f in os.listdir(normal_image_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    if "random" in few_shot_mode:
        three_normal_paths = random.sample(all_normal_paths, 3)

    elif "clustering_based" in few_shot_mode:
        features = extract_image_features(all_normal_paths)
        kmeans = KMeans(n_clusters=3, random_state=0).fit(features)
        centers = kmeans.cluster_centers_
        labels = kmeans.labels_

        three_normal_paths = []
        for k in range(3):
            cluster_indices = np.where(labels == k)[0]
            cluster_feats = features[cluster_indices]
            dists = np.linalg.norm(cluster_feats - centers[k], axis=1)
            closest_idx = cluster_indices[np.argmin(dists)]
            three_normal_paths.append(all_normal_paths[closest_idx])

    else:
        raise ValueError(f"Unsupported few_shot_mode: {few_shot_mode}")
    


    ground_truth = {}
    for path in test_image_dir:
        label = 0 if "good" in path else 1
        ground_truth[path] = label

    # Step-by-step LogicQA pipeline
    normal_descriptions = step1(three_normal_paths)
    summary_description = step2(normal_descriptions)
    main_questions = step3(summary_description)
    main_questions_filtered = filter_questions(main_questions, normal_image_dir, max_images=100, rejection_threshold=0.2)
    question_variations = step4(main_questions_filtered)
    acc, auroc, f1_max = step5(test_image_dir, question_variations, ground_truth, id)
    
    return acc, auroc, f1_max

In [None]:

import numpy as np
from scipy.stats import ttest_rel

sub_class = "breakfast_box"
normal_image_dir = f"C:\\Users\\sevza\\Desktop\\NLP\proj\\mvtec_loco_anomaly_detection\\{sub_class}\\train\\good\\"
test_image_dir = f"C:\\Users\\sevza\\Desktop\\NLP\proj\\mvtec_loco_anomaly_detection\\{sub_class}\\test\\"
model = 'gpt-4.1'

# To collect metrics: each a list of 10 values
random_acc, random_auroc, random_f1 = [], [], []
cluster_acc, cluster_auroc, cluster_f1 = [], [], []

first_flag = True
id = 0
for i in range(3):
    id = id + 1
    print(f"\n🔁 Random Mode Run {i+1}")
    acc, auroc, f1 = all_steps(normal_image_dir, test_image_dir, few_shot_mode="random", id=id)
    random_acc.append(acc)
    random_auroc.append(auroc)
    random_f1.append(f1)

    id = id + 1
    print(f"\n🔁 Clustering-Based Mode Run {i+1}")
    acc, auroc, f1 = all_steps(normal_image_dir, test_image_dir, few_shot_mode="clustering_based", id=id)
    cluster_acc.append(acc)
    cluster_auroc.append(auroc)
    cluster_f1.append(f1)

# Convert to NumPy arrays
random_acc = np.array(random_acc)
cluster_acc = np.array(cluster_acc)
random_auroc = np.array(random_auroc)
cluster_auroc = np.array(cluster_auroc)
random_f1 = np.array(random_f1)
cluster_f1 = np.array(cluster_f1)

# Paired t-tests
def compare(metric_name, rand, clust):
    print(f"\n📊 {metric_name}")
    print(f"  - Random     Mean ± Std: {rand.mean():.3f} ± {rand.std():.3f}")
    print(f"  - Clustering Mean ± Std: {clust.mean():.3f} ± {clust.std():.3f}")
    t_stat, p_val = ttest_rel(clust, rand)
    print(f"  - Paired t-test: t = {t_stat:.3f}, p = {p_val:.4f} {'(significant)' if p_val < 0.05 else '(not significant)'}")

compare("Accuracy", random_acc, cluster_acc)
compare("AUROC", random_auroc, cluster_auroc)
compare("F1-max", random_f1, cluster_f1)


IMPORTANT NOTE: Ignore the following cells and the accuracies shown in its output. This cell is designed to handle unexpected connection errors that occurred during execution and to avoid re-running the entire program. Additionally, note that the accuracy values reported in this cell is incorrect due to an error in the ground truth labels. The correct metrics, including accuracy, are calculated based on the saved result files in the final two cells of this notebook.

In [122]:
# IGNORE THIS CELL: this cell is designed to handle a connection error and avoid re-running already executed parts.

i = 1
print(f"\n🔁 Clustering-Based Mode Run {i+1}")
question_variations = step4(main_questions_filtered)
acc, auroc, f1 = step5(test_image_dir, question_variations, ground_truth, id)
    
#acc, auroc, f1 = all_steps(normal_image_dir, test_image_dir, few_shot_mode="clustering_based", id=id)
cluster_acc.append(acc)
cluster_auroc.append(auroc)
cluster_f1.append(f1)

# Convert to NumPy arrays
random_acc = np.array(random_acc)
cluster_acc = np.array(cluster_acc)
random_auroc = np.array(random_auroc)
cluster_auroc = np.array(cluster_auroc)
random_f1 = np.array(random_f1)
cluster_f1 = np.array(cluster_f1)

# Paired t-tests
def compare(metric_name, rand, clust):
    print(f"\n📊 {metric_name}")
    print(f"  - Random     Mean ± Std: {rand.mean():.3f} ± {rand.std():.3f}")
    print(f"  - Clustering Mean ± Std: {clust.mean():.3f} ± {clust.std():.3f}")
    t_stat, p_val = ttest_rel(clust, rand)
    print(f"  - Paired t-test: t = {t_stat:.3f}, p = {p_val:.4f} {'(significant)' if p_val < 0.05 else '(not significant)'}")

compare("Accuracy", random_acc, cluster_acc)
compare("AUROC", random_auroc, cluster_auroc)
compare("F1-max", random_f1, cluster_f1)



🔁 Clustering-Based Mode Run 2

🔸 Original: Is the box made of light-colored, disposable material with two main compartments (a smaller one on the left and a larger one on the right)?
  Var 1: Is the box constructed from a light-toned, disposable material and does it have two primary compartments, with a smaller compartment on the left and a larger one on the right?
  Var 2: Does the box consist of a light-colored, throwaway material and feature two main sections, with the left side being smaller and the right side larger?
  Var 3: Is the box composed of disposable, light-colored material and split into two main compartments, the smaller on the left and the larger on the right?
  Var 4: Is the box made out of a light-colored disposable material with two main sections, with a smaller one to the left and a bigger one to the right?
  Var 5: Does the box use a light-colored, single-use material and have two primary compartments—a smaller on the left and a larger on the right?

🔸 Original: 

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U41'), dtype('<U41')) -> None

In [140]:
import json
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score

def is_normal(filename):
    return "good" in filename.lower()

def majority_votes_to_final_label(votes):
    return int(any("no" in v.lower() for v in votes))

# Containers for statistics
random_acc, random_auroc, random_f1 = [], [], []
cluster_acc, cluster_auroc, cluster_f1 = [], [], []

for i in range(1, 5):
    file_path = f"logicqa_results_{i}.json"
    with open(file_path, "r") as f:
        results = json.load(f)

    preds = []
    gts = []
    scores = []  # anomaly scores = proportion of "no" answers

    for image_path, question_dict in results.items():
        majority_votes = [v["majority_vote"].lower() for v in question_dict.values()]

        pred = majority_votes_to_final_label(majority_votes)
        gt = 0 if is_normal(image_path) else 1
        score = sum("no" in v for v in majority_votes) / len(majority_votes)

        preds.append(pred)
        gts.append(gt)
        scores.append(score)

    acc = sum(p == g for p, g in zip(preds, gts)) / len(preds) if preds else 0

    try:
        auroc = roc_auc_score(gts, scores)
    except:
        auroc = np.nan  # If AUROC is not computable

    f1s = [f1_score(gts, [1 if s >= t else 0 for s in scores]) for t in np.linspace(0, 1, 100)]
    f1_max = max(f1s)

    # Store in appropriate list
    if i in [1, 3]:  # Random runs
        random_acc.append(acc)
        random_auroc.append(auroc)
        random_f1.append(f1_max)
        run_type = "Random"
    else:            # Clustering-based runs
        cluster_acc.append(acc)
        cluster_auroc.append(auroc)
        cluster_f1.append(f1_max)
        run_type = "Clustering-based"

    print(f"\n📊 {run_type} Run {i} — logicqa_results_{i}.json")
    print(f"   ✔️ Accuracy: {acc:.3f}")
    print(f"   📈 AUROC: {auroc if not np.isnan(auroc) else 'undefined'}")
    print(f"   ⭐ F1-max: {f1_max:.3f}")

# Convert to NumPy arrays for stats
random_acc = np.array(random_acc)
random_auroc = np.array(random_auroc)
random_f1 = np.array(random_f1)

cluster_acc = np.array(cluster_acc)
cluster_auroc = np.array(cluster_auroc)
cluster_f1 = np.array(cluster_f1)

# Print Mean and Std
print("\n📈 Aggregated Results:")

print("\n🌀 Random Mode:")
print(f"   ✔️ Accuracy: {random_acc.mean():.3f} ± {random_acc.std():.3f}")
print(f"   📈 AUROC: {np.nanmean(random_auroc):.3f} ± {np.nanstd(random_auroc):.3f}")
print(f"   ⭐ F1-max: {random_f1.mean():.3f} ± {random_f1.std():.3f}")

print("\n🌟 Clustering-Based Mode:")
print(f"   ✔️ Accuracy: {cluster_acc.mean():.3f} ± {cluster_acc.std():.3f}")
print(f"   📈 AUROC: {np.nanmean(cluster_auroc):.3f} ± {np.nanstd(cluster_auroc):.3f}")
print(f"   ⭐ F1-max: {cluster_f1.mean():.3f} ± {cluster_f1.std():.3f}")



📊 Random Run 1 — logicqa_results_1.json
   ✔️ Accuracy: 0.870
   📈 AUROC: 0.8907999999999999
   ⭐ F1-max: 0.874

📊 Clustering-based Run 2 — logicqa_results_2.json
   ✔️ Accuracy: 0.830
   📈 AUROC: 0.8300000000000001
   ⭐ F1-max: 0.795

📊 Random Run 3 — logicqa_results_3.json
   ✔️ Accuracy: 0.540
   📈 AUROC: 0.7836000000000001
   ⭐ F1-max: 0.739

📊 Clustering-based Run 4 — logicqa_results_4.json
   ✔️ Accuracy: 0.890
   📈 AUROC: 0.904
   ⭐ F1-max: 0.893

📈 Aggregated Results:

🌀 Random Mode:
   ✔️ Accuracy: 0.705 ± 0.165
   📈 AUROC: 0.837 ± 0.054
   ⭐ F1-max: 0.806 ± 0.067

🌟 Clustering-Based Mode:
   ✔️ Accuracy: 0.860 ± 0.030
   📈 AUROC: 0.867 ± 0.037
   ⭐ F1-max: 0.844 ± 0.049


In [143]:
# Load the last result file (clustering-based run 2)
with open("logicqa_results_4.json", "r") as f:
    results = json.load(f)

def get_misclassified_samples(results, find_normal=True):
    for image_path, question_dict in results.items():
        majority_votes = [v["majority_vote"].lower() for v in question_dict.values()]
        pred = majority_votes_to_final_label(majority_votes)
        gt = 0 if is_normal(image_path) else 1

        if (find_normal and is_normal(image_path) and pred != gt) or (not find_normal and not is_normal(image_path) and pred != gt):
            return image_path, gt, pred, question_dict
    return None, None, None, {}

# Find one false positive (normal image → wrongly predicted as abnormal)
fp_path, fp_gt, fp_pred, fp_question_dict = get_misclassified_samples(results, find_normal=True)

# Find one false negative (abnormal image → wrongly predicted as normal)
fn_path, fn_gt, fn_pred, fn_question_dict = get_misclassified_samples(results, find_normal=False)

print("\n🔍 Example False Positive (normal → predicted abnormal):")
if fp_path:
    print(f"   📷 Image: {fp_path}")
    print("   ❌ Questions answered 'No':")
    for q, v in fp_question_dict.items():
        if "no" in v["majority_vote"].lower():
            print(f"     - {q}")
else:
    print("   ✅ No false positives found.")

print("\n🔍 Example False Negative (abnormal → predicted normal):")
if fn_path:
    print(f"   📷 Image: {fn_path}")
    print("   📋 All Questions and Answers:")
    for q, v in fn_question_dict.items():
        print(f"     - Q: {q}")
        """for idx, resp in enumerate(v["responses"]):
            print(f"         Variant {idx}: {resp['response'].strip()}")"""
else:
    print("   ✅ No false negatives found.")



🔍 Example False Positive (normal → predicted abnormal):
   📷 Image: C:\Users\sevza\Desktop\NLP\proj\mvtec_loco_anomaly_detection\breakfast_box\test\good\042.png
   ❌ Questions answered 'No':
     - Is the right compartment visually divided into two sections—one with a large pile of granola or oats (possibly with nuts/seeds), and the other with banana chips and whole almonds next to each other?

🔍 Example False Negative (abnormal → predicted normal):
   📷 Image: C:\Users\sevza\Desktop\NLP\proj\mvtec_loco_anomaly_detection\breakfast_box\test\logical_anomalies\046.png
   📋 All Questions and Answers:
     - Q: Is the box made of light-colored, disposable material with two main compartments (a smaller one on the left and a larger one on the right)?
     - Q: Are there 2 or 3 small oranges (or clementines/mandarins) and 1 whole red or red-yellow apple together in the left compartment?
     - Q: Is the right compartment visually divided into two sections—one with a large pile of granola or o