In [1]:
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct subfolder
# of the repository (e.g. a notebooks/ directory) — confirm.
PROJECT_ROOT = Path().resolve().parents[0]

# Make the project package importable. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

In [2]:
# Third-party
from PIL import Image
import json 

# Project imports
from multimodal_fewshot_prompting.prompts import get_prompt_builder
from multimodal_fewshot_prompting.vision import image_to_base64
from multimodal_fewshot_prompting.llm.ollama import llm
from multimodal_fewshot_prompting.parsers import speedtest_parser

In [3]:
# Directory holding the validation images and their labels.json file.
VALIDATION_DIR = PROJECT_ROOT.joinpath("data", "validation")

In [None]:
def evaluate(strategy: str) -> list:
    """Run the LLM over every labelled validation image for one prompt strategy.

    Args:
        strategy: Name handed to ``get_prompt_builder`` to select the prompt
            builder (this notebook uses ``"zero-shot"`` and ``"few-shot"``).

    Returns:
        A list with one dict per entry of ``labels.json``, containing:
        ``"image"`` (the entry's image file name), ``"parsed"`` (True when
        ``speedtest_parser.parse`` did not raise), ``"prediction"`` (the
        parser's result, or None when parsing failed) and ``"expected"``
        (the complete label entry).
    """
    prompt_builder = get_prompt_builder(strategy)

    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding when decoding the label file.
    labels = json.loads(
        (VALIDATION_DIR / "labels.json").read_text(encoding="utf-8")
    )

    results = []

    for sample in labels:
        # Image.open leaves the underlying file open; the context manager
        # closes it as soon as the image has been encoded.
        with Image.open(VALIDATION_DIR / sample["image"]) as image:
            image_b64 = image_to_base64(image)

        messages = prompt_builder(image_b64)
        response = llm.invoke(messages)

        try:
            prediction = speedtest_parser.parse(response.content)
            success = True
        except Exception:
            # Deliberate best-effort: an unparseable response is recorded as a
            # failed sample instead of aborting the whole evaluation run.
            prediction = None
            success = False

        results.append({
            "image": sample["image"],
            "parsed": success,
            "prediction": prediction,
            "expected": sample,
        })

    return results


In [5]:
# Load the ground-truth labels for the validation images.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale encoding.
labels = json.loads(
    (VALIDATION_DIR / "labels.json").read_text(encoding="utf-8")
)

In [8]:
# Inline, step-by-step run of a single strategy.
# NOTE(review): this mirrors the body of evaluate(strategy) defined earlier in
# the notebook; consider replacing it with `results = evaluate(strategy)`.
strategy = "zero-shot"

prompt_builder = get_prompt_builder(strategy)

# JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
labels = json.loads(
    (VALIDATION_DIR / "labels.json").read_text(encoding="utf-8")
)

results = []

for sample in labels:
    # Image.open leaves the underlying file open; the context manager closes
    # it as soon as the image has been encoded.
    with Image.open(VALIDATION_DIR / sample["image"]) as image:
        image_b64 = image_to_base64(image)

    messages = prompt_builder(image_b64)
    response = llm.invoke(messages)

    try:
        prediction = speedtest_parser.parse(response.content)
        success = True
    except Exception:
        # Deliberate best-effort: an unparseable response is recorded as a
        # failed sample instead of stopping the loop.
        prediction = None
        success = False

    results.append({
        "image": sample["image"],
        "parsed": success,
        "prediction": prediction,
        "expected": sample,
    })


In [9]:
# Show the raw per-image records from the run above. "parsed" only says the
# response could be parsed; compare "prediction" with "expected" to judge
# whether the extracted values are actually correct.
results

[{'image': 'openspeedtest_validation_01.png',
  'parsed': True,
  'prediction': {'download': 644.05, 'upload': 51.8, 'ping': 80},
  'expected': {'image': 'openspeedtest_validation_01.png',
   'download': 644.05,
   'upload': 51.8,
   'ping': 44.0}},
 {'image': 'speedtest_validation_01.png',
  'parsed': True,
  'prediction': {'download': 495.73, 'upload': 198.07, 'ping': 3},
  'expected': {'image': 'speedtest_validation_01.png',
   'download': 495.73,
   'upload': 198.07,
   'ping': 3.0}}]

In [None]:
# Evaluate the whole validation set with the zero-shot prompt.
zero = evaluate(strategy="zero-shot")

In [None]:
# Evaluate the whole validation set with the few-shot prompt.
few = evaluate(strategy="few-shot")

In [None]:




def score(results):
    """Return the fraction of result records whose response could be parsed.

    Only the ``"parsed"`` flag of each record is counted; predictions are not
    compared with the expected values, so this is a parse-success rate rather
    than a measure of prediction correctness.

    Args:
        results: Sequence of dicts, each carrying a boolean ``"parsed"`` key.

    Returns:
        The share of records with a truthy ``"parsed"`` flag, or ``0.0`` when
        ``results`` is empty (instead of raising ZeroDivisionError).
    """
    total = len(results)
    if total == 0:
        return 0.0
    return sum(r["parsed"] for r in results) / total

# Report the score of each prompting strategy side by side.
print("Zero-shot accuracy:", score(results=zero))
print("Few-shot accuracy:", score(results=few))