In [3]:
import os

# Every relative path used below (dataset file, video shots) resolves against
# this directory. NOTE(review): presumably this also makes the project-local
# packages importable from the notebook — confirm before moving it.
os.chdir("/home/scottc/links/scratch/causal_pool/")

import jsonlines
from pprint import pprint

# Only the first record is needed, so read a single line instead of
# materializing the whole file, and let the context manager close the
# underlying file handle (the bare `jsonlines.open` call never closed it).
with jsonlines.open("datasets/1k_simple/counterfactual_test.jsonl") as dataset:
    entry = dataset.read()

In [4]:
from typing import Tuple

def get_metrics(entry, pred) -> Tuple[int, int]:
    """
    Score a multiple-choice prediction against an entry's ground truth.

    Args:
        entry: Mapping whose "ground_truth" value is an iterable of zero-based
            option indices (0 for "A", 1 for "B", ...).
        pred: Prediction made only of distinct letters "A"-"Z", e.g. "AC".
            May be None (a model reply with no text content).

    Returns:
        (exactly correct or not, how many selected options were correct).
        A malformed prediction — None, any character outside ASCII "A"-"Z",
        or a repeated letter — scores (0, 0).
    """
    # A reply without text content cannot select any option.
    if pred is None:
        return 0, 0

    # Restrict to ASCII A-Z explicitly: str.isalpha()/isupper() also accept
    # non-ASCII capitals (e.g. "É"), which are not valid option letters.
    if any(not ("A" <= c <= "Z") for c in pred):
        return 0, 0

    selected_options = {ord(c) - ord("A") for c in pred}

    if len(selected_options) != len(pred):  # duplicate options
        return 0, 0

    ground_truth = set(entry["ground_truth"])

    exactly_correct = int(selected_options == ground_truth)
    num_correct = len(selected_options & ground_truth)

    return exactly_correct, num_correct

In [5]:
from sft.prompt_utils import build_question_prompt


def build_prompt(entry):
    """
    Build the chat `messages` list for one dataset entry.

    The result is a single user message whose content holds the entry's video
    (inlined as a base64 `data:` URL) followed by the question text. The
    question text is also printed as a side effect.
    """
    clip_path = f"datasets/1k_simple/shots/{entry['video']}/video.mp4"
    prompt_text = build_question_prompt(entry)

    # Echo the prompt before the video file is read and encoded.
    print(prompt_text)

    video_part = {
        "type": "video_url",
        "video_url": {"url": f"data:video/mp4;base64,{to_b64(clip_path)}"},
    }
    text_part = {"type": "text", "text": prompt_text}
    user_message = {"role": "user", "content": [video_part, text_part]}

    return [user_message]

import base64

def to_b64(video_path):
    """Read the file at `video_path` and return its bytes base64-encoded as a str."""
    with open(video_path, mode="rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [6]:
from openai import OpenAI

# Endpoint and model name used for the chat-completion request below.
BASE_URL = "http://trig0006:8000/v1"
MODEL = "OpenGVLab/InternVL3_5-4B"

# "EMPTY" is a placeholder key — presumably the server does not check it; confirm.
client = OpenAI(api_key="EMPTY", base_url=BASE_URL)

# One sampled completion for the single entry loaded earlier. The sampling
# options that are not regular keyword arguments here travel in `extra_body`.
response = client.chat.completions.create(
    model=MODEL,
    messages=build_prompt(entry),
    temperature=0.8,
    max_tokens=None,
    extra_body=dict(
        top_k=20,
        top_p=0.95,
        repetition_penalty=1.0,
        presence_penalty=0.0,
    ),
)

Answer the following question based on the video provided.
Pocket locations: red at (0, 0), green at (0.9906, 0), white at (0, 1.9812), and purple at (0.9906, 1.9812). If the initial velocity were changed from (dx=2.00, dy=-1.45) to (dx=1.10, dy=0.00) (assume all other variables are unchanged), what would happen?
A. The ball bounced off a wall
B. The second wall hit was orange-red-wall
C. The first wall hit was blue-purple-wall
D. The first wall hit was orange-red-wall
E. The first wall hit was grey-orange-wall
F. The third wall hit was blue-purple-wall

Please select the correct option(s). Don't write anything else than the option letter(s). Example: AC.


In [7]:
# Show the raw text of the first returned choice.
print(response.choices[0].message.content)

D. The first wall hit was orange-red-wall


In [8]:
# Show the expected answer; `get_metrics` compares it against zero-based
# letter indices (A -> 0, B -> 1, ...).
print(entry["ground_truth"])

[0, 2]


In [9]:
# NOTE: this import rebinds `get_metrics`, shadowing the notebook-local
# definition from the earlier cell. As the recorded output shows, this version
# raises InvalidPredictionError when the reply contains anything other than
# option letters, so this cell fails on the verbose reply above.
from eval.eval_utils import get_metrics

print(get_metrics(entry, response.choices[0].message.content))

InvalidPredictionError: Prediction contains non-A-Z characters: 'D. The first wall hit was orange-red-wall'