In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir('../')

In [2]:
# In[1]# In[1]:
import os
import sys
import copy
import torch
import argparse
from transformers import StoppingCriteria, StoppingCriteriaList
from math import ceil
from PIL import Image
import numpy as np
import decord

decord.bridge.set_bridge('torch')
from torchvision.transforms.functional import InterpolationMode
import json
import time
import datetime
from tqdm import tqdm
import random

random.seed(1234)

from utils.config import Config
from utils.easydict import EasyDict
from transformers import StoppingCriteria, StoppingCriteriaList
from decord import VideoReader, cpu
import torchvision.transforms as T
from dataset.video_transforms import (
    GroupNormalize, GroupScale, GroupCenterCrop, 
    Stack, ToTorchFormatTensor
)
from peft import get_peft_model, LoraConfig, TaskType
from io import BytesIO
from models import *

try:
    from petrel_client.client import Client
    has_client = True
    print("Client on!")
except:
    has_client = False
    print("Client off!")

if has_client:
    client = Client('~/petreloss.conf')
else:
    client = None


# In[2]:


def get_args():
    parser = argparse.ArgumentParser()
    #与测试任务无关
    parser.add_argument('--model_type', default="VideoChat2_it4_mistral_LinearProAda")
    parser.add_argument('--model_dir', default="./download/parameters")
    parser.add_argument('--model_pth', default="timesuite")
    parser.add_argument('--output_dir', default="Please input model output dir!")
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--infer_clip_frames', type=int, default=8)
    
    args = parser.parse_args(args=[])    
    return args


args = get_args()
args_list_str = '\n' + '\n'.join([f'{k:<25}: {v}' for k, v in vars(args).items()])
print(args_list_str)


# In[5]:

# config_file = "configs/config_mistral.json"
config_file = args.model_dir+"/config.json"

cfg = Config.from_file(config_file)
cfg.model.use_lora = False
cfg.model.pretrained_path=None
cfg.device="cuda"

print("vision_encoder.num_frames:", cfg.model.vision_encoder.num_frames)
# cfg.model.vision_encoder.num_frames = 4

model_cls = eval(args.model_type)
# model = VideoChat2_it_mistral(config=cfg.model)
model = model_cls(config=cfg.model)


# add lora to run stage3 model
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, 
    r=16, lora_alpha=32, lora_dropout=0.,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
         "gate_proj", "up_proj", "down_proj", "lm_head"
    ]
)
model.mistral_model = get_peft_model(model.mistral_model, peft_config)


state_dict = torch.load(args.model_dir+"/"+args.model_pth+".pth", "cpu")


if 'model' in state_dict.keys():
    msg = model.load_state_dict(state_dict['model'], strict=False)
else:
    msg = model.load_state_dict(state_dict, strict=False)
print(msg)

model = model.to(torch.device(cfg.device))
model = model.eval()

print('Model Initialization Finished')







def get_prompt(conv):
    ret = conv.system + conv.sep
    for role, message in conv.messages:
        if message:
            ret += role + " " + message + " " + conv.sep
        else:
            ret += role
    return ret


def get_prompt2(conv):
    ret = conv.system + conv.sep
    count = 0
    for role, message in conv.messages:
        count += 1
        if count == len(conv.messages):
            ret += role + " " + message
        else:
            if message:
                ret += role + " " + message + " " + conv.sep
            else:
                ret += role
    return ret


def get_context_emb(conv, model, img_list, answer_prompt=None, print_res=False):
    if answer_prompt:
        prompt = get_prompt2(conv)
    else:
        prompt = get_prompt(conv)
    if print_res:
        print("prompt:",prompt)
    if '<VideoHere>' in prompt:
        prompt_segs = prompt.split('<VideoHere>')
    else:
        prompt_segs = prompt.split('<ImageHere>')
    assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
    with torch.no_grad():
        seg_tokens = [
            model.mistral_tokenizer(
                seg, return_tensors="pt", add_special_tokens=i == 0).to(cfg.device).input_ids
            # only add bos to the first seg
            for i, seg in enumerate(prompt_segs)
        ]
        seg_embs = [model.mistral_model.base_model.model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
#         seg_embs = [model.mistral_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
    mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
    mixed_embs = torch.cat(mixed_embs, dim=1)
    return mixed_embs


def ask(text, conv):
    conv.messages.append([conv.roles[0], text])
        

class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True
        return False
    
    
def answer(conv, model, img_list, do_sample=True, max_new_tokens=200, num_beams=1, min_length=1, top_p=0.9,
               repetition_penalty=1.0, length_penalty=1, temperature=1.0, answer_prompt=None, print_res=False):
    stop_words_ids = [
        torch.tensor([2]).to(cfg.device),
        torch.tensor([29871, 2]).to(cfg.device)]  # '</s>' can be encoded in two different ways.
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    
    conv.messages.append([conv.roles[1], answer_prompt])
    embs = get_context_emb(conv, model, img_list, answer_prompt=answer_prompt, print_res=print_res)
    with torch.no_grad():
        outputs = model.mistral_model.generate(
            inputs_embeds=embs,
            max_new_tokens=max_new_tokens,
            stopping_criteria=stopping_criteria,
            num_beams=num_beams,
            do_sample=do_sample,
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )
    output_token = outputs[0]
    if output_token[0] == 0:  # the model might output a unknow token <unk> at the beginning. remove it
            output_token = output_token[1:]
    if output_token[0] == 1:  # some users find that there is a start token <s> at the beginning. remove it
            output_token = output_token[1:]
    output_text = model.mistral_tokenizer.decode(output_token, add_special_tokens=False)
    output_text = output_text.split('</s>')[0]  # remove the stop sign </s>
#     output_text = output_text.split('[/INST]')[-1].strip()
    conv.messages[-1][1] = output_text + '</s>'
    return output_text, output_token.cpu().numpy()



def get_index(num_frames, num_segments):
    seg_size = float(num_frames - 1) / num_segments
    start = int(seg_size / 2)
    offsets = np.array([
        start + int(np.round(seg_size * idx)) for idx in range(num_segments)
    ])
    return offsets


def load_video(video_path, num_segments=8, return_msg=False, resolution=224):
    
    if client is not None and "s3" in video_path:
        video_bytes = client.get(video_path)
        assert(video_bytes is not None)
        vr = VideoReader(BytesIO(video_bytes), ctx=cpu(0), num_threads=1)
    else:
        vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
    num_frames = len(vr)
    frame_indices = get_index(num_frames, num_segments)

    # transform
    crop_size = resolution
    scale_size = resolution
    input_mean = [0.48145466, 0.4578275, 0.40821073]
    input_std = [0.26862954, 0.26130258, 0.27577711]

    transform = T.Compose([
        GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
        GroupCenterCrop(crop_size),
        Stack(),
        ToTorchFormatTensor(),
        GroupNormalize(input_mean, input_std) 
    ])

    images_group = list()
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].numpy())
        images_group.append(img)
    torch_imgs = transform(images_group)
    if return_msg:
        fps = float(vr.get_avg_fps())
        sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
        # " " should be added in the start and end
        msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds. "
        return torch_imgs, msg
    else:
        return torch_imgs
    

# In[8]:


def generate_videochat(vid_path, user_messages):
    
    num_frame = model.clip_frames
    tot_frames = model.total_frames
    resolution = cfg.model.vision_encoder.img_size
    
    vid, msg = load_video(vid_path, num_segments=tot_frames, return_msg=True, resolution=resolution)

    # The model expects inputs of shape: T x C x H x W
    TC, H, W = vid.shape
    video = vid.reshape(1, TC//3, 3, H, W).to(cfg.device)

    img_list = []
    with torch.no_grad():
        image_emb = model.encode_long_video(video,[msg,],"")
        print("Shape of long video embeds: ", image_emb.shape)
#         image_emb, _ = model.encode_img(video, "")
    img_list.append(image_emb)
    
    chat = EasyDict({
    "system": "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail. ",
    "roles": ("[INST]", "[/INST]"),
    "messages": [],
    "sep": ""
    })
    
    chat.messages.append([chat.roles[0], "<Video><VideoHere></Video> [/INST]"])
    ask(msg+user_messages, chat)

    llm_answer = answer(conv=chat, model=model, do_sample=False, img_list=img_list, max_new_tokens=256, print_res=True)[0]
    print("LLM answer:", llm_answer,"\n\n\n")
    
    return llm_answer, chat, img_list

Client on!

model_type               : VideoChat2_it4_mistral_LinearProAda
model_dir                : ./download/parameters
model_pth                : timesuite
output_dir               : Please input model output dir!
batch_size               : 1
infer_clip_frames        : 8
vision_encoder.num_frames: 8


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Enable good initialization!
_IncompatibleKeys(missing_keys=['vision_encoder.encoder.patch_embed.proj.weight', 'vision_encoder.encoder.patch_embed.proj.bias', 'vision_encoder.encoder.blocks.0.norm1.weight', 'vision_encoder.encoder.blocks.0.norm1.bias', 'vision_encoder.encoder.blocks.0.attn.q_bias', 'vision_encoder.encoder.blocks.0.attn.v_bias', 'vision_encoder.encoder.blocks.0.attn.qkv.weight', 'vision_encoder.encoder.blocks.0.attn.proj.weight', 'vision_encoder.encoder.blocks.0.attn.proj.bias', 'vision_encoder.encoder.blocks.0.norm2.weight', 'vision_encoder.encoder.blocks.0.norm2.bias', 'vision_encoder.encoder.blocks.0.mlp.fc1.weight', 'vision_encoder.encoder.blocks.0.mlp.fc1.bias', 'vision_encoder.encoder.blocks.0.mlp.fc2.weight', 'vision_encoder.encoder.blocks.0.mlp.fc2.bias', 'vision_encoder.encoder.blocks.1.norm1.weight', 'vision_encoder.encoder.blocks.1.norm1.bias', 'vision_encoder.encoder.blocks.1.attn.q_bias', 'vision_encoder.encoder.blocks.1.attn.v_bias', 'vision_encoder.encoder

In [3]:
vid_path = "./demo/example/yoga.mp4"
# vid_path = "./example/jesse_dance.mp4"

question="Describe the video in details."

generate_videochat(vid_path,question)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Shape of long video embeds:  torch.Size([1, 384, 4096])
prompt: You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail. [INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9 seconds. Describe the video in details. [/INST]
LLM answer: A woman

('A woman is doing yoga on a rooftop. She is wearing a black top and grey pants. She is doing a pose where she is on her hands and feet, and she is bending forward and backward. The rooftop has a view of the mountains and trees. ',
 {'system': 'You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail. ',
  'roles': ['[INST]', '[/INST]'],
  'messages': [['[INST]', '<Video><VideoHere></Video> [/INST]'],
   ['[INST]',
    'The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.0

In [4]:
data_list = {
    "Action Sequence": ("action_sequence.json", "pnorm2:s3://star/Charades_v1_480/", "video", True), # has start & end
    "Action Prediction": ("action_prediction.json", "pnorm2:s3://star/Charades_v1_480/", "video", True), # has start & end
    "Action Antonym": ("action_antonym.json", "pnorm2:s3://ssv2-video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "pnorm:s3://Moments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "pnorm2:s3://funqa-test/test/", "video", False),
    "Object Existence": ("object_existence.json", "pnorm2:s3://clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "pnorm2:s3://star/Charades_v1_480/", "video", True), # has start & end
    "Object Shuffle": ("object_shuffle.json", "pnorm2:s3://perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "pnorm2:s3://clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "pnorm2:s3://sta/sta_video/", "video", True),  # has start & end
    "Scene Transition": ("scene_transition.json", "pnorm2:s3://scene-qa/video/", "video", False),
    "Action Count": ("action_count.json", "pnorm2:s3://perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "pnorm2:s3://clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "pnorm2:s3://clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "pnorm2:s3://perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "pnorm2:s3://nturgbd/", "video", False),
    "Character Order": ("character_order.json", "pnorm2:s3://perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "pnorm2:s3://vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "pnorm2:s3://tvqa/frames_fps3_hq/", "frame", True),  # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "pnorm2:s3://clevrer/video_validation/", "video", False),
}

data_dir = "./download/datasets/mvbench"

In [5]:
from torch.utils.data import Dataset
import io
from io import BytesIO
from petrel_client.client import Client
from decord import VideoReader, cpu
client = Client('~/petreloss.conf', enable_mc=False)

In [6]:
num_frame = model.clip_frames
tot_frames = model.total_frames
resolution = cfg.model.vision_encoder.img_size

In [7]:
class MVBench_dataset(Dataset):
    def __init__(self, data_dir, data_list, num_segments=8, resolution=224):
        self.data_list = []
        for k, v in data_list.items():
            with open(os.path.join(data_dir, v[0]), 'r') as f:
                json_data = json.load(f)
            for data in json_data:
                self.data_list.append({
                    'task_type': k,
                    'prefix': v[1],
                    'data_type': v[2],
                    'bound': v[3],
                    'data': data
                })
        
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_gif,
            'frame': self.read_frame,
        }
        
        self.num_segments = num_segments
        
        # transform
        crop_size = resolution
        scale_size = resolution
        input_mean = [0.48145466, 0.4578275, 0.40821073]
        input_std = [0.26862954, 0.26130258, 0.27577711]
        self.transform = T.Compose([
            GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
            GroupCenterCrop(crop_size),
            Stack(),
            ToTorchFormatTensor(),
            GroupNormalize(input_mean, input_std) 
        ])
    
    def __str__(self):
        len_list = {}
        option_list = {}
        for data in self.data_list:
            if data['task_type'] not in len_list:
                len_list[data['task_type']] = 0
            len_list[data['task_type']] += 1
            if data['task_type'] not in option_list:
                option_list[data['task_type']] = 0
            option_list[data['task_type']] += len(data['data']['candidates'])
        
        correct = 0
        total = 0
        res = f"There are {len(self.data_list)} videos as follow:\n"
        for k, v in len_list.items():
            correct += len_list[k]
            total += option_list[k]
            res += f"{v} for {k} ({option_list[k]} options => {len_list[k]/option_list[k]*100:.2f}%)\n"
            correct = correct + 1 / option_list[k]
        res += f"Total random accuracy: {correct/total*100:.2f}%"
        return res.rstrip()
        
    def __len__(self):
        return len(self.data_list)
    
    def get_index(self, bound, fps, max_frame, first_idx=0):
        if bound:
            start, end = bound[0], bound[1]
        else:
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices
    
    def read_video(self, video_path, bound=None, return_time=True):
        if "s3://" in video_path:
            video_bytes = client.get(video_path)
            vr = VideoReader(io.BytesIO(video_bytes), ctx=cpu(0), num_threads=1)
        else:
            vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0) 
        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].numpy())
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        if return_time:
            sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
            # " " should be added in the start and end
            msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds. "
            return torch_imgs, msg
        else:
            return torch_imgs
    
    def read_gif(self, video_path, bound=None, fps=25, return_time=True):
        if "s3://" in video_path:
            video_bytes = client.get(video_path)
            gif = imageio.get_reader(io.BytesIO(video_bytes))
        else:
            gif = imageio.get_reader(video_path)
        max_frame = len(gif) - 1
        
        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0) 
        for index, frame in enumerate(gif):
            if index in frame_indices:
                img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
                img = Image.fromarray(img)
                images_group.append(img)
        torch_imgs = self.transform(images_group)
        if return_time:
            sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
            # " " should be added in the start and end
            msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds. "
            return torch_imgs, msg
        else:
            return torch_imgs
    
    def read_frame(self, video_path, bound=None, fps=3, return_time=True):
        if os.path.exists(video_path):
            max_frame = len(os.listdir(video_path))
        else:
            max_frame = len([k for k in client.list(video_path)])
        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1
        for frame_index in frame_indices:
            if "s3://" in video_path:
                img_bytes = client.get(os.path.join(video_path, f"{frame_index:05d}.jpg"))
                img = Image.open(io.BytesIO(img_bytes))
            else:
                img = Image.open(os.path.join(video_path, f"{frame_index:05d}.jpg"))
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        if return_time:
            sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
            # " " should be added in the start and end
            msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds. "
            return torch_imgs, msg
        else:
            return torch_imgs

    def qa_template(self, data):
        question = f"Question: {data['question']}\n"
        question += "Options:\n"
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(data['candidates']):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def __getitem__(self, idx):
        decord_method = self.decord_method[self.data_list[idx]['data_type']]
        bound = None
        if self.data_list[idx]['bound']:
            bound = (
                self.data_list[idx]['data']['start'],
                self.data_list[idx]['data']['end'],
            )
        video_path = os.path.join(self.data_list[idx]['prefix'], self.data_list[idx]['data']['video'])
        torch_imgs, time_inst = decord_method(video_path, bound)
        question, answer = self.qa_template(self.data_list[idx]['data'])
            
        return {
            'video': torch_imgs, 
            'question': question, 
            'answer': answer,
            'task_type': self.data_list[idx]['task_type'],
            'time': time_inst,
        }

In [8]:
dataset = MVBench_dataset(data_dir, data_list, num_segments=tot_frames, resolution=resolution)

In [9]:
def infer_mvbench(
        data_sample, system="", 
        question_prompt='', # add in the end of question
        answer_prompt=None, # add in the begining of answer
        return_prompt='',  # add in the begining of return message
        system_q=False, # whether add question in the system prompt for QFormer
        print_res=True,
        system_llm=False
    ):
    video = data_sample["video"]
    msg=data_sample["time"]
    TC, H, W = video.shape
    video = video.reshape(1, TC//3, 3, H, W).to(cfg.device)
    video_list = []
    with torch.no_grad():
        video_emb = model.encode_long_video(video,[msg,],"")
    video_list.append(video_emb)
#     video_list.append(torch.zeros_like(video_emb))

    chat = EasyDict({
        "system": system,
        "roles": ("[INST]", "[/INST]"),
        "messages": [],
        "sep": ""
    })

    chat.messages.append([chat.roles[0], f"<Video><VideoHere></Video> [/INST]"])
    
    if system_llm:
        prompt = msg + system + data_sample['question'] + question_prompt
    else:
        prompt = msg + data_sample['question'] + question_prompt
    
    ask(prompt, chat)

    llm_message = answer(
        conv=chat, model=model, do_sample=False, 
        img_list=video_list, max_new_tokens=100, 
        answer_prompt=answer_prompt, print_res=print_res
    )[0]
    # remove potential explanation
    llm_message = return_prompt + llm_message.strip().split('\n')[0]
    print(llm_message)
    print(f"GT: {data_sample['answer']}")
    return llm_message

In [10]:
def check_ans(pred, gt):
    flag = False
    
    pred_list = pred.lower().split(' ')
    pred_option, pred_content = pred_list[0], ' '.join(pred_list[1:])
    gt_list = gt.lower().split(' ')
    gt_option, gt_content = gt_list[0], " ".join(gt_list[1:])
    if gt_content[-1] == '.':
        gt_content = gt_content[:-1]
    
    if pred_option.replace('.', '') in gt_option:
        flag = True
    elif gt_option in pred_option:
        flag = True
#     elif gt_content in pred_content:
#         flag = True
#     elif gt_content.replace("a ", "") in pred_content:
#         flag = True
#     elif gt_content.replace("an ", "") in pred_content:
#         flag = True
        
    return flag

In [None]:
correct = 0
total = 0
res_list = []
acc_dict = {}

for example in tqdm(dataset):
    task_type = example['task_type']
    if task_type not in acc_dict:
        acc_dict[task_type] = [0, 0] # correct, total
    acc_dict[task_type][1] += 1
    total += 1
    pred = infer_mvbench(
        example, 
#         "Carefully observe the video and choose the best option for the question. ", 
#         "Carefully watch the video and pay attention to the cause, sequence of events, and object details and movements. Based on your observations, select the best option that accurately addresses the question. ",  # newPrompt
#         "Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question. ", # newPrompt2
        "Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n", # newPrompt2
#         question_prompt="\nOnly give the best option without any explanation.",
#         question_prompt="\nThink it step by step. Only give the best option without any explanation.", # prompt2
        question_prompt="\nOnly give the best option.",  # prompt3
        answer_prompt="Best option:(",
        return_prompt='(',
        system_q=False,
        print_res=True,
        system_llm=True,
    )
    gt = example['answer']
    res_list.append({
        'pred': pred,
        'gt': gt
    })
    if check_ans(pred=pred, gt=gt):
        acc_dict[task_type][0] += 1
        correct += 1
    print(f"Part  Acc: {acc_dict[task_type][0] / acc_dict[task_type][1] * 100 :.2f}%")
    print(f"Total Acc: {correct / total * 100 :.2f}%")
    print('-' * 50, task_type, '-' * 50)

  0%|          | 0/4000 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.6, 1.7, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.6, 3.8, 3.9, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 1

  0%|          | 1/4000 [00:01<1:50:13,  1.65s/it]

(A) Ate the medicine.
GT: (A) Ate the medicine.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.5, 0.7, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.4, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.3, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.4, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.8, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.0, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.7, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9 seconds

  0%|          | 2/4000 [00:03<1:56:17,  1.75s/it]

(C) Opened the closet/cabinet.
GT: (C) Opened the closet/cabinet.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.2, 14.4, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.2, 17.3, 17.4, 17.6, 17.7, 17.8, 17.9, 18.1, 18.2, 18.3, 18.4, 18.6, 18.7, 18.8, 18.9, 19.1, 19.2, 19.3, 19.4, 19.6, 19.7, 19.8, 20.0, 20.1, 20.2, 20.3, 20.5, 20.6, 20.7, 20.8, 21.0, 21.1, 21.2, 21.3, 21.5, 21.6, 21.7, 21.8, 22.0, 22.1, 22.2, 22.3, 22.5, 22.6, 22.7, 22.9, 23.0, 23.1, 23.2, 23.4, 23.5, 23.6, 23.7, 23.9, 24.0, 24.1, 24.2, 24.4, 24.5, 24.6, 24.7, 24.9, 25.0, 25.1, 25.3, 25.4, 25.5, 25.6, 25.8, 25.9, 26.0, 26.1, 26.3, 26.4, 26.5, 26.6, 26.8, 26.9, 27.0, 27.1, 27.3, 27.4, 27.5, 27.7, 27.8, 27.9, 28.0

  0%|          | 3/4000 [00:05<2:01:20,  1.82s/it]

(C) Opened the closet/cabinet.
GT: (C) Opened the closet/cabinet.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.1, 10.2, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.1, 11.2, 11.3, 11

  0%|          | 4/4000 [00:09<3:09:47,  2.85s/it]

(D) Threw the towel.
GT: (D) Threw the towel.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.7, 8.8, 8.8, 8.9, 8.9, 9.0, 9.1, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.6, 9.7, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.4, 10.5, 10.6, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 10.9, 11.0, 11.1, 11.1, 11.2, 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 1

  0%|          | 5/4000 [00:11<2:42:44,  2.44s/it]

(C) Sat on the sofa/couch.
GT: (C) Sat on the sofa/couch.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6 seconds. C

  0%|          | 6/4000 [00:13<2:23:41,  2.16s/it]

(B) Put down the towel.
GT: (B) Put down the towel.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 19.5, 19.6, 19.7, 19.7, 19.8, 19.9, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.0, 21.1, 21.2, 21.2, 21.3, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0, 22.1, 22.2, 22.3, 22.3, 22.4, 22.5, 22.6, 22.7, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8, 23.9, 24.0, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.9, 24.9, 25.0, 25.1, 25.2, 25.3, 25.3, 25.4, 25.5, 25.5, 25.6, 25.7, 25.8, 25.9, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.6, 26.7, 26.8, 26.9, 26.9, 27.0, 27.1, 27.2, 27.2, 27.3, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8, 27.9

  0%|          | 7/4000 [00:14<2:14:22,  2.02s/it]

(C) Closed the window.
GT: (C) Closed the window.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 22.1, 22.2, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.7, 25.8, 25.9, 26.0, 26.1, 26.2, 26.3, 26.4, 26.5, 26.7, 26.8, 26.9, 27.0, 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.9, 28.0, 28.1, 28.2, 28.3, 28.4, 28.5, 28.6, 28.7, 28.8, 28.9, 29.1, 29.2, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.9, 30.0, 30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.8, 30.9, 31.1, 31.2, 31.3, 31.4, 31.5, 31.6, 31.7, 31.8, 31.9, 32.0, 32.2, 32.3, 32.4, 32.5, 32.6, 32.7, 32.8, 32.9, 33.0, 33.1, 33.2, 33.4, 33.5, 33.6, 33.7, 33.8, 33.9, 34.0, 34.1

  0%|          | 8/4000 [00:16<2:08:33,  1.93s/it]

(D) Put down the paper/notebook.
GT: (D) Put down the paper/notebook.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.2, 1

  0%|          | 9/4000 [00:18<2:01:59,  1.83s/it]

(D) Put down the broom.
GT: (D) Put down the broom.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.6, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15

  0%|          | 10/4000 [00:19<1:59:44,  1.80s/it]

(B) Put down the bag.
GT: (B) Put down the bag.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.3, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4 seconds. Carefully 

  0%|          | 11/4000 [00:21<1:55:36,  1.74s/it]

(A) Put down the laptop.
GT: (A) Put down the laptop.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9 seconds. Carefully 

  0%|          | 12/4000 [00:23<1:56:22,  1.75s/it]

(A) Tidied up the table.
GT: (A) Tidied up the table.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.7, 1.9, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.1, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.3 seconds. Carefully 

  0%|          | 13/4000 [00:24<1:50:42,  1.67s/it]

(B) Opened the laptop.
GT: (B) Opened the laptop.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.9, 1.0, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.7, 2.8, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.6, 4.7, 4.9, 5.0, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.5, 8.6, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.8

  0%|          | 14/4000 [00:26<1:52:21,  1.69s/it]

(C) Put down the food.
GT: (C) Put down the food.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.7

  0%|          | 15/4000 [00:28<1:52:46,  1.70s/it]

(D) Took the bag.
GT: (D) Took the bag.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.1, 10.2, 10.2, 10.3, 10.4, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.3, 11.4, 11.5, 11.5, 11.6, 11.6, 11.7, 11.8, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.6, 12.7, 12.7, 12.8, 12.9, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.7, 13.8, 13.8, 13.9, 14.0, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.7, 14.8

  0%|          | 16/4000 [00:30<1:57:40,  1.77s/it]

(D) Tidied up the blanket.
GT: (D) Tidied up the blanket.
Part  Acc: 100.00%
Total Acc: 100.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 28.5, 28.5, 28.6, 28.7, 28.7, 28.8, 28.8, 28.9, 29.0, 29.1, 29.1, 29.2, 29.2, 29.3, 29.4, 29.4, 29.5, 29.5, 29.6, 29.7, 29.7, 29.8, 29.9, 29.9, 30.0, 30.1, 30.1, 30.2, 30.2, 30.3, 30.3, 30.4, 30.5, 30.5, 30.6, 30.6, 30.8, 30.8, 30.9, 30.9, 31.0, 31.0, 31.1, 31.2, 31.2, 31.3, 31.3, 31.4, 31.5, 31.5, 31.6, 31.7, 31.7, 31.8, 31.9, 31.9, 32.0, 32.0, 32.1, 32.2, 32.2, 32.3, 32.3, 32.4, 32.4, 32.6, 32.6, 32.7, 32.7, 32.8, 32.9, 32.9, 33.0, 33.0, 33.1, 33.1, 33.2, 33.3, 33.3, 33.4, 33.5, 33.6, 33.6, 33.7, 33.7, 33.8, 33.8, 33.9, 34.0, 34.0, 34.1, 34.1, 34.2, 34.3, 34.4, 34.4, 34.5, 34.5, 34.6, 34.7, 34.7, 34.8, 34.8, 34.9, 35.0, 35.0, 35.1, 35.2, 35.2, 35.3

  0%|          | 17/4000 [00:31<1:56:02,  1.75s/it]

(B) Put down the food.
GT: (D) Tidied up the table.
Part  Acc: 94.12%
Total Acc: 94.12%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.2, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.3, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.2, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 2.9, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.4, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.3, 7.5, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.3, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.0, 10.2, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.1, 11.1, 11.2, 11.3, 11.4 se

  0%|          | 18/4000 [00:33<1:52:12,  1.69s/it]

(D) Closed the book.
GT: (D) Closed the book.
Part  Acc: 94.44%
Total Acc: 94.44%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 12.3, 12.4, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.1, 13.2, 13.3, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.2, 14.3, 14.4, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.8, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.3, 15.4, 15.4, 15.5, 15.5, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.1, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.6, 16.6, 16.7, 16.7, 16.8, 16.9, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.3, 17.3, 17.4, 17.5, 17.5, 17.6, 17.7, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.1, 18.2, 18.3, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.7, 18.8, 18.8, 18.9, 19.0

  0%|          | 19/4000 [00:35<1:53:02,  1.70s/it]

(A) Closed the closet/cabinet.
GT: (A) Closed the closet/cabinet.
Part  Acc: 94.74%
Total Acc: 94.74%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.3, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 16.

  0%|          | 20/4000 [00:36<1:55:01,  1.73s/it]

(B) Took the paper/notebook.
GT: (B) Took the paper/notebook.
Part  Acc: 95.00%
Total Acc: 95.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.8, 25.9, 26.0, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6

  1%|          | 21/4000 [00:38<1:55:38,  1.74s/it]

(B) Took the paper/notebook.
GT: (B) Took the paper/notebook.
Part  Acc: 95.24%
Total Acc: 95.24%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.1, 11.2, 11.3, 11.3, 11.4, 11.5, 11.5, 11.6, 11

  1%|          | 22/4000 [00:40<1:52:12,  1.69s/it]

(C) Threw the clothes.
GT: (C) Threw the clothes.
Part  Acc: 95.45%
Total Acc: 95.45%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.5, 16.7, 16.8, 17.0, 17.1, 17.2, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.2, 18.3, 18.5, 18.6, 18.7, 18.9, 19.0, 19.1, 19.3, 19.4, 19.5, 19.7, 19.8, 19.9, 20.1, 20.2, 20.3, 20.5, 20.6,

  1%|          | 23/4000 [00:42<1:53:20,  1.71s/it]

(B) Opened the box.
GT: (B) Opened the box.
Part  Acc: 95.65%
Total Acc: 95.65%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.0, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.4, 16.5, 16.6, 16.7, 16.9, 17.0

  1%|          | 24/4000 [00:43<1:55:08,  1.74s/it]

(C) Took the food.
GT: (D) Tidied up the blanket.
Part  Acc: 91.67%
Total Acc: 91.67%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.2, 7.2, 7.4, 7.5, 7.5, 7.7, 7.8, 7.8, 8.0, 8.0, 8.2, 8.2, 8.3, 8.5, 8.5, 8.6, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.

  1%|          | 25/4000 [00:45<1:53:56,  1.72s/it]

(D) Opened the book.
GT: (D) Opened the book.
Part  Acc: 92.00%
Total Acc: 92.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12

  1%|          | 26/4000 [00:47<1:50:44,  1.67s/it]

(A) Sat on the floor.
GT: (A) Sat on the floor.
Part  Acc: 92.31%
Total Acc: 92.31%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 1

  1%|          | 27/4000 [00:48<1:50:40,  1.67s/it]

(A) Took the laptop.
GT: (A) Took the laptop.
Part  Acc: 92.59%
Total Acc: 92.59%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5

  1%|          | 28/4000 [00:50<1:50:03,  1.66s/it]

(A) Put down the dish.
GT: (A) Put down the dish.
Part  Acc: 92.86%
Total Acc: 92.86%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 26.0, 26.1, 26.2, 26.3, 26.4

  1%|          | 29/4000 [00:52<1:50:44,  1.67s/it]

(C) Took the towel.
GT: (C) Took the towel.
Part  Acc: 93.10%
Total Acc: 93.10%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.0 seconds. Carefully 

  1%|          | 30/4000 [00:53<1:53:18,  1.71s/it]

(A) Put down the cup/glass/bottle.
GT: (A) Put down the cup/glass/bottle.
Part  Acc: 93.33%
Total Acc: 93.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.5, 0.6, 0.8, 0.9, 1.0, 1.2, 1.3, 1.5, 1.6, 1.7, 1.9, 2.0, 2.2, 2.3, 2.4, 2.6, 2.7, 2.9, 3.0, 3.1, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.3, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.2, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.5, 7.6, 7.7, 7.9, 8.0, 8.2, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.8, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.2, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.7, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 16.5, 16.7, 16.8, 17.

  1%|          | 31/4000 [00:55<1:56:07,  1.76s/it]

(A) Sat on the sofa/couch.
GT: (A) Sat on the sofa/couch.
Part  Acc: 93.55%
Total Acc: 93.55%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.6, 2.8, 2.9, 3.1, 3.2, 3.4, 3.5, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 5.0, 5.2, 5.3, 5.5, 5.6, 5.8, 5.9, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.1, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.2, 8.4, 8.5, 8.7, 8.9, 9.0, 9.2, 9.3, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.8, 11.0, 11.1, 11.2, 11.4, 11.5, 11.7, 11.8, 12.0, 12.1, 12.3, 12.4, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.5, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.0, 18.2, 18.4, 18.5, 18.7, 18.8, 19.0, 19.1, 19.2, 19.4, 19.5, 19.7, 19.8, 20.0, 

  1%|          | 32/4000 [00:57<1:58:32,  1.79s/it]

(D) Sat on the sofa/couch.
GT: (D) Sat on the sofa/couch.
Part  Acc: 93.75%
Total Acc: 93.75%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 11.9, 12.

  1%|          | 33/4000 [00:59<1:53:15,  1.71s/it]

(B) Took the dish.
GT: (B) Took the dish.
Part  Acc: 93.94%
Total Acc: 93.94%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 1

  1%|          | 34/4000 [01:00<1:51:41,  1.69s/it]

(C) Took the book.
GT: (A) Put down the book.
Part  Acc: 91.18%
Total Acc: 91.18%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.6, 21.7, 21.8, 21.9, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.7, 25.8, 25.9, 26.0, 26.1, 26.2, 26.4, 26.5, 26.6, 26.7, 26.8, 26.9, 27.0, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.9, 28.0, 28.1, 28.2, 28.3

  1%|          | 35/4000 [01:02<1:53:03,  1.71s/it]

(B) Opened the bag.
GT: (B) Opened the bag.
Part  Acc: 91.43%
Total Acc: 91.43%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.1, 7.2, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.3, 19.4, 19.5, 19.6, 

  1%|          | 36/4000 [01:04<1:55:31,  1.75s/it]

(B) Threw the blanket.
GT: (B) Threw the blanket.
Part  Acc: 91.67%
Total Acc: 91.67%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6 seconds. Ca

  1%|          | 37/4000 [01:06<1:52:48,  1.71s/it]

(B) Put down the clothes.
GT: (C) Washed the clothes.
Part  Acc: 89.19%
Total Acc: 89.19%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.3, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 

  1%|          | 38/4000 [01:07<1:51:17,  1.69s/it]

(B) Threw the box.
GT: (B) Threw the box.
Part  Acc: 89.47%
Total Acc: 89.47%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.8, 8.9, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.5, 13.6, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9,

  1%|          | 39/4000 [01:09<1:55:46,  1.75s/it]

(B) Closed the closet/cabinet.
GT: (B) Closed the closet/cabinet.
Part  Acc: 89.74%
Total Acc: 89.74%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.3, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.7, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 17.0, 17.1, 17.2, 17.3, 17.5, 17.6, 17.7, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.2, 19.3, 

  1%|          | 40/4000 [01:11<1:56:38,  1.77s/it]

(B) Put down the bag.
GT: (B) Put down the bag.
Part  Acc: 90.00%
Total Acc: 90.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9, 1.1, 1.2, 1.4, 1.5, 1.6, 1.8, 1.9, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.1, 3.2, 3.4, 3.5, 3.7, 3.8, 3.9, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 4.9, 5.1, 5.2, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.1, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.2, 8.4, 8.5, 8.7, 8.8, 9.0, 9.1, 9.3, 9.4, 9.5, 9.7, 9.8, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 12.0, 12.1, 12.3, 12.4, 12.6, 12.7, 12.8, 13.0, 13.1, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.2, 17.3, 1

  1%|          | 41/4000 [01:13<1:55:37,  1.75s/it]

(A) Put down the paper/notebook.
GT: (C) Sat on the floor.
Part  Acc: 87.80%
Total Acc: 87.80%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.4, 0.5, 0.7, 0.8, 1.0, 1.1, 1.3, 1.4, 1.6, 1.7, 1.9, 2.1, 2.2, 2.4, 2.5, 2.7, 2.8, 3.0, 3.1, 3.3, 3.4, 3.6, 3.7, 3.9, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.4, 5.6, 5.7, 5.9, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.1, 7.3, 7.4, 7.6, 7.7, 7.9, 8.0, 8.2, 8.4, 8.5, 8.7, 8.8, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.0, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.4, 12.5, 12.7, 12.8, 13.0, 13.1, 13.3, 13.4, 13.6, 13.7, 13.9, 14.0, 14.2, 14.3, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.7, 16.8, 17.0, 17.1, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.5, 18.7, 1

  1%|          | 42/4000 [01:14<1:55:31,  1.75s/it]

(C) Took the paper/notebook.
GT: (C) Took the paper/notebook.
Part  Acc: 88.10%
Total Acc: 88.10%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 29.2, 29.3, 29.3, 29.4, 29.4, 29.5, 29.6, 29.6, 29.7, 29.7, 29.8, 29.9, 29.9, 30.0, 30.1, 30.1, 30.2, 30.2, 30.3, 30.4, 30.4, 30.5, 30.5, 30.6, 30.7, 30.7, 30.8, 30.8, 30.9, 31.0, 31.0, 31.1, 31.1, 31.2, 31.3, 31.3, 31.4, 31.5, 31.5, 31.6, 31.6, 31.7, 31.8, 31.8, 31.9, 31.9, 32.0, 32.1, 32.1, 32.2, 32.2, 32.3, 32.4, 32.4, 32.5, 32.6, 32.6, 32.7, 32.7, 32.8, 32.9, 32.9, 33.0, 33.0, 33.1, 33.2, 33.2, 33.3, 33.3, 33.4, 33.5, 33.5, 33.6, 33.6, 33.7, 33.8, 33.8, 33.9, 34.0, 34.0, 34.1, 34.1, 34.2, 34.3, 34.3, 34.4, 34.4, 34.5, 34.6, 34.6, 34.7, 34.7, 34.8, 34.9, 34.9, 35.0, 35.1, 35.1, 35.2, 35.2, 35.3, 35.4, 35.4, 35.5, 35.5, 35.6, 35.7, 35.7, 35.8, 35.8

  1%|          | 43/4000 [01:16<1:53:37,  1.72s/it]

(A) Tidied up the table.
GT: (A) Tidied up the table.
Part  Acc: 88.37%
Total Acc: 88.37%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5 seconds. Car

  1%|          | 44/4000 [01:18<1:51:42,  1.69s/it]

(C) Threw the shoe.
GT: (C) Threw the shoe.
Part  Acc: 88.64%
Total Acc: 88.64%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12

  1%|          | 45/4000 [01:19<1:49:13,  1.66s/it]

(A) Put down the dish.
GT: (C) Put down the phone/camera.
Part  Acc: 86.67%
Total Acc: 86.67%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3

  1%|          | 46/4000 [01:21<1:49:03,  1.65s/it]

(A) Took the blanket.
GT: (A) Took the blanket.
Part  Acc: 86.96%
Total Acc: 86.96%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.2, 9.3, 9.5, 9.6, 9.7, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.8, 10.9, 11.1, 11.2, 11.3, 11.5, 11.6, 11.8, 11.9, 12.1, 12.2, 12.4, 12.5, 12.7, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.2, 17.3, 17.4, 17.6, 17.7, 17.9, 18.0, 18.2, 18.3, 18.5, 18.6, 18.8, 18.9, 19.0, 19.2, 19.3, 19.5, 19.6, 19.8, 19.9, 20.1, 20.2, 20.4, 20.5, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.5, 21.7, 21.8, 22.0, 22.1, 22.2, 22.4, 22.5, 22.7, 22.8, 23.0, 23.1, 23.3, 23.4, 23.6, 23.7, 23.8, 24.0, 24.1, 24.3, 24.4, 24.6, 24.7, 24.9, 25.0, 25.2

  1%|          | 47/4000 [01:23<1:58:05,  1.79s/it]

(B) Took the cup/glass/bottle.
GT: (D) Put down the box.
Part  Acc: 85.11%
Total Acc: 85.11%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.

  1%|          | 48/4000 [01:25<1:59:13,  1.81s/it]

(C) Closed the closet/cabinet.
GT: (C) Closed the closet/cabinet.
Part  Acc: 85.42%
Total Acc: 85.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 1

  1%|          | 49/4000 [01:27<1:59:26,  1.81s/it]

(B) Tidied up the blanket.
GT: (B) Tidied up the blanket.
Part  Acc: 85.71%
Total Acc: 85.71%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 18.9, 19.0, 19.1, 19.2, 19.3, 19.3, 19.5, 19.5, 19.6, 19.7, 19.8, 19.9, 19.9, 20.1, 20.1, 20.2, 20.3, 20.4, 20.5, 20.5, 20.7, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.5, 22.6, 22.7, 22.8, 22.8, 23.0, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.6, 23.6, 23.7, 23.8, 23.9, 24.0, 24.0, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.8, 24.8, 24.9, 25.0, 25.1, 25.2, 25.2, 25.4, 25.4, 25.5, 25.6, 25.7, 25.8, 25.8, 26.0, 26.0, 26.1, 26.2, 26.3, 26.4, 26.4, 26.6, 26.6, 26.7, 26.8, 26.9, 27.0, 27.0, 27.2, 27.2, 27.3, 27.4, 27.5, 27.6, 27.6, 27.8, 27.8, 27.9, 28.0, 28.1, 28.2, 28.3

  1%|▏         | 50/4000 [01:28<1:55:56,  1.76s/it]

(C) Took the laptop.
GT: (C) Took the laptop.
Part  Acc: 86.00%
Total Acc: 86.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 21.2, 21.3, 21.4, 21.5, 21.7, 21.8, 21.9, 22.0, 22.1, 22.3, 22.4, 22.5, 22.6, 22.7, 22.9, 23.0, 23.1, 23.2, 23.3, 23.5, 23.6, 23.7, 23.8, 23.9, 24.1, 24.2, 24.3, 24.4, 24.5, 24.7, 24.8, 24.9, 25.0, 25.1, 25.3, 25.4, 25.5, 25.6, 25.7, 25.9, 26.0, 26.1, 26.2, 26.3, 26.5, 26.6, 26.7, 26.8, 26.9, 27.1, 27.2, 27.3, 27.4, 27.5, 27.7, 27.8, 27.9, 28.0, 28.1, 28.3, 28.4, 28.5, 28.6, 28.7, 28.9, 29.0, 29.1, 29.2, 29.3, 29.4, 29.6, 29.7, 29.8, 29.9, 30.0, 30.2, 30.3, 30.4, 30.5, 30.6, 30.8, 30.9, 31.0, 31.1, 31.2, 31.4, 31.5, 31.6, 31.7, 31.8, 32.0, 32.1, 32.2, 32.3, 32.4, 32.6, 32.7, 32.8, 32.9, 33.0, 33.2, 33.3, 33.4, 33.5, 33.6, 33.8, 33.9, 34.0, 34.1, 34.2

  1%|▏         | 51/4000 [01:30<1:55:01,  1.75s/it]

(B) Put down the towel.
GT: (B) Put down the towel.
Part  Acc: 86.27%
Total Acc: 86.27%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.6, 17.7, 17.8, 17.9, 18.1, 18.2, 18.3, 18.4, 18.6, 18.7, 18.9, 19.0, 19.1, 19.2, 19.4, 19.5, 19.6, 19.7, 19.9, 20.0, 20.1, 20.2, 20.4, 20.5, 20.6, 20.8, 20.9, 21.0, 21.1, 21.3, 21.4, 21.5, 21.6, 21.8, 21.9, 22.1, 22.2, 22.3, 22.4, 22.6, 22.7, 22.8, 22.9, 23.1, 23.2, 23.4, 23.5, 23.6, 23.7, 23.9, 24.0, 24.1, 24.2, 24.4, 24.5, 24.6, 24.7, 24.9, 25.0, 25.2, 25.3, 25.4, 25.5, 25.7, 25.8, 25.9, 26.0, 26.2, 26.3, 26.4, 26.6, 26.7, 26.8, 26.9, 27.1, 27.2, 27.3, 27.4, 27.6

  1%|▏         | 52/4000 [01:32<1:55:17,  1.75s/it]

(B) Took the book.
GT: (B) Took the book.
Part  Acc: 86.54%
Total Acc: 86.54%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.2, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.7, 14.8, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.6, 16

  1%|▏         | 53/4000 [01:33<1:53:04,  1.72s/it]

(D) Sat at the table.
GT: (A) Took the picture.
Part  Acc: 84.91%
Total Acc: 84.91%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.1, 25.2, 25.3, 25.4, 25.5, 25.5, 25.6, 25.7, 25.8, 25.9, 25.9, 26.0, 26.1, 26.2, 26.3, 26.3, 26.4, 26.5, 26.6, 26.7, 26.8, 26.8, 26.9, 27.0, 27.1, 27.2, 27.2, 27.3, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8, 27.9, 28.0, 28.1, 28.1, 28.2, 28.3, 28.4, 28.5, 28.5, 28.6, 28.7, 28.8, 28.9, 28.9, 29.0, 29.1, 29.2, 29.3, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.8, 29.9, 30.0, 30.1, 30.2, 30.2, 30.3, 30.4, 30.5, 30.6, 30.6, 30.7, 30.8, 30.9, 31.0, 31.0, 31.1, 31.2, 31.3, 31.4, 31.5, 31.6, 31.6, 31.7, 31.8, 31.9, 31.9, 32.0, 32.1, 32.2, 32.3, 32.3, 32.4, 32.5, 32.6, 32.7, 32.7, 32.8, 32.9, 33.0, 33.1, 33.2, 33.3, 33.3, 33.4, 33.5

  1%|▏         | 54/4000 [01:35<1:51:54,  1.70s/it]

(D) Put down the laptop.
GT: (D) Put down the laptop.
Part  Acc: 85.19%
Total Acc: 85.19%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.6, 15.6, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.2, 16.3, 16.4, 16.4, 16.5, 16.5, 16.7, 16.7, 16.8, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 18.5, 18.5, 18.6, 18.7, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.1, 19.3, 19.3, 19.4, 19.4, 19.5, 19.6, 19.7, 19.7, 19.8, 19.9, 19.9, 20.0, 20.1, 20.2, 20.2, 20.3, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.1, 21.1, 21.2, 21.3, 21.4, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0, 22.0, 22.1, 22.2, 22.3, 22.3, 22.4, 22.5, 22.5, 22.6, 22.7, 22.8, 22.8, 22.9, 22.9, 23.1, 23.1, 23.2, 23.2

  1%|▏         | 55/4000 [01:37<1:50:16,  1.68s/it]

(B) Took the towel.
GT: (C) Tidied up the clothes.
Part  Acc: 83.64%
Total Acc: 83.64%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.5, 8.7, 8.8, 8.9, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.5, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.0, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.4, 17.5, 17.7, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.1, 19.2, 19.4, 19.5, 19.6, 19.7

  1%|▏         | 56/4000 [01:38<1:48:02,  1.64s/it]

(B) Threw the shoe.
GT: (B) Threw the shoe.
Part  Acc: 83.93%
Total Acc: 83.93%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.9, 4.9, 5.1, 5.1, 5.2, 5.2, 5.4, 5.4, 5.4, 5.6, 5.6, 5.7, 5.7, 5.9, 5.9, 5.9, 6.0, 6.0, 6.2, 6.2, 6.4, 6.4, 6.4, 6.5, 6.5, 6.7, 6.7, 6.9, 6.9, 6.9, 7.0, 7.0, 7.2, 7.2, 7.4, 7.4, 7.4, 7.5, 7.5, 7.7, 7.7, 7.8, 7.8, 7.8, 8.0, 8.0, 8.2, 8.2, 8.3, 8.3, 8.3, 8.5, 8.5, 8.7, 8.7, 8.8, 8.8, 8.8, 9.0, 9.0, 9.2, 9.2, 9.3, 9.3, 9.5, 9.5, 9.5, 9.6, 9.6, 9.8, 9.8, 10.0, 10.0, 10.0, 10.1, 10.1, 10.3, 10.3, 10.5, 10.5, 10.5, 10.6, 10.6, 10.8, 10.8, 10.9, 10.9, 10.9, 11.1, 11.1, 11.3, 11.3, 11.4, 11.4, 11.4, 11.6, 11.6, 11.8, 11.8, 11.9, 11.9, 11.9, 12.1, 12.1, 12.3, 12.3, 12.4, 12.4, 12.4, 12.6, 12.6, 12.7, 12.7, 12.9, 12.9, 12.9, 13.1, 13.1, 13.2, 13.2, 13.4, 13.

  1%|▏         | 57/4000 [01:42<2:20:55,  2.14s/it]

(A) Took the paper/notebook.
GT: (A) Took the paper/notebook.
Part  Acc: 84.21%
Total Acc: 84.21%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2 seconds. Carefu

  1%|▏         | 58/4000 [01:43<2:09:45,  1.98s/it]

(B) Opened the bag.
GT: (B) Opened the bag.
Part  Acc: 84.48%
Total Acc: 84.48%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 19.9, 20.0, 20.0, 20.1, 20.1, 20.2, 20.3, 20.3, 20.4, 20.4, 20.5, 20.6, 20.6, 20.7, 20.7, 20.8, 20.9, 20.9, 21.0, 21.0, 21.1, 21.2, 21.2, 21.3, 21.3, 21.4, 21.5, 21.5, 21.6, 21.6, 21.7, 21.8, 21.8, 21.9, 21.9, 22.0, 22.1, 22.1, 22.2, 22.2, 22.3, 22.4, 22.4, 22.5, 22.5, 22.6, 22.7, 22.7, 22.8, 22.8, 22.9, 23.0, 23.0, 23.1, 23.1, 23.2, 23.3, 23.3, 23.4, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8, 23.8, 23.9, 23.9, 24.0, 24.1, 24.1, 24.2, 24.2, 24.3, 24.4, 24.4, 24.5, 24.5, 24.6, 24.7, 24.7, 24.8, 24.8, 24.9, 25.0, 25.0, 25.1, 25.1, 25.2, 25.3, 25.3, 25.4, 25.4, 25.5, 25.6, 25.6, 25.7, 25.7, 25.8, 25.9, 25.9, 26.0, 26.0, 26.1, 26.2, 26.2, 26.3, 26.3, 26.4, 26.5

  1%|▏         | 59/4000 [01:45<2:02:51,  1.87s/it]

(A) Put down the phone/camera.
GT: (A) Put down the phone/camera.
Part  Acc: 84.75%
Total Acc: 84.75%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.7, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.9, 18.0, 18.1, 18.2, 18.3, 1

  2%|▏         | 60/4000 [01:46<1:59:22,  1.82s/it]

(A) Lied on the bed.
GT: (A) Lied on the bed.
Part  Acc: 85.00%
Total Acc: 85.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.7, 

  2%|▏         | 61/4000 [01:48<1:55:26,  1.76s/it]

(D) Put down the book.
GT: (D) Put down the book.
Part  Acc: 85.25%
Total Acc: 85.25%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 1

  2%|▏         | 62/4000 [01:50<1:53:42,  1.73s/it]

(C) Opened the book.
GT: (C) Opened the book.
Part  Acc: 85.48%
Total Acc: 85.48%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.5, 0.5, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.8, 4.8, 5.0, 5.0, 5.2, 5.2, 5.4, 5.5, 5.5, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.5, 9.7, 9.8, 9.9, 10.0, 10.0, 10.2, 10.2, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8,

  2%|▏         | 63/4000 [01:51<1:51:26,  1.70s/it]

(C) Opened the door.
GT: (B) Put down the box.
Part  Acc: 84.13%
Total Acc: 84.13%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.2, 0.2, 0.3, 0.4, 0.4, 0.4, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2 seconds. Carefully 

  2%|▏         | 64/4000 [01:53<1:48:02,  1.65s/it]

(B) Took the towel.
GT: (D) Took the paper/notebook.
Part  Acc: 82.81%
Total Acc: 82.81%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 25.0, 25.1, 25.1, 25.2, 25.3, 25.3, 25.4, 25.4, 25.5, 25.6, 25.6, 25.7, 25.8, 25.8, 25.9, 26.0, 26.0, 26.1, 26.2, 26.2, 26.3, 26.3, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.8, 26.9, 26.9, 27.0, 27.1, 27.1, 27.2, 27.2, 27.3, 27.4, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8, 27.8, 27.9, 27.9, 28.0, 28.1, 28.1, 28.2, 28.3, 28.3, 28.4, 28.5, 28.5, 28.6, 28.7, 28.7, 28.8, 28.8, 28.9, 29.0, 29.0, 29.1, 29.2, 29.2, 29.3, 29.4, 29.4, 29.5, 29.5, 29.6, 29.7, 29.7, 29.8, 29.9, 30.0, 30.0, 30.1, 30.2, 30.2, 30.3, 30.4, 30.4, 30.5, 30.5, 30.6, 30.7, 30.7, 30.8, 30.9, 30.9, 31.0, 31.1, 31.1, 31.2, 31.2, 31.3, 31.4, 31.4, 31.5, 31.6, 31.6, 31.7, 31.8, 31.8, 31.9, 32.0, 32.0

  2%|▏         | 65/4000 [01:55<1:48:16,  1.65s/it]

(D) Washed the clothes.
GT: (D) Washed the clothes.
Part  Acc: 83.08%
Total Acc: 83.08%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 11.9, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.1, 19.3, 19.4, 19.5, 19.6, 19.8, 19.9, 20.0, 20.2, 20.3, 20.4, 20.6, 20.7, 20.8, 20.9, 21.1, 21.2, 21.3, 21.4, 21.6, 21.7, 21.8, 21.9, 22.1, 22.2, 22.3, 22.4, 22.6, 22.7, 22.8, 22.9, 23.1, 23.2, 23.4, 23.5, 23.6, 23.7, 23.9, 24.0, 24.1, 24.2, 24.4, 24.5, 24.6, 24.7, 24.9, 25.0, 25.1, 25.2, 25.4, 25.5, 25.6, 25.7

  2%|▏         | 66/4000 [01:56<1:49:09,  1.66s/it]

(A) Opened the laptop.
GT: (A) Opened the laptop.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.7, 8.8, 8.8, 9.0, 9.0, 9.2, 9.3, 9.3, 9.5, 9.5, 9.6, 9.8, 9.8, 10.0, 10.1, 10.1, 10.3, 10.3, 10.5, 10.6, 10.6, 10.8, 10.8, 11.0, 11.1, 11.1, 11.3, 11.3, 11.4, 11.6, 11.6, 11.8, 11.9, 11.9, 12.1, 12.1, 12.3, 12.4, 12.4, 12.6, 12.6, 12.7, 12.9, 12.9, 13.1, 13.1, 13.2, 13.4, 13.4, 13.6, 13.6, 13.7, 13.9, 13.9, 14.1, 14.2, 14.2, 14.4, 14.4, 14.5, 14.7, 14.7, 14.9, 14.9, 15.0, 15.2, 15.2, 15.4, 15.4, 15.5, 15.7, 15.7, 15.9, 15.9, 16.0, 16.2, 16.2, 16.3, 16.5, 16.5, 16.7, 16.7, 16.8, 17.0, 17.0, 17.2, 17.2, 17.3, 17.5, 17.5, 17.7, 17.7, 17.8, 18.0, 18.0, 18.1, 18.1, 18.3, 18.5, 18.5, 18.6, 18.8, 18.8, 19.0, 19.0, 19.1, 19.3, 19.3, 19.5, 19.5, 19.6, 19.8,

  2%|▏         | 67/4000 [01:59<2:02:20,  1.87s/it]

(D) Put down the shoe.
GT: (D) Put down the shoe.
Part  Acc: 83.58%
Total Acc: 83.58%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.2, 5.4, 5.5, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.4, 16.5, 16.6, 16.7, 16.8, 17.0, 17.1, 17.2, 17.3, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.6, 18.7, 18.8, 18.9, 19.1, 19.2, 19.3, 19.4, 1

  2%|▏         | 68/4000 [02:00<1:58:40,  1.81s/it]

(A) Sat at the table.
GT: (A) Sat at the table.
Part  Acc: 83.82%
Total Acc: 83.82%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.1, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.2, 9.3, 9.5, 9.6, 9.8, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.7, 13.8, 14.0, 14.1, 14.3, 14.5, 14.6, 14.8, 14.9, 15.1, 15.2, 15.4, 15.5, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.5, 18.7, 18.8, 19.0, 19.1, 19.3, 19.4, 19.6, 19.7, 19.9, 20.0, 20.2, 20.3, 20.5, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.5, 21.7, 21.8, 22.0, 22.1, 22.3, 22.5, 22.6, 22.8, 22.9, 23.1, 23.2, 23.4, 23.5, 23.7, 23.8, 24.0, 

  2%|▏         | 69/4000 [02:02<2:01:34,  1.86s/it]

(D) Threw the pillow.
GT: (D) Threw the pillow.
Part  Acc: 84.06%
Total Acc: 84.06%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 16.1, 16.3, 16.3, 16.5, 16.5, 16.6, 16.8, 16.8, 17.0, 17.0, 17.1, 17.3, 17.3, 17.4, 17.4, 17.6, 17.8, 17.8, 17.9, 17.9, 18.1, 18.1, 18.3, 18.4, 18.4, 18.6, 18.6, 18.7, 18.9, 18.9, 19.1, 19.1, 19.2, 19.4, 19.4, 19.6, 19.6, 19.7, 19.9, 19.9, 20.1, 20.1, 20.2, 20.4, 20.4, 20.5, 20.5, 20.7, 20.7, 20.9, 21.0, 21.0, 21.2, 21.2, 21.4, 21.5, 21.5, 21.7, 21.7, 21.8, 22.0, 22.0, 22.2, 22.2, 22.3, 22.5, 22.5, 22.7, 22.7, 22.8, 23.0, 23.0, 23.1, 23.1, 23.3, 23.5, 23.5, 23.6, 23.6, 23.8, 24.0, 24.0, 24.1, 24.1, 24.3, 24.3, 24.5, 24.6, 24.6, 24.8, 24.8, 24.9, 25.1, 25.1, 25.3, 25.3, 25.4, 25.6, 25.6, 25.8, 25.8, 25.9, 26.1, 26.1, 26.2, 26.2, 26.4, 26.6, 26.6, 26.7

  2%|▏         | 70/4000 [02:07<2:55:36,  2.68s/it]

(B) Threw the shoe.
GT: (B) Threw the shoe.
Part  Acc: 84.29%
Total Acc: 84.29%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.4, 0.4, 0.5, 0.6, 0.8, 0.8, 0.9, 1.0, 1.2, 1.2, 1.3, 1.4, 1.6, 1.6, 1.7, 1.8, 2.0, 2.0, 2.1, 2.2, 2.4, 2.4, 2.5, 2.6, 2.8, 2.8, 2.9, 3.0, 3.2, 3.2, 3.3, 3.4, 3.6, 3.6, 3.7, 3.8, 4.0, 4.0, 4.1, 4.2, 4.4, 4.4, 4.5, 4.6, 4.8, 4.8, 4.9, 5.0, 5.2, 5.2, 5.3, 5.4, 5.6, 5.6, 5.7, 5.8, 6.0, 6.0, 6.1, 6.2, 6.4, 6.4, 6.5, 6.6, 6.8, 6.8, 6.9, 7.0, 7.2, 7.2, 7.3, 7.4, 7.6, 7.6, 7.7, 7.8, 8.0, 8.0, 8.1, 8.2, 8.4, 8.4, 8.5, 8.6, 8.8, 8.8, 8.9, 9.0, 9.2, 9.2, 9.3, 9.4, 9.6, 9.6, 9.7, 9.8, 10.0, 10.0, 10.1, 10.2, 10.4, 10.4, 10.5, 10.6, 10.8, 10.8, 10.9, 11.0, 11.2, 11.2, 11.3, 11.4, 11.6, 11.6, 11.7, 11.8, 12.0, 12.0, 12.1, 12.2, 12.4, 12.4, 12.5, 1

  2%|▏         | 71/4000 [02:08<2:35:03,  2.37s/it]

(C) Put down the pillow.
GT: (D) Took the towel.
Part  Acc: 83.10%
Total Acc: 83.10%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.4, 5.5, 5.5, 5.7, 5.9, 5.9, 6.0, 6.2, 6.2, 6.4, 6.5, 6.5, 6.7, 6.9, 6.9, 7.0, 7.0, 7.2, 7.3, 7.3, 7.5, 7.7, 7.7, 7.8, 8.0, 8.0, 8.2, 8.3, 8.3, 8.5, 8.6, 8.6, 8.8, 9.0, 9.0, 9.1, 9.3, 9.3, 9.5, 9.6, 9.6, 9.8, 10.0, 10.0, 10.1, 10.3, 10.3, 10.4, 10.6, 10.6, 10.8, 10.8, 10.9, 11.1, 11.1, 11.3, 11.4, 11.4, 11.6, 11.7, 11.7, 11.9, 12.1, 12.1, 12.2, 12.4, 12.4, 12.6, 12.7, 12.7, 12.9, 13.1, 13.1, 13.2, 13.4, 13.4, 13.5, 13.7, 13.7, 13.9, 13.9, 14.0, 14.2, 14.2, 14.4, 14.5, 14.5, 14.7, 14.8, 14.8, 15.0, 15.2, 15.2, 15.3, 15.5, 15.5, 15.7, 15.8, 15.8, 16.0, 16.2, 16.2, 16.3, 16.5, 16.5, 16.6, 16.8, 16.8, 17.0, 17.1, 17.1, 17.3, 17.5, 17.5, 17.6, 17.6, 17.8

  2%|▏         | 72/4000 [02:13<3:25:32,  3.14s/it]

(B) Tidied up the table.
GT: (B) Tidied up the table.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.1 seconds. Carefully 

  2%|▏         | 73/4000 [02:15<2:53:05,  2.64s/it]

(B) Took the food.
GT: (B) Took the food.
Part  Acc: 83.56%
Total Acc: 83.56%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.2, 9.4, 9.5, 9.7, 9.8, 10.0, 10.1, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.1, 11.2, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.9, 14.0, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.8, 17.0, 17.1, 17.3, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.6, 18.7, 18.8, 18.9, 19.1, 19.2, 19.4, 19.5, 19.7, 19.8, 20.0, 20.1, 20.2, 20.3, 20.5, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.5, 21.6, 21.7, 21.9, 22.0, 22.2, 22.3, 22.5, 22.6, 22.8, 22.9, 23.0, 23.1, 23.3, 23.4, 23.6, 23.7, 23.9, 24.0, 24.2, 24.3, 24.4, 24.5, 24.

  2%|▏         | 74/4000 [02:17<2:37:12,  2.40s/it]

(B) Tidied up the closet/cabinet.
GT: (B) Tidied up the closet/cabinet.
Part  Acc: 83.78%
Total Acc: 83.78%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.4, 0.5, 0.7, 0.8, 1.0, 1.1, 1.3, 1.5, 1.6, 1.7, 1.9, 2.1, 2.2, 2.4, 2.5, 2.7, 2.8, 3.0, 3.1, 3.3, 3.5, 3.6, 3.7, 3.9, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.5, 5.6, 5.8, 5.9, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.2, 7.3, 7.5, 7.6, 7.8, 8.0, 8.1, 8.2, 8.4, 8.6, 8.7, 8.8, 9.0, 9.2, 9.3, 9.5, 9.6, 9.8, 10.0, 10.1, 10.2, 10.4, 10.6, 10.7, 10.9, 11.0, 11.2, 11.3, 11.5, 11.6, 11.8, 11.9, 12.1, 12.3, 12.4, 12.6, 12.7, 12.9, 13.1, 13.2, 13.3, 13.5, 13.7, 13.8, 13.9, 14.1, 14.3, 14.4, 14.6, 14.7, 14.9, 15.1, 15.2, 15.3, 15.5, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.9, 17.1, 17.2, 17.4, 17.5, 17.7, 17.8, 18.0, 18.2, 18.3, 18.4, 18

  2%|▏         | 75/4000 [02:18<2:22:47,  2.18s/it]

(C) Sat on the bed.
GT: (C) Sat on the bed.
Part  Acc: 84.00%
Total Acc: 84.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.7, 11.7, 11.9, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.7, 13.7, 13.9, 13.9, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.7, 15.7, 15.9, 15.9, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.7, 17.7, 17.9, 17.9, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.7, 19.8,

  2%|▏         | 76/4000 [02:20<2:12:33,  2.03s/it]

(A) Put down the dish.
GT: (A) Put down the dish.
Part  Acc: 84.21%
Total Acc: 84.21%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.5, 25.6, 25.7, 25.8, 25.9, 26.0, 26.1, 26.2, 26.2, 26.3, 26.4, 26.5, 26.6, 26.7, 26.8, 26.9, 27.0, 27.0, 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.7, 27.8, 27.9, 28.0, 28.1, 28.2, 28.3, 28.4, 28.5, 28.5, 28.6, 28.7, 28.8, 28.9, 29.0, 29.1, 29.2, 29.2, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.9, 30.0, 30.0, 30.1, 30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.7, 30.8, 30.9, 31.0, 31.1, 31.2, 31.3, 31.4, 31.5, 31.5, 31.6, 31.7, 31.8, 31.9, 32.0, 32.1, 32.2, 32.2, 32.3, 32.4, 32.5, 32.6, 32.7, 32.8, 32.9, 33.0, 33.0, 33.1, 33.2, 33.3, 33.4, 33.5, 33.6, 33.7, 33.7, 33.8, 33.9, 34.0, 34.1, 34.2, 34.3, 34.4, 34.5

  2%|▏         | 77/4000 [02:22<2:06:51,  1.94s/it]

(D) Put down the broom.
GT: (D) Put down the broom.
Part  Acc: 84.42%
Total Acc: 84.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.4, 17.5, 17.6, 17.7, 17.7, 17.8, 17.9, 18.0, 18.1, 18.1, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.7, 18.8, 18.8, 18.9, 19.0, 19.1, 19.1, 19.2, 19.3, 19.4, 19.5, 19.5, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.2, 20.2, 20.3, 20.4, 20.5, 20.5, 20.6, 20.7, 20.8, 20.9, 20.9, 21.0, 21.1, 21.2, 21.2, 21.3, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.2, 22.3, 22.4, 22.5, 22.5, 22.6, 22.7, 22.8, 22.8, 22.9, 23.0, 23.1, 23.2, 23.2, 23.3, 23.4, 23.5, 23.5, 23.6, 23.7, 23.8, 23.9, 23.9, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.9, 24.9, 25.0, 25.1, 25.2, 25.3, 25.3, 25.4, 25.5, 25.6, 25.6, 25.7, 25.8, 25.9, 26.0

  2%|▏         | 78/4000 [02:23<1:59:48,  1.83s/it]

(B) Put down the bag.
GT: (B) Put down the bag.
Part  Acc: 84.62%
Total Acc: 84.62%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.1, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 9.9, 10.0, 10.0, 10.1, 10.1, 10.2, 10.2, 10.3, 10.4, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.0, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.8, 1

  2%|▏         | 79/4000 [02:25<1:53:23,  1.74s/it]

(A) Put down the blanket.
GT: (A) Put down the blanket.
Part  Acc: 84.81%
Total Acc: 84.81%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.7, 2.8, 2.8, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.4, 10.5, 10.6, 10.6 seconds

  2%|▏         | 80/4000 [02:26<1:50:10,  1.69s/it]

(B) Lied on the floor.
GT: (B) Lied on the floor.
Part  Acc: 85.00%
Total Acc: 85.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.1, 6.1, 6.3, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 6.9, 7.1, 7.1, 7.3, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.3, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.3, 10.5, 10.5, 10.7, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.3, 11.5, 11.5, 11.7, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.5, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.7, 13.9, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.7, 14.9, 14.9, 15.1, 15.1, 15.2, 15.3,

  2%|▏         | 81/4000 [02:28<1:47:26,  1.64s/it]

(B) Ate the sandwich.
GT: (B) Ate the sandwich.
Part  Acc: 85.19%
Total Acc: 85.19%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.8, 14.0, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.0, 16.2, 16.3, 16.3, 16.5, 16.5, 16.6, 16.8, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.5, 18.7, 18.8, 18.8, 19.0, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.2, 21.4, 21.5, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1

  2%|▏         | 82/4000 [02:30<1:47:17,  1.64s/it]

(B) Put down the clothes.
GT: (C) Put down the blanket.
Part  Acc: 84.15%
Total Acc: 84.15%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.5, 15.7, 15.8, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.6, 18.7, 18.9, 19.0, 19.1, 19.3, 19.4, 19.5, 19.6, 19.8, 19.9, 20.1, 20.2, 20.3, 20.5, 20.6, 20.7, 20.8, 21.0, 21.1, 21.3, 21.4, 21.5, 21.6, 21.8, 21.9, 22.0, 22.2, 22.3, 22.5, 22.6, 22.7, 22.8, 22.9, 23.1, 23.2, 23.4, 23.5, 23.6, 23.8, 23.9, 24.0, 24.1, 24.3, 24.4, 24.6, 24.7, 24.8, 24.9, 25.1, 25.2, 25.3, 25.5, 25.6, 25.8, 25.9, 26.0, 26.1, 26.2, 26.4, 26.5, 26.7, 26.8, 26.9, 27.1, 27.2, 27.3, 27.4, 27.6, 27.7, 27.9, 28.0, 28.1, 28.3, 28.4, 28.5, 28.6, 28.8, 28.9, 29.1, 29.2, 29.3, 29.4, 29.6, 29.7, 29.8, 30.0

  2%|▏         | 83/4000 [02:31<1:44:54,  1.61s/it]

(C) Took the blanket.
GT: (C) Took the blanket.
Part  Acc: 84.34%
Total Acc: 84.34%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.8, 18.9, 19.0, 19.1, 19.2, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0

  2%|▏         | 84/4000 [02:33<1:47:57,  1.65s/it]

(C) Sat on the sofa/couch.
GT: (C) Sat on the sofa/couch.
Part  Acc: 84.52%
Total Acc: 84.52%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 24.3, 24.4, 24.4, 24.5, 24.5, 24.6, 24.6, 24.7, 24.8, 24.8, 24.9, 24.9, 25.0, 25.0, 25.0, 25.1, 25.1, 25.2, 25.2, 25.3, 25.3, 25.4, 25.5, 25.5, 25.6, 25.6, 25.7, 25.7, 25.8, 25.9, 25.9, 26.0, 26.0, 26.1, 26.1, 26.2, 26.3, 26.3, 26.4, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.8, 26.8, 26.9, 26.9, 27.0, 27.1, 27.1, 27.2, 27.2, 27.3, 27.3, 27.4, 27.5, 27.5, 27.6, 27.6, 27.7, 27.7, 27.8, 27.8, 27.9, 28.0, 28.0, 28.1, 28.1, 28.2, 28.2, 28.3, 28.4, 28.4, 28.5, 28.5, 28.6, 28.6, 28.7, 28.8, 28.8, 28.8, 28.8, 28.9, 28.9, 29.0, 29.1, 29.1, 29.2, 29.2, 29.3, 29.3, 29.4, 29.5, 29.5, 29.6, 29.6, 29.7, 29.7, 29.8, 29.9, 29.9, 30.0, 30.0, 30.1, 30.1, 30.2, 30.3, 30.3

  2%|▏         | 85/4000 [02:36<2:22:23,  2.18s/it]

(B) Took the box.
GT: (A) Opened the box.
Part  Acc: 83.53%
Total Acc: 83.53%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.7, 7.8, 8.0, 8.2, 8.3, 8.4, 8.6, 8.8, 8.9, 9.0, 9.2, 9.4, 9.5, 9.6, 9.8, 10.0, 10.1, 10.2, 10.4, 10.6, 10.7, 10.8, 11.0, 11.2, 11.3, 11.4, 11.6, 11.8, 11.9, 12.0, 12.2, 12.4, 12.5, 12.6, 12.8, 13.0, 13.1, 13.2, 13.4, 13.6, 13.7, 13.8, 14.0, 14.2, 14.3, 14.4, 14.6, 14.8, 14.9, 15.0, 15.2, 15.4, 15.5, 15.6, 15.8, 16.0, 16.1, 16.2, 16.4, 16.6, 16.7, 16.8, 17.0, 17.2, 17.3, 17.4, 17.6, 17.8, 17.9, 18.0, 18.2, 18.4, 18.5, 18.6, 18.8, 19.0, 19.1, 19.2, 19.4, 19.6, 19.7, 19.8, 20.0, 20.2, 20.3, 20.4, 20.6, 20.8, 20.9, 21.0, 21.2, 21.4, 21.5, 21.6, 21.8, 22.0, 22.1, 22.2, 22.4, 22.6, 22.7, 22.8, 23.0, 23.2, 23.3, 23.4, 23.6, 23.8, 23.9, 24.0, 24.2, 24.4, 2

  2%|▏         | 86/4000 [02:38<2:12:25,  2.03s/it]

(D) Put down the bag.
GT: (D) Put down the bag.
Part  Acc: 83.72%
Total Acc: 83.72%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7

  2%|▏         | 87/4000 [02:40<2:03:13,  1.89s/it]

(B) Threw the shoe.
GT: (B) Threw the shoe.
Part  Acc: 83.91%
Total Acc: 83.91%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.3, 12.4, 12.4, 12.5, 12.6, 12.6, 12.7, 12.7, 12.8, 12.8, 12.9, 12.9, 13.0, 13.1, 13.2, 13.2, 13.3, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.7, 13.8, 13.8, 14.0, 14.0, 14.1, 14.1, 14.2, 14.2, 14.3, 14.3, 14.4, 14.5, 14.5, 14.6, 14.6, 14.7, 14.7, 14.8, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.4, 15.4, 15.5, 15.5, 15.6, 15.7, 15.7, 15.8, 15.9, 15.9, 16.0, 16.0, 16.1, 16.1, 16.2, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.4, 17.5

  2%|▏         | 88/4000 [02:41<1:57:39,  1.80s/it]

(C) Took the blanket.
GT: (C) Took the blanket.
Part  Acc: 84.09%
Total Acc: 84.09%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.5, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.8, 4.0, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.0, 5.2, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 7.9, 8.1, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0 seconds. Carefully 

  2%|▏         | 89/4000 [02:43<1:52:39,  1.73s/it]

(A) Opened the box.
GT: (A) Opened the box.
Part  Acc: 84.27%
Total Acc: 84.27%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 1.1, 1.3, 1.4, 1.5, 1.7, 1.9, 2.0, 2.1, 2.3, 2.5, 2.6, 2.7, 2.9, 3.1, 3.2, 3.3, 3.5, 3.7, 3.8, 3.9, 4.1, 4.3, 4.4, 4.5, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.6, 5.7, 5.9, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.7, 7.8, 7.9, 8.1, 8.3, 8.4, 8.5, 8.7, 8.9, 9.0, 9.1, 9.3, 9.5, 9.6, 9.7, 9.9, 10.1, 10.2, 10.3, 10.5, 10.7, 10.8, 10.9, 11.1, 11.3, 11.4, 11.5, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.7, 14.8, 14.9, 15.1, 15.3, 15.4, 15.5, 15.7, 15.9, 16.0, 16.1, 16.3, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.4, 17.5, 17.7, 17.8, 18.0,

  2%|▏         | 90/4000 [02:44<1:49:11,  1.68s/it]

(B) Put down the food.
GT: (D) Tidied up the clothes.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.2, 15.4, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.3, 16.4, 16.6, 16.7, 16.8, 16.9, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.6, 18.7, 18.8, 18.9, 19.0, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 20.0, 20.1, 20.2, 20.3, 20.4, 20.6, 20.7, 20.8, 20.9, 21.0, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 22.0, 22.1, 22.2, 22.3, 22.4, 22.6, 22.7, 22.8, 22.9, 23.0, 23.2, 23.3, 23.4, 23.5, 23.6, 23.8, 23.8, 23.9, 24.1, 24.2, 24.3, 24.4, 24.6, 24.7, 24.8

  2%|▏         | 91/4000 [02:46<1:50:17,  1.69s/it]

(B) Opened the box.
GT: (B) Opened the box.
Part  Acc: 83.52%
Total Acc: 83.52%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.6, 0.8, 0.9, 1.0, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.5, 14.7, 14.8, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.7, 15.8, 1

  2%|▏         | 92/4000 [02:48<1:48:09,  1.66s/it]

(B) Put down the clothes.
GT: (C) Put down the towel.
Part  Acc: 82.61%
Total Acc: 82.61%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12

  2%|▏         | 93/4000 [02:49<1:44:42,  1.61s/it]

(A) Sat at the table.
GT: (A) Sat at the table.
Part  Acc: 82.80%
Total Acc: 82.80%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 18.9, 19.0, 19.1, 19.2, 19.3, 19.3, 19.4, 19.5, 19.6, 19.7, 19.7, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.0, 21.1, 21.2, 21.3, 21.4, 21.4, 21.5, 21.6, 21.7, 21.8, 21.8, 22.0, 22.0, 22.1, 22.2, 22.3, 22.4, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.3, 23.3, 23.4, 23.5, 23.6, 23.7, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.9, 25.0, 25.0, 25.1, 25.2, 25.3, 25.4, 25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 25.9, 26.0, 26.1, 26.2, 26.3, 26.3, 26.4, 26.5, 26.6, 26.7, 26.7, 26.8, 26.9, 27.0, 27.1, 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8

  2%|▏         | 94/4000 [02:51<1:48:32,  1.67s/it]

(A) Closed the closet/cabinet.
GT: (A) Closed the closet/cabinet.
Part  Acc: 82.98%
Total Acc: 82.98%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9

  2%|▏         | 95/4000 [02:53<1:49:19,  1.68s/it]

(B) Tidied up the table.
GT: (B) Tidied up the table.
Part  Acc: 83.16%
Total Acc: 83.16%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.5, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 15.0, 15.1, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.6, 15.7, 15.8, 15.9, 16.0, 16.0

  2%|▏         | 96/4000 [02:55<1:54:22,  1.76s/it]

(B) Closed the closet/cabinet.
GT: (D) Opened the box.
Part  Acc: 82.29%
Total Acc: 82.29%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 14

  2%|▏         | 97/4000 [02:56<1:55:15,  1.77s/it]

(B) Took the paper/notebook.
GT: (B) Took the paper/notebook.
Part  Acc: 82.47%
Total Acc: 82.47%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.4, 11.6, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.2, 12.4, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.4, 14.6, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.6, 16.8, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.4, 17.6, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 1

  2%|▏         | 98/4000 [02:58<1:52:59,  1.74s/it]

(C) Took the dish.
GT: (C) Took the dish.
Part  Acc: 82.65%
Total Acc: 82.65%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 24.3, 24.4, 24.4, 24.5, 24.5, 24.6, 24.6, 24.7, 24.8, 24.8, 24.9, 24.9, 25.0, 25.0, 25.0, 25.1, 25.1, 25.2, 25.2, 25.3, 25.3, 25.4, 25.5, 25.5, 25.6, 25.6, 25.7, 25.7, 25.8, 25.9, 25.9, 26.0, 26.0, 26.1, 26.1, 26.2, 26.3, 26.3, 26.4, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.8, 26.8, 26.9, 26.9, 27.0, 27.1, 27.1, 27.2, 27.2, 27.3, 27.3, 27.4, 27.5, 27.5, 27.6, 27.6, 27.7, 27.7, 27.8, 27.8, 27.9, 28.0, 28.0, 28.1, 28.1, 28.2, 28.2, 28.3, 28.4, 28.4, 28.5, 28.5, 28.6, 28.6, 28.7, 28.8, 28.8, 28.8, 28.8, 28.9, 28.9, 29.0, 29.1, 29.1, 29.2, 29.2, 29.3, 29.3, 29.4, 29.5, 29.5, 29.6, 29.6, 29.7, 29.7, 29.8, 29.9, 29.9, 30.0, 30.0, 30.1, 30.1, 30.2, 30.3, 30.3

  2%|▏         | 99/4000 [03:01<2:25:57,  2.24s/it]

(B) Put down the shoe.
GT: (B) Put down the shoe.
Part  Acc: 82.83%
Total Acc: 82.83%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.3, 11.3, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.

  2%|▎         | 100/4000 [03:03<2:11:48,  2.03s/it]

(A) Took the book.
GT: (A) Took the book.
Part  Acc: 83.00%
Total Acc: 83.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.3, 1.4, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 

  3%|▎         | 101/4000 [03:05<2:05:19,  1.93s/it]

(B) Opened the refrigerator.
GT: (B) Opened the refrigerator.
Part  Acc: 83.17%
Total Acc: 83.17%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.2, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.6, 14.7, 14.8, 14.8, 14.9, 14.9, 15.0, 15.0, 15.1, 15.2, 15.2, 15.3, 15.3, 15.4, 15.5, 15.5, 15.6, 15.6, 15.7, 15.8, 15.8, 15.9, 15.9, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 16.3, 16.4, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.3, 17.3, 17.4, 17.5, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.5, 18.6, 18.7, 18.7, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.1, 19.2, 19.2, 19.3, 19.4, 19.4, 19.5, 19.5, 19.6, 19.7, 19.7, 19.8, 19.8, 19.9, 20.0, 20.0, 20.1, 20.1, 20.2, 20.2, 20.3, 20.4, 20.4, 20.5, 20.5, 20.6, 20.7

  3%|▎         | 102/4000 [03:06<2:00:43,  1.86s/it]

(A) Put down the towel.
GT: (A) Put down the towel.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.2, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.0, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.8, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.4, 20.5, 20.6, 20.7

  3%|▎         | 103/4000 [03:08<1:57:32,  1.81s/it]

(B) Closed the book.
GT: (B) Closed the book.
Part  Acc: 83.50%
Total Acc: 83.50%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.6, 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 1

  3%|▎         | 104/4000 [03:10<1:55:04,  1.77s/it]

(A) Sat on the floor.
GT: (A) Sat on the floor.
Part  Acc: 83.65%
Total Acc: 83.65%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.3, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.5, 15.7, 15.8, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.0, 17.1, 17.3, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.2, 18.3, 18.5, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.5, 19.7, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.7, 20.9, 21.0, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 21.9, 22.1, 22.2, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.7, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4

  3%|▎         | 105/4000 [03:11<1:53:36,  1.75s/it]

(B) Put down the clothes.
GT: (B) Put down the clothes.
Part  Acc: 83.81%
Total Acc: 83.81%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 15.0, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.9, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.5, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 18.3, 18.3

  3%|▎         | 106/4000 [03:13<1:49:33,  1.69s/it]

(B) Put down the laptop.
GT: (B) Put down the laptop.
Part  Acc: 83.96%
Total Acc: 83.96%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.6

  3%|▎         | 107/4000 [03:15<1:46:11,  1.64s/it]

(C) Put down the book.
GT: (C) Put down the book.
Part  Acc: 84.11%
Total Acc: 84.11%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 26.0, 26.2

  3%|▎         | 108/4000 [03:16<1:45:28,  1.63s/it]

(A) Put down the dish.
GT: (A) Put down the dish.
Part  Acc: 84.26%
Total Acc: 84.26%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.9, 16.0, 16.1, 16.2, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.2, 19.3, 19.4, 19.5, 19.6, 19.8, 19.9, 20.0, 20.2, 20.3, 20.4, 20.5, 20.6, 20.8, 20.9, 21.0, 21.2, 21.3, 21.4, 21.5, 21.6, 21.8, 21.9, 22.0, 22.2, 22.3, 22.4, 22.5, 22.7, 22.8, 22.9, 23.0, 23.2, 23.3, 23.4, 23.6, 23.7, 23.8, 23.9, 24.0, 24.2, 24.3, 24.4, 24.6, 24.7, 24.8, 24.9, 25.0, 25.2, 25.3, 25.4, 25.6, 25.7, 25.8, 25.9, 26.0

  3%|▎         | 109/4000 [03:18<1:47:39,  1.66s/it]

(D) Closed the laptop.
GT: (D) Closed the laptop.
Part  Acc: 84.40%
Total Acc: 84.40%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 18.6, 18.7, 18.8, 18.9, 18.9, 19.0, 19.1, 19.2, 19.3, 19.3, 19.4, 19.5, 19.6, 19.7, 19.7, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.1, 21.2, 21.2, 21.3, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0, 22.1, 22.2, 22.3, 22.3, 22.4, 22.5, 22.6, 22.7, 22.7, 22.8, 22.9, 23.0, 23.1, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.7, 23.8, 23.8, 23.9, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.9, 24.9, 25.0, 25.1, 25.2, 25.3, 25.3, 25.4, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.4, 26.4, 26.5, 26.6, 26.7, 26.8, 26.8, 26.9, 27.0, 27.1, 27.2, 27.2

  3%|▎         | 110/4000 [03:20<1:48:10,  1.67s/it]

(A) Threw the clothes.
GT: (D) Put down the towel.
Part  Acc: 83.64%
Total Acc: 83.64%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.6, 13.7,

  3%|▎         | 111/4000 [03:21<1:47:53,  1.66s/it]

(C) Put down the picture.
GT: (C) Put down the picture.
Part  Acc: 83.78%
Total Acc: 83.78%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 16.9, 17.1, 17.2, 17.3, 17.4, 17.5, 17.7, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.7, 18.9, 19.0, 19.1, 19.2, 19.3, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.2, 20.3, 20.4, 20.5, 20.6, 20.8, 20.9, 21.0, 21.1, 21.2, 21.4, 21.5, 21.6, 21.7, 21.8, 22.0, 22.1, 22.2, 22.3, 22.4, 22.6, 22.7, 22.8, 22.9, 23.0, 23.2, 23.3, 23.4, 23.5, 23.6, 23.8, 23.9, 24.0, 24.1, 24.2, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 25.1, 25.2, 25.3, 25.4, 25.5, 25.7, 25.8, 25.9, 26.0, 26.1, 26.3, 26.4, 26.5, 26.6, 26.7, 26.9

  3%|▎         | 112/4000 [03:23<1:46:51,  1.65s/it]

(D) Opened the laptop.
GT: (D) Opened the laptop.
Part  Acc: 83.93%
Total Acc: 83.93%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.8, 17.9, 1

  3%|▎         | 113/4000 [03:24<1:46:09,  1.64s/it]

(A) Put down the bag.
GT: (A) Put down the bag.
Part  Acc: 84.07%
Total Acc: 84.07%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.5, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.6, 18.7, 18.8, 18.9, 19.1, 19.2, 19.3, 

  3%|▎         | 114/4000 [03:26<1:49:49,  1.70s/it]

(D) Took the clothes.
GT: (D) Took the clothes.
Part  Acc: 84.21%
Total Acc: 84.21%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.4, 10.6, 10.6, 10.8, 10.8, 10.9, 11.1, 11.1, 11.3, 11.4, 11.4, 11.6, 11.6, 11.8, 11.9, 11.9, 12.1, 12.1, 12.2, 12.4, 12.4, 12.6, 12.7, 12.7, 12.9, 12.9, 13.1, 13.2, 13.2, 13.4, 13.6, 13.6, 13.7, 13.7, 13.9, 14.0, 14.0, 14.2, 14.2, 14.4, 14.5, 14.5, 14.7, 14.9, 14.9, 15.0, 15.0, 15.2, 15.3, 15.3, 15.5, 15.5, 15.7, 15.8, 15.8, 16.0, 16.2, 16.2, 16.3, 16.3, 16.5, 16.7, 16.7, 16.8, 17.0, 17.0, 17.1, 17.1, 17.3, 17.5, 17.5, 17.6, 17.6, 17.8, 18.0, 18.0, 18.1, 18.3, 18.3, 18.4, 18.4, 18.6, 18.8, 18.8, 18.9, 18.9, 19.1, 19.3, 19.3, 19.4, 19.6, 19.6, 19.8, 19.8, 19.9, 20.1, 20.1, 20.2, 20.2, 20.4, 20.6, 20.6, 20.7, 20.9, 20.9, 21.1, 21.1, 21.2, 21.4, 21.4

  3%|▎         | 115/4000 [03:34<3:42:53,  3.44s/it]

(D) Opened the box.
GT: (D) Opened the box.
Part  Acc: 84.35%
Total Acc: 84.35%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.0 second

  3%|▎         | 116/4000 [03:36<3:11:19,  2.96s/it]

(D) Took the cup/glass/bottle.
GT: (D) Took the cup/glass/bottle.
Part  Acc: 84.48%
Total Acc: 84.48%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 22.6, 22.7, 22.7, 22.8, 22.9, 22.9, 23.0, 23.0, 23.1, 23.2, 23.2, 23.3, 23.4, 23.5, 23.5, 23.6, 23.7, 23.7, 23.8, 23.9, 24.0, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.4, 24.5, 24.6, 24.6, 24.7, 24.7, 24.8, 24.9, 25.0, 25.0, 25.1, 25.2, 25.2, 25.3, 25.4, 25.5, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.3, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.8, 26.9, 26.9, 27.0, 27.1, 27.2, 27.2, 27.3, 27.4, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8, 27.8, 27.9, 28.0, 28.0, 28.1, 28.2, 28.2, 28.3, 28.4, 28.4, 28.5, 28.6, 28.7, 28.7, 28.8, 28.9, 28.9, 29.0, 29.1, 29.2, 29.2, 29.3, 29.3, 29.4, 29.5, 29.5, 29.6, 29.7, 29.7, 29.8, 29.9, 29.9, 30.0

  3%|▎         | 117/4000 [03:37<2:44:55,  2.55s/it]

(A) Took the towel.
GT: (D) Took the dish.
Part  Acc: 83.76%
Total Acc: 83.76%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.0, 7.1, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18

  3%|▎         | 118/4000 [03:39<2:25:18,  2.25s/it]

(A) Took the pillow.
GT: (A) Took the pillow.
Part  Acc: 83.90%
Total Acc: 83.90%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.5, 7.6, 7.8, 7.9, 8.1, 8.2, 8.4, 8.5, 8.7, 8.8, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.2, 11.3, 11.5, 11.6, 11.8, 11.9, 12.1, 12.2, 12.4, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.9, 15.0, 15.2, 15.3, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.8, 17.0, 17.1, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.5, 18.7, 18.9, 19.0, 19.2, 19.3, 19.5, 19.6, 19.8, 20.0, 20.1, 20.3, 20.4, 20.6, 20.7, 20.9, 21.0, 21.2, 21.3, 21.5, 21.6, 21.8, 21.9, 22.1, 22.3, 22.4, 22.6, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.6, 23.8, 23.9, 24.1, 24.2, 24.4, 24.5, 24.

  3%|▎         | 119/4000 [03:41<2:20:13,  2.17s/it]

(C) Took the cup/glass/bottle.
GT: (C) Took the cup/glass/bottle.
Part  Acc: 84.03%
Total Acc: 84.03%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.1, 6.2, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.6, 18.7, 18.8, 18.

  3%|▎         | 120/4000 [03:43<2:12:48,  2.05s/it]

(B) Took the laptop.
GT: (B) Took the laptop.
Part  Acc: 84.17%
Total Acc: 84.17%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6

  3%|▎         | 121/4000 [03:44<2:05:39,  1.94s/it]

(D) Took the food.
GT: (D) Took the food.
Part  Acc: 84.30%
Total Acc: 84.30%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1 seconds. Carefully 

  3%|▎         | 122/4000 [03:46<1:57:33,  1.82s/it]

(A) Ate the sandwich.
GT: (A) Ate the sandwich.
Part  Acc: 84.43%
Total Acc: 84.43%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.4, 17.4, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.8, 18.8, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 20.0, 20.0, 20.2, 20.2, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.2, 21.2, 21.4, 21.4, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.6, 22.6, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.3, 24.3, 24.5, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.7, 25.7, 25.9, 26.0, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.7, 26.9, 26.9, 27.1, 27.1, 27.3, 27.4, 27.5, 27.6, 27.7, 27.8, 27.9, 28.1, 28.1

  3%|▎         | 123/4000 [03:48<1:58:04,  1.83s/it]

(A) Put down the phone/camera.
GT: (A) Put down the phone/camera.
Part  Acc: 84.55%
Total Acc: 84.55%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.5, 20.5, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.5, 21.5, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.5, 22.5, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.5, 23.5, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.5, 24.5, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.5, 25.5

  3%|▎         | 124/4000 [03:49<1:54:46,  1.78s/it]

(B) Closed the refrigerator.
GT: (B) Closed the refrigerator.
Part  Acc: 84.68%
Total Acc: 84.68%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.6, 13.7, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 16.9, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.9, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.8, 25.9, 26.0

  3%|▎         | 125/4000 [03:51<1:53:16,  1.75s/it]

(D) Took the book.
GT: (D) Took the book.
Part  Acc: 84.80%
Total Acc: 84.80%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.7, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2

  3%|▎         | 126/4000 [03:52<1:48:58,  1.69s/it]

(D) Opened the book.
GT: (D) Opened the book.
Part  Acc: 84.92%
Total Acc: 84.92%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0, 1.1, 1.3, 1.4, 1.5, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.5, 2.5, 2.7, 2.8, 2.9, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.9, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6

  3%|▎         | 127/4000 [03:54<1:46:10,  1.64s/it]

(D) Took the broom.
GT: (D) Took the broom.
Part  Acc: 85.04%
Total Acc: 85.04%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13

  3%|▎         | 128/4000 [03:56<1:46:33,  1.65s/it]

(C) Opened the laptop.
GT: (C) Opened the laptop.
Part  Acc: 85.16%
Total Acc: 85.16%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.7, 9.8, 10.0, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.4, 12.5, 12.7, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.3, 18.5, 18.6, 18.8, 18.9, 19.1, 19.2, 19.3, 19.5, 19.6, 19.8, 19.9, 20.0, 20.2, 20.3, 20.5, 20.6, 20.8, 20.9, 21.0, 21.2, 21.3, 21.5, 21.6, 21.8, 21.9, 22.0, 22.2, 22.3, 22.5, 22.6, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.6, 23.7, 23.9, 24.0, 24.2, 24.3, 24.5, 24.6, 24.7, 24.9, 25.0, 25.2, 

  3%|▎         | 129/4000 [03:57<1:47:00,  1.66s/it]

(B) Put down the bag.
GT: (B) Put down the bag.
Part  Acc: 85.27%
Total Acc: 85.27%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6,

  3%|▎         | 130/4000 [03:59<1:47:38,  1.67s/it]

(D) Took the clothes.
GT: (D) Took the clothes.
Part  Acc: 85.38%
Total Acc: 85.38%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 24.1, 24.1, 24.2, 24.3, 24.3, 24.4, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.9, 24.9, 25.0, 25.1, 25.1, 25.2, 25.2, 25.3, 25.4, 25.4, 25.5, 25.6, 25.6, 25.7, 25.7, 25.8, 25.9, 25.9, 26.0, 26.1, 26.2, 26.2, 26.3, 26.4, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.8, 26.9, 26.9, 27.0, 27.0, 27.1, 27.2, 27.2, 27.3, 27.4, 27.4, 27.5, 27.6, 27.7, 27.7, 27.8, 27.8, 27.9, 28.0, 28.0, 28.1, 28.2, 28.2, 28.3, 28.3, 28.4, 28.5, 28.5, 28.6, 28.7, 28.7, 28.8, 28.8, 29.0, 29.0, 29.1, 29.1, 29.2, 29.3, 29.3, 29.4, 29.5, 29.5, 29.6, 29.6, 29.7, 29.8, 29.8, 29.9, 30.0, 30.0, 30.1, 30.1, 30.2, 30.3, 30.4, 30.4, 30.5, 30.6, 30.6, 30.7, 30.8, 30.8, 30.9, 30.9, 31.0, 31.1, 31.1

  3%|▎         | 131/4000 [04:01<1:51:43,  1.73s/it]

(C) Took the cup/glass/bottle.
GT: (A) Ate the sandwich.
Part  Acc: 84.73%
Total Acc: 84.73%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.3 seconds. Car

  3%|▎         | 132/4000 [04:02<1:47:31,  1.67s/it]

(A) Took the sandwich.
GT: (A) Took the sandwich.
Part  Acc: 84.85%
Total Acc: 84.85%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2 seconds. Careful

  3%|▎         | 133/4000 [04:04<1:44:36,  1.62s/it]

(A) Closed the laptop.
GT: (A) Closed the laptop.
Part  Acc: 84.96%
Total Acc: 84.96%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.5, 5.6, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 7.0, 7.1, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.8, 10.9, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.6, 17.7, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.1, 19.2, 19.4, 19.5, 19.6, 19.7, 19.9, 20.0

  3%|▎         | 134/4000 [04:06<1:48:00,  1.68s/it]

(D) Put down the phone/camera.
GT: (D) Put down the phone/camera.
Part  Acc: 85.07%
Total Acc: 85.07%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.3, 2.5, 2.6, 2.7, 2.9, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.3, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.5, 15.7

  3%|▎         | 135/4000 [04:08<1:49:39,  1.70s/it]

(C) Put down the blanket.
GT: (C) Put down the blanket.
Part  Acc: 85.19%
Total Acc: 85.19%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.7, 13.8, 13.9, 13.9, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.1, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.

  3%|▎         | 136/4000 [04:09<1:49:54,  1.71s/it]

(B) Sat on the sofa/couch.
GT: (B) Sat on the sofa/couch.
Part  Acc: 85.29%
Total Acc: 85.29%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.9, 3.0, 3.0, 3.2, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.1, 9.1, 9.2, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.9, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 12

  3%|▎         | 137/4000 [04:11<1:47:44,  1.67s/it]

(B) Put down the sandwich.
GT: (B) Put down the sandwich.
Part  Acc: 85.40%
Total Acc: 85.40%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.0, 4.1, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.2, 17.

  3%|▎         | 138/4000 [04:12<1:45:36,  1.64s/it]

(C) Put down the food.
GT: (A) Closed the laptop.
Part  Acc: 84.78%
Total Acc: 84.78%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.3, 1.4, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 

  3%|▎         | 139/4000 [04:14<1:48:12,  1.68s/it]

(C) Sat on the sofa/couch.
GT: (C) Sat on the sofa/couch.
Part  Acc: 84.89%
Total Acc: 84.89%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.7, 0.8, 1.0, 1.1, 1.1, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.8, 2.8, 2.9, 3.1, 3.2, 3.2, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.6, 4.6, 4.7, 4.9, 5.0, 5.0, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.7, 6.7, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.1, 8.2, 8.4, 8.5, 8.5, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.3, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.0, 12.1, 12.3, 12.4, 12.4, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.

  4%|▎         | 140/4000 [04:16<1:45:02,  1.63s/it]

(A) Took the shoe.
GT: (A) Took the shoe.
Part  Acc: 85.00%
Total Acc: 85.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9

  4%|▎         | 141/4000 [04:17<1:42:59,  1.60s/it]

(B) Took the sandwich.
GT: (B) Took the sandwich.
Part  Acc: 85.11%
Total Acc: 85.11%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.8, 1.0, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.6, 3.8, 3.9, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 1

  4%|▎         | 142/4000 [04:19<1:45:08,  1.64s/it]

(C) Sat on the bed.
GT: (C) Sat on the bed.
Part  Acc: 85.21%
Total Acc: 85.21%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.8, 13.9, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.6, 14.8, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.6, 18.8, 18.9, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0

  4%|▎         | 143/4000 [04:21<1:44:24,  1.62s/it]

(C) Put down the book.
GT: (C) Put down the book.
Part  Acc: 85.31%
Total Acc: 85.31%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2 seconds. Careful

  4%|▎         | 144/4000 [04:22<1:43:03,  1.60s/it]

(D) Took the broom.
GT: (D) Took the broom.
Part  Acc: 85.42%
Total Acc: 85.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.5, 13.7, 13.9, 13.9, 14.0, 14.2, 14.3, 14.5, 14.7, 14.7, 14.8, 15.0, 15.2, 15.3, 15.3, 15.5, 15.7, 15.8, 16.0, 16.1, 16.1, 16.3, 16.5, 16.6, 16.8, 17.0, 17.0, 17.1, 17.3, 17.4, 17.6, 17.6, 17.8, 17.9, 18.1, 18.3, 18.4, 18.4, 18.6, 18.8, 18.9, 19.1, 19.1, 19.2, 19.4, 19.6, 19.7, 19.9, 19.9, 20.1, 20.2, 20.4, 20.5, 20.5, 20.7, 20.9, 21.0, 21.2, 21.4, 21.4, 21.5, 21.7, 21.8, 22.0, 22.0, 22.2, 22.3, 22.5, 22.7, 22.8, 22.8, 23.0, 23.2, 23.3, 23.5, 23.6, 23.6, 23.8, 24.0, 24.1, 24.3, 24.3, 24.5, 24.6, 24.8, 24.9, 25.1, 25.1, 25.3, 25.4, 25.6, 25.8, 25.8, 25.9, 26.1, 26.3, 26.4, 26.6, 26.6, 26.7, 26.9, 27.1, 27.2, 27.2, 27.4, 27.6, 27.7, 27.9, 28.0, 28.0

  4%|▎         | 145/4000 [04:25<2:00:10,  1.87s/it]

(C) Put down the pillow.
GT: (C) Put down the pillow.
Part  Acc: 85.52%
Total Acc: 85.52%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 5.3, 5.3, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.5, 6.7, 6.7, 6.9, 6.9, 7.0, 7.1, 7.2, 7.3, 7.3, 7.5, 7.5, 7.6, 7.7, 7.8, 7.9, 7.9, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.3, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3, 10.3, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.7, 11.7, 11.8, 11.9, 12.0, 12.1, 12.1, 12.3, 12.3, 12.4, 12.5, 12.6, 12.7, 12.7, 12.9, 12.9, 13.0, 13.1, 13.2, 13.3, 13.3, 13.5, 13.5, 13.6, 13.7, 13.8, 13.9, 13.9, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.7, 14.7, 14.8, 14.9, 15.0, 15.1, 15.1, 15.3, 15.3, 15.4,

  4%|▎         | 146/4000 [04:26<1:54:13,  1.78s/it]

(A) Put down the food.
GT: (C) Took the shoe.
Part  Acc: 84.93%
Total Acc: 84.93%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.1, 12.3, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.3, 13.5, 13.5, 13.6, 13.8, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.8, 15.0, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.0, 16.2, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.2, 17.4, 17.4, 17.5, 17.7, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.4, 18.6, 18.7, 18.7, 18.9, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 19.9, 20.1, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.3, 21.5, 21.6, 21.6, 21.8

  4%|▎         | 147/4000 [04:28<1:51:27,  1.74s/it]

(D) Put down the shoe.
GT: (D) Put down the shoe.
Part  Acc: 85.03%
Total Acc: 85.03%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9,

  4%|▎         | 148/4000 [04:30<1:53:17,  1.76s/it]

(B) Took the paper/notebook.
GT: (B) Took the paper/notebook.
Part  Acc: 85.14%
Total Acc: 85.14%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.3, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.4, 15.5, 15.6, 15.7, 15.7, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.3, 16.3, 16.4, 16.5, 16.5, 16.6, 16.7, 16.8, 16.9, 16.9, 17.0, 17.1, 

  4%|▎         | 149/4000 [04:31<1:48:25,  1.69s/it]

(A) Took the book.
GT: (B) Closed the refrigerator.
Part  Acc: 84.56%
Total Acc: 84.56%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.6, 16.7, 16.8, 16.9, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 24.0, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 25.0

  4%|▍         | 150/4000 [04:33<1:46:06,  1.65s/it]

(A) Closed the box.
GT: (B) Tidied up the closet/cabinet.
Part  Acc: 84.00%
Total Acc: 84.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 26.0, 26.1, 26.2, 26.2

  4%|▍         | 151/4000 [04:34<1:45:48,  1.65s/it]

(C) Put down the dish.
GT: (C) Put down the dish.
Part  Acc: 84.11%
Total Acc: 84.11%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.1, 16.2, 16.3, 16.4, 16.5, 1

  4%|▍         | 152/4000 [04:36<1:43:45,  1.62s/it]

(A) Closed the laptop.
GT: (A) Closed the laptop.
Part  Acc: 84.21%
Total Acc: 84.21%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 28.6, 28.7, 28.8, 28.9, 29.0, 29.1, 29.3, 29.4, 29.5, 29.5, 29.6, 29.7, 29.8, 29.9, 30.0, 30.1, 30.3, 30.4, 30.5, 30.5, 30.6, 30.7, 30.8, 30.9, 31.0, 31.1, 31.2, 31.4, 31.5, 31.5, 31.6, 31.7, 31.8, 31.9, 32.0, 32.1, 32.2, 32.3, 32.5, 32.5, 32.6, 32.7, 32.8, 32.9, 33.0, 33.1, 33.2, 33.3, 33.4, 33.5, 33.6, 33.7, 33.8, 33.9, 34.0, 34.1, 34.2, 34.3, 34.4, 34.5, 34.6, 34.7, 34.8, 34.9, 35.0, 35.1, 35.2, 35.3, 35.4, 35.5, 35.6, 35.7, 35.8, 35.9, 36.0, 36.1, 36.2, 36.3, 36.4, 36.5, 36.6, 36.7, 36.8, 36.9, 37.0, 37.1, 37.2, 37.3, 37.4, 37.5, 37.5, 37.7, 37.8, 37.9, 38.0, 38.1, 38.2, 38.3, 38.4, 38.5, 38.5, 38.6, 38.8, 38.9, 39.0, 39.1, 39.2, 39.3, 39.4, 39.5

  4%|▍         | 153/4000 [04:38<1:45:06,  1.64s/it]

(D) Put down the phone/camera.
GT: (D) Put down the phone/camera.
Part  Acc: 84.31%
Total Acc: 84.31%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 18.1, 18.2, 18.2, 18.4, 18.4, 18.5, 18.5, 18.6, 18.7, 18.8, 18.8, 18.9, 19.0, 19.1, 19.1, 19.2, 19.3, 19.4, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 20.9, 21.0, 21.1, 21.2, 21.2, 21.3, 21.3, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 21.9, 22.0, 22.1, 22.2, 22.2, 22.3, 22.3, 22.5, 22.5, 22.6, 22.6, 22.8, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.2, 23.3, 23.3, 23.5, 23.5, 23.6, 23.6, 23.7, 23.8, 23.9, 23.9, 24.0, 24.1, 24.2, 24.2, 24.3, 24.3, 24.5, 24.5, 24.6, 24.6, 24.7, 24.8, 24.9, 24.9, 25.0, 25.1, 25.2, 25.2, 25.3, 25.4, 25.5, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 25.9

  4%|▍         | 154/4000 [04:39<1:42:48,  1.60s/it]

(B) Threw the towel.
GT: (B) Threw the towel.
Part  Acc: 84.42%
Total Acc: 84.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.0, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.6, 15.6, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.2, 16.2, 16.3, 16.4, 16.5, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.0, 17.1, 17.2, 17.3, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 

  4%|▍         | 155/4000 [04:41<1:41:00,  1.58s/it]

(C) Took the towel.
GT: (C) Took the towel.
Part  Acc: 84.52%
Total Acc: 84.52%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 27.5, 27.6, 27.7, 27.7, 27.8, 27.9, 28.0, 28.0, 28.1, 28.2, 28.2, 28.3, 28.4, 28.4, 28.5, 28.6, 28.6, 28.7, 28.8, 28.9, 28.9, 29.0, 29.1, 29.1, 29.2, 29.3, 29.3, 29.4, 29.5, 29.5, 29.6, 29.7, 29.8, 29.8, 29.9, 30.0, 30.0, 30.1, 30.2, 30.2, 30.3, 30.4, 30.5, 30.5, 30.6, 30.7, 30.7, 30.8, 30.9, 30.9, 31.0, 31.1, 31.1, 31.2, 31.3, 31.4, 31.4, 31.5, 31.6, 31.6, 31.7, 31.8, 31.8, 31.9, 32.0, 32.1, 32.1, 32.2, 32.3, 32.3, 32.4, 32.5, 32.5, 32.6, 32.7, 32.7, 32.8, 32.9, 33.0, 33.0, 33.1, 33.2, 33.2, 33.3, 33.4, 33.4, 33.5, 33.6, 33.7, 33.7, 33.8, 33.9, 33.9, 34.0, 34.1, 34.1, 34.2, 34.3, 34.3, 34.4, 34.5, 34.6, 34.6, 34.7, 34.8, 34.8, 34.9, 35.0, 35.0, 35.1

  4%|▍         | 156/4000 [04:42<1:39:46,  1.56s/it]

(B) Took the towel.
GT: (B) Took the towel.
Part  Acc: 84.62%
Total Acc: 84.62%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.8, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.5, 15.6, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.0, 18.2, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.5, 19.5, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.4, 20.5, 20.6, 20.7, 20.8, 21.0, 21.0, 21.2, 21.3, 21.4, 21.5, 21.6, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.5, 22.5, 22.7, 22.8, 22.9, 23.0, 23.1

  4%|▍         | 157/4000 [04:44<1:40:13,  1.56s/it]

(D) Put down the dish.
GT: (D) Put down the dish.
Part  Acc: 84.71%
Total Acc: 84.71%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.5, 14.6

  4%|▍         | 158/4000 [04:45<1:42:02,  1.59s/it]

(C) Sat on the sofa/couch.
GT: (C) Sat on the sofa/couch.
Part  Acc: 84.81%
Total Acc: 84.81%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.7, 5.7, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.2, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 9.0, 9.0, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.7, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.5, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.0, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.6, 17.7, 17.8, 17.

  4%|▍         | 159/4000 [04:47<1:42:18,  1.60s/it]

(B) Closed the box.
GT: (B) Closed the box.
Part  Acc: 84.91%
Total Acc: 84.91%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.2, 19.4, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.2, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.0, 21.1, 21.2, 21.3, 21.4, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 22.0, 22.0, 22.1, 22.2, 22.2, 22.3, 22.4, 22.4, 22.6, 22.6, 22.7, 22.8, 22.8, 22.9, 23.0, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8, 23.9, 24.0, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.8, 24.9, 25.0, 25.1, 25.2, 25.2

  4%|▍         | 160/4000 [04:48<1:40:06,  1.56s/it]

(A) Put down the laptop.
GT: (A) Put down the laptop.
Part  Acc: 85.00%
Total Acc: 85.00%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.3, 17.4, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 18.5, 18.5, 18.6, 18.7, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.2, 19.3, 19.4, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.0, 21.1, 21.2, 21.2, 21.3, 21.4, 21.5, 21.5, 21.6, 21.7, 21.8, 21.8, 21.9, 22.0, 22.0, 22.1, 22.2, 22.2, 22.3, 22.4, 22.4, 22.5, 22.6, 22.6, 22.7, 22.8, 22.8, 22.9, 23.0, 23.1, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8, 23.8, 23.9, 24.0, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.4, 24.5, 24.6, 24.7, 24.7, 24.8, 24.9

  4%|▍         | 161/4000 [04:50<1:39:35,  1.56s/it]

(B) Took the book.
GT: (B) Took the book.
Part  Acc: 85.09%
Total Acc: 85.09%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.2, 19.4, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.2, 20.2, 20.3, 20.4, 20.4, 20.5, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.0, 21.1, 21.2, 21.3, 21.4, 21.4, 21.5, 21.6, 21.6, 21.7, 21.8, 21.9, 22.0, 22.0, 22.1, 22.2, 22.2, 22.3, 22.4, 22.4, 22.6, 22.6, 22.7, 22.8, 22.8, 22.9, 23.0, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8, 23.9, 24.0, 24.0, 24.1, 24.2, 24.2, 24.3, 24.4, 24.5, 24.6, 24.6, 24.7, 24.8, 24.8, 24.9, 25.0, 25.1, 25.2, 25.2

  4%|▍         | 162/4000 [04:51<1:37:48,  1.53s/it]

(C) Took the book.
GT: (C) Took the book.
Part  Acc: 85.19%
Total Acc: 85.19%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.1, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.6, 2.6, 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 4.9, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.4, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.

  4%|▍         | 163/4000 [04:53<1:39:56,  1.56s/it]

(B) Put down the phone/camera.
GT: (B) Put down the phone/camera.
Part  Acc: 85.28%
Total Acc: 85.28%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 

  4%|▍         | 164/4000 [04:55<1:40:00,  1.56s/it]

(C) Closed the refrigerator.
GT: (C) Closed the refrigerator.
Part  Acc: 85.37%
Total Acc: 85.37%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.

  4%|▍         | 165/4000 [04:56<1:41:36,  1.59s/it]

(D) Put down the shoe.
GT: (D) Put down the shoe.
Part  Acc: 85.45%
Total Acc: 85.45%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 34.9, 35.0, 35.0, 35.1, 35.1, 35.2, 35.3, 35.3, 35.4, 35.5, 35.5, 35.5, 35.5, 35.6, 35.7, 35.7, 35.8, 35.8, 35.9, 36.0, 36.0, 36.1, 36.2, 36.2, 36.3, 36.3, 36.4, 36.5, 36.5, 36.5, 36.6, 36.6, 36.7, 36.7, 36.8, 36.9, 36.9, 37.0, 37.0, 37.1, 37.2, 37.2, 37.3, 37.4, 37.4, 37.5, 37.5, 37.5, 37.6, 37.6, 37.7, 37.7, 37.8, 37.9, 37.9, 38.0, 38.1, 38.1, 38.2, 38.2, 38.3, 38.4, 38.4, 38.5, 38.5, 38.5, 38.6, 38.6, 38.7, 38.8, 38.8, 38.9, 38.9, 39.0, 39.1, 39.1, 39.2, 39.3, 39.3, 39.4, 39.4, 39.5, 39.5, 39.5, 39.6, 39.6, 39.7, 39.8, 39.8, 39.9, 40.0, 40.0, 40.1, 40.1, 40.2, 40.3, 40.3, 40.4, 40.4, 40.5, 40.5, 40.5, 40.6, 40.7, 40.7, 40.8, 40.8, 40.9, 41.0, 41.0

  4%|▍         | 166/4000 [04:59<1:58:45,  1.86s/it]

(C) Took the pillow.
GT: (A) Put down the laptop.
Part  Acc: 84.94%
Total Acc: 84.94%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.6, 9.7, 9.7, 9.8, 9.9, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.3 seconds. Car

  4%|▍         | 167/4000 [05:00<1:54:00,  1.78s/it]

(D) Put down the blanket.
GT: (D) Put down the blanket.
Part  Acc: 85.03%
Total Acc: 85.03%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4 seconds. Carefully 

  4%|▍         | 168/4000 [05:02<1:48:40,  1.70s/it]

(A) Threw the blanket.
GT: (C) Took the book.
Part  Acc: 84.52%
Total Acc: 84.52%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.8, 14.8, 14.9, 15.0, 15.1, 15.2, 15.2, 15.4, 15.4, 15.5, 15.6, 15.6, 15.8, 15.8, 15.9, 16.0, 16.1, 16.2, 16.2, 16.3, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.3, 17.4, 17.5, 17.6, 17.7, 17.7, 17.8, 17.9, 18.0, 18.1, 18.1, 18.3, 18.3, 18.4, 18.5, 18.5, 18.7, 18.7, 18.8, 18.9, 19.0, 19.1, 19.1, 19.2, 19.3, 19.4, 19.5, 19.5, 19.6, 19.7, 19.8, 19.9, 19.9, 20.0, 20.1, 20.2, 20.3, 20.3, 20.5, 20.5, 20.6, 20.7, 20.8, 20.9, 20.9, 21.0, 21.1, 21.2, 21.3, 21.3, 21.4, 21.5, 21.6, 21.7, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.3, 22.3, 22.4, 22.5, 22.6, 22.7, 22.7, 22.8, 22.9, 23.0, 23.1, 23.1, 23.2, 23.3, 23.4, 23.5, 23.5, 23.6, 23.7

  4%|▍         | 169/4000 [05:04<1:49:53,  1.72s/it]

(B) Put down the paper/notebook.
GT: (A) Put down the sandwich.
Part  Acc: 84.02%
Total Acc: 84.02%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.8, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.7, 16.8, 16.9, 17.0, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.9, 18.0, 18.1, 18.2, 18.3, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.6, 20.7, 20.8, 20.9, 21.0, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.3, 23.4, 23.5, 23.6, 23.7, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 26.0, 26.1, 26.2, 26.3, 26.4, 26.6, 26.7, 26.8, 26.9, 27.0, 27.1, 27.3, 27.4, 27.5, 27.6, 27.7, 27.8, 28.0, 28.1, 28.2, 28.3, 28.4, 28.6, 28.7

  4%|▍         | 170/4000 [05:05<1:49:12,  1.71s/it]

(A) Closed the door.
GT: (A) Closed the door.
Part  Acc: 84.12%
Total Acc: 84.12%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 3.0, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.2, 11.4, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.9, 13.0, 13.

  4%|▍         | 171/4000 [05:07<1:46:07,  1.66s/it]

(C) Put down the pillow.
GT: (C) Put down the pillow.
Part  Acc: 84.21%
Total Acc: 84.21%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 15.0, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.9, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.5, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 18.3, 18.3

  4%|▍         | 172/4000 [05:09<1:44:20,  1.64s/it]

(D) Threw the pillow.
GT: (D) Threw the pillow.
Part  Acc: 84.30%
Total Acc: 84.30%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.7, 1.9, 2.0, 2.1, 2.3, 2.4, 2.5, 2.7, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.9, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6, 15.8, 15.9, 16.0, 16.2, 16.3, 16.4, 16.6, 16.7, 16.8, 17.0, 17.1, 17.2, 17.4, 1

  4%|▍         | 173/4000 [05:10<1:45:27,  1.65s/it]

(B) Put down the bag.
GT: (B) Put down the bag.
Part  Acc: 84.39%
Total Acc: 84.39%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.5, 14.5, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.5, 15.5, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.8, 18.8, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.8, 19.8, 20.0, 20.0, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 21.0, 21.0, 21.2, 21.2, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.2, 22.2, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.5, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.5

  4%|▍         | 174/4000 [05:12<1:44:57,  1.65s/it]

(C) Closed the refrigerator.
GT: (C) Closed the refrigerator.
Part  Acc: 84.48%
Total Acc: 84.48%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 1

  4%|▍         | 175/4000 [05:14<2:04:10,  1.95s/it]

(A) Threw the shoe.
GT: (A) Threw the shoe.
Part  Acc: 84.57%
Total Acc: 84.57%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.5, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.4,

  4%|▍         | 176/4000 [05:16<2:02:13,  1.92s/it]

(D) Closed the closet/cabinet.
GT: (D) Closed the closet/cabinet.
Part  Acc: 84.66%
Total Acc: 84.66%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.9, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.9, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.9, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.7, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.1, 15.2, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.3, 16.3, 16.4,

  4%|▍         | 177/4000 [05:18<1:54:40,  1.80s/it]

(B) Closed the door.
GT: (D) Threw the broom.
Part  Acc: 84.18%
Total Acc: 84.18%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 19.7, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.3, 20.5, 20.5, 20.6, 20.7, 20.7, 20.8, 20.9, 21.0, 21.1, 21.1, 21.2, 21.3, 21.4, 21.5, 21.5, 21.6, 21.7, 21.7, 21.9, 21.9, 22.0, 22.1, 22.1, 22.3, 22.3, 22.4, 22.5, 22.5, 22.6, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.3, 23.3, 23.4, 23.5, 23.5, 23.7, 23.7, 23.8, 23.9, 23.9, 24.0, 24.1, 24.2, 24.3, 24.3, 24.4, 24.5, 24.6, 24.7, 24.7, 24.8, 24.9, 25.0, 25.1, 25.1, 25.2, 25.3, 25.3, 25.5, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.9, 26.9, 27.0, 27.1, 27.1, 27.2, 27.3, 27.4, 27.5, 27.5, 27.6, 27.7, 27.8, 27.9, 27.9, 28.0, 28.1, 28.2, 28.3

  4%|▍         | 178/4000 [05:20<1:53:16,  1.78s/it]

(C) Opened the closet/cabinet.
GT: (C) Opened the closet/cabinet.
Part  Acc: 84.27%
Total Acc: 84.27%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.1 secon

  4%|▍         | 179/4000 [05:21<1:51:21,  1.75s/it]

(A) Threw the towel.
GT: (A) Threw the towel.
Part  Acc: 84.36%
Total Acc: 84.36%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.0, 14.1, 14.2, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.9, 15.0, 15.0, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.6,

  4%|▍         | 180/4000 [05:23<1:47:50,  1.69s/it]

(D) Put down the pillow.
GT: (B) Threw the pillow.
Part  Acc: 83.89%
Total Acc: 83.89%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 20.8, 20.9, 21.1, 21.2, 21.4, 21.5, 21.7, 21.8, 22.0, 22.1, 22.3, 22.4, 22.6, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.6, 23.8, 23.9, 24.1, 24.2, 24.4, 24.5, 24.7, 24.8, 25.0, 25.1, 25.3, 25.4, 25.6, 25.7, 25.9, 26.0, 26.2, 26.3, 26.5, 26.6, 26.8, 26.9, 27.1, 27.3, 27.4, 27.6, 27.7, 27.9, 28.0, 28.2, 28.3, 28.5, 28.6, 28.8, 28.9, 29.1, 29.2, 29.4, 29.5, 29.7, 29.8, 30.0, 30.1, 30.3, 30.4, 30.6, 30.7, 30.9, 31.0, 31.2, 31.3, 31.5, 31.6, 31.8, 31.9, 32.1, 32.2, 32.4, 32.5, 32.7, 32.8, 33.0, 33.1, 33.3, 33.4, 33.6, 33.7, 33.9, 34.0, 34.2, 34.3, 34.5, 34.6, 34.8, 34.9, 35.1, 35.2, 35.4, 35.5, 35.7, 35.8, 36.0, 36.1, 36.3, 36.4, 36.6, 36.7, 36.9, 37.1, 37.2

  5%|▍         | 181/4000 [05:25<1:48:03,  1.70s/it]

(C) Put down the dish.
GT: (C) Put down the dish.
Part  Acc: 83.98%
Total Acc: 83.98%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.9, 1.1, 1.2, 1.4, 1.5, 1.7, 1.8, 1.9, 2.1, 2.3, 2.4, 2.5, 2.7, 2.8, 3.0, 3.1, 3.2, 3.4, 3.5, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.6, 4.7, 4.8, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.8, 6.0, 6.1, 6.3, 6.4, 6.5, 6.7, 6.9, 7.0, 7.1, 7.3, 7.4, 7.6, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.9, 10.0, 10.2, 10.3, 10.4, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.5, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.5, 15.6, 15.7, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.0, 1

  5%|▍         | 182/4000 [05:26<1:47:13,  1.68s/it]

(D) Opened the bag.
GT: (D) Opened the bag.
Part  Acc: 84.07%
Total Acc: 84.07%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.6, 2.7, 2.8, 2.9, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.1, 1

  5%|▍         | 183/4000 [05:28<1:46:11,  1.67s/it]

(B) Took the bag.
GT: (B) Took the bag.
Part  Acc: 84.15%
Total Acc: 84.15%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.5, 0.7, 0.8, 1.0, 1.1, 1.3, 1.4, 1.6, 1.7, 1.9, 2.1, 2.2, 2.4, 2.5, 2.7, 2.8, 3.0, 3.1, 3.3, 3.4, 3.6, 3.7, 3.9, 4.0, 4.2, 4.3, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.4, 5.6, 5.7, 5.9, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.1, 7.3, 7.4, 7.6, 7.7, 7.9, 8.0, 8.2, 8.3, 8.5, 8.6, 8.8, 8.9, 9.1, 9.2, 9.4, 9.5, 9.7, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.8, 10.9, 11.1, 11.2, 11.4, 11.5, 11.7, 11.8, 12.0, 12.1, 12.3, 12.4, 12.6, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.9, 15.0, 15.2, 15.4, 15.5, 15.7, 15.8, 16.0, 16.1, 16.3, 16.4, 16.6, 16.8, 16.9, 17.1, 17.2, 17.4, 17.5, 17.7, 17.8, 18.0, 18.1, 18.3, 18.

  5%|▍         | 184/4000 [05:30<1:47:04,  1.68s/it]

(D) Opened the bag.
GT: (A) Put down the paper/notebook.
Part  Acc: 83.70%
Total Acc: 83.70%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.

  5%|▍         | 185/4000 [05:31<1:45:29,  1.66s/it]

(B) Put down the dish.
GT: (B) Put down the dish.
Part  Acc: 83.78%
Total Acc: 83.78%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.1, 15

  5%|▍         | 186/4000 [05:33<1:43:47,  1.63s/it]

(A) Took the bag.
GT: (C) Opened the bag.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.7, 13.8, 13.9, 13.9, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.1, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.

  5%|▍         | 187/4000 [05:34<1:42:18,  1.61s/it]

(C) Took the shoe.
GT: (C) Took the shoe.
Part  Acc: 83.42%
Total Acc: 83.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.3, 18.5, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.0, 19.2, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.0, 20.2, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.7, 20.9, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.4, 21.6, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.3, 22.4, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.3, 24.5, 24.5, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.3, 25.5, 25.5, 25.6, 25.7, 25.8, 25.9, 26.0, 26.0, 26.2, 26.2

  5%|▍         | 188/4000 [05:36<1:41:39,  1.60s/it]

(C) Took the dish.
GT: (C) Took the dish.
Part  Acc: 83.51%
Total Acc: 83.51%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 22.5, 22.6, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.7, 23.8, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.3, 24.4, 24.5, 24.6, 24.7, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.6, 26.7, 26.8, 26.9, 27.0, 27.0, 27.1, 27.2, 27.3, 27.4, 27.5, 27.5, 27.6, 27.7, 27.8, 27.9, 27.9, 28.0, 28.1, 28.2, 28.3, 28.4, 28.4, 28.5, 28.6, 28.7, 28.8, 28.8, 28.9, 29.0, 29.1, 29.2, 29.3, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.8, 29.9, 30.0, 30.1, 30.2, 30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.7, 30.8, 30.9, 31.0, 31.1, 31.1, 31.2, 31.3, 31.4, 31.5

  5%|▍         | 189/4000 [05:37<1:42:30,  1.61s/it]

(C) Took the phone/camera.
GT: (B) Took the book.
Part  Acc: 83.07%
Total Acc: 83.07%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.9, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.1, 19.2, 19.4, 19.5, 19.6, 19.7, 19.9, 20.0, 20.1, 20.2, 20.4, 20.5, 20.6, 20.7, 20.9, 21.0, 21.1, 21.2, 21.4, 21.5, 21.6, 21.7, 21.

  5%|▍         | 190/4000 [05:39<1:42:41,  1.62s/it]

(C) Put down the box.
GT: (C) Put down the box.
Part  Acc: 83.16%
Total Acc: 83.16%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.9, 18.0, 18.0, 18.1, 18.1, 18.2, 18.2, 18.3, 18.3, 18.4, 18.5, 18.5, 18.6, 18.6, 18.7, 18.7, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.1, 19.2, 19.2, 19.3, 19.3, 19.4, 19.5, 19.5, 19.6, 19.6, 19.7, 19.7, 19.8, 19.8, 19.9, 20.0, 20.0, 20.1, 20.1, 20.2, 20.2, 20.3, 20.3, 20.4, 20.5, 20.5, 20.6, 20.6, 20.7, 20.7, 20.8, 20.8, 20.9, 21.0, 21.0, 21.1, 21.1, 21.2, 21.2, 21.3, 21.3, 21.4, 21.4, 21.5, 21.6, 21.6, 21.7, 21.7, 21.8, 21.8, 21.9, 21.9, 22.0, 22.1, 22.1, 22.2, 22.2, 22.3, 22.3, 22.4, 22.4, 22.5, 22.6, 22.6, 22.7, 22.7, 22.8, 22.8, 22.9, 22.9, 23.0, 23.1, 23.1, 23.2, 23.2, 23.3, 23.3, 23.4, 23.4, 23.5, 23.6, 23.6, 23.7, 23.7, 23.8, 23.8, 23.9, 23.9

  5%|▍         | 191/4000 [05:41<1:41:22,  1.60s/it]

(C) Took the sandwich.
GT: (C) Took the sandwich.
Part  Acc: 83.25%
Total Acc: 83.25%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.8, 1.8, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0 seconds

  5%|▍         | 192/4000 [05:42<1:44:50,  1.65s/it]

(D) Put down the paper/notebook.
GT: (D) Put down the paper/notebook.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.5, 13.7, 13.9, 13.9, 14.0, 14.2, 14.3, 14.5, 14.7, 14.7, 14.8, 15.0, 15.2, 15.3, 15.3, 15.5, 15.7, 15.8, 16.0, 16.1, 16.1, 16.3, 16.5, 16.6, 16.8, 17.0, 17.0, 17.1, 17.3, 17.4, 17.6, 17.6, 17.8, 17.9, 18.1, 18.3, 18.4, 18.4, 18.6, 18.8, 18.9, 19.1, 19.1, 19.2, 19.4, 19.6, 19.7, 19.9, 19.9, 20.1, 20.2, 20.4, 20.5, 20.5, 20.7, 20.9, 21.0, 21.2, 21.4, 21.4, 21.5, 21.7, 21.8, 22.0, 22.0, 22.2, 22.3, 22.5, 22.7, 22.8, 22.8, 23.0, 23.2, 23.3, 23.5, 23.6, 23.6, 23.8, 24.0, 24.1, 24.3, 24.3, 24.5, 24.6, 24.8, 24.9, 25.1, 25.1, 25.3, 25.4, 25.6, 25.8, 25.8, 25.9, 26.1, 26.3, 26.4, 26.6, 26.6, 26.7, 26.9, 27.1, 27.2, 27.2, 27.4, 27.6, 27.7, 27.9, 28.0, 28.0

  5%|▍         | 193/4000 [05:45<2:00:45,  1.90s/it]

(D) Put down the pillow.
GT: (D) Put down the pillow.
Part  Acc: 83.42%
Total Acc: 83.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2, 18.3, 18.4, 18.

  5%|▍         | 194/4000 [05:47<1:55:05,  1.81s/it]

(C) Put down the pillow.
GT: (C) Put down the pillow.
Part  Acc: 83.51%
Total Acc: 83.51%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 24.5, 24.6, 24.7, 24.7, 24.8, 24.9, 25.0, 25.0, 25.0, 25.1, 25.2, 25.3, 25.3, 25.4, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.4, 26.5, 26.5, 26.6, 26.7, 26.8, 26.8, 26.9, 27.0, 27.1, 27.2, 27.2, 27.3, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8, 27.9, 28.0, 28.0, 28.1, 28.2, 28.3, 28.3, 28.4, 28.5, 28.6, 28.7, 28.7, 28.8, 28.8, 28.9, 29.0, 29.0, 29.1, 29.2, 29.3, 29.3, 29.4, 29.5, 29.6, 29.7, 29.7, 29.8, 29.9, 30.0, 30.1, 30.1, 30.2, 30.3, 30.4, 30.5, 30.5, 30.6, 30.7, 30.8, 30.8, 30.9, 31.0, 31.1, 31.2, 31.2, 31.3, 31.4, 31.5, 31.6, 31.6, 31.7, 31.8, 31.9, 32.0, 32.0, 32.1, 32.2, 32.3, 32.3, 32.4, 32.5, 32.6, 32.7, 32.7, 32.7, 32.8

  5%|▍         | 195/4000 [05:49<2:13:08,  2.10s/it]

(D) Closed the box.
GT: (D) Closed the box.
Part  Acc: 83.59%
Total Acc: 83.59%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 25.0, 25.1, 25.1, 25.2, 25.3, 25.3, 25.4, 25.4, 25.5, 25.6, 25.6, 25.7, 25.8, 25.8, 25.9, 26.0, 26.0, 26.1, 26.2, 26.2, 26.3, 26.3, 26.4, 26.5, 26.5, 26.6, 26.7, 26.7, 26.8, 26.9, 26.9, 27.0, 27.1, 27.1, 27.2, 27.2, 27.3, 27.4, 27.4, 27.5, 27.6, 27.6, 27.7, 27.8, 27.8, 27.9, 27.9, 28.0, 28.1, 28.1, 28.2, 28.3, 28.3, 28.4, 28.5, 28.5, 28.6, 28.7, 28.7, 28.8, 28.8, 28.9, 29.0, 29.0, 29.1, 29.2, 29.2, 29.3, 29.4, 29.4, 29.5, 29.5, 29.6, 29.7, 29.7, 29.8, 29.9, 30.0, 30.0, 30.1, 30.2, 30.2, 30.3, 30.4, 30.4, 30.5, 30.5, 30.6, 30.7, 30.7, 30.8, 30.9, 30.9, 31.0, 31.1, 31.1, 31.2, 31.2, 31.3, 31.4, 31.4, 31.5, 31.6, 31.6, 31.7, 31.8, 31.8, 31.9, 32.0, 32.0

  5%|▍         | 196/4000 [05:51<2:04:22,  1.96s/it]

(A) Put down the blanket.
GT: (A) Put down the blanket.
Part  Acc: 83.67%
Total Acc: 83.67%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.9, 10.9, 11.0, 11.1, 11.1, 11.2, 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.1, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.4, 12.5, 12.6, 12.6, 12.7, 12.7, 12.8, 12.9, 12.9, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.3, 13.4, 13.5, 13.5, 13.6, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.2, 14.3, 14.4, 14.4, 14.5, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.8, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.3, 15.3, 15.3, 15.4, 15.4, 15.5, 15.6, 15.6, 15.7, 15.7, 15.8, 15.9, 15.9, 16.0, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 16.3, 16.4, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.8, 16.9, 16.9

  5%|▍         | 197/4000 [05:53<2:11:29,  2.07s/it]

(A) Put down the paper/notebook.
GT: (A) Put down the paper/notebook.
Part  Acc: 83.76%
Total Acc: 83.76%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3 seconds. Carefu

  5%|▍         | 198/4000 [05:55<2:00:56,  1.91s/it]

(B) Closed the book.
GT: (C) Put down the paper/notebook.
Part  Acc: 83.33%
Total Acc: 83.33%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.2, 1.4, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.6, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.4, 16.5, 1

  5%|▍         | 199/4000 [05:56<1:56:32,  1.84s/it]

(A) Put down the box.
GT: (A) Put down the box.
Part  Acc: 83.42%
Total Acc: 83.42%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 23.3, 23.5, 23.6, 23.8, 24.0, 24.1, 24.2, 24.4, 24.6, 24.7, 24.9, 25.0, 25.2, 25.4, 25.5, 25.6, 25.8, 26.0, 26.1, 26.3, 26.4, 26.6, 26.8, 26.9, 27.0, 27.2, 27.4, 27.5, 27.7, 27.8, 28.0, 28.2, 28.3, 28.4, 28.6, 28.8, 28.9, 29.1, 29.2, 29.4, 29.6, 29.7, 29.8, 30.0, 30.2, 30.3, 30.5, 30.6, 30.8, 31.0, 31.1, 31.2, 31.4, 31.6, 31.7, 31.9, 32.0, 32.2, 32.4, 32.5, 32.6, 32.8, 33.0, 33.1, 33.3, 33.4, 33.6, 33.8, 33.9, 34.0, 34.2, 34.4, 34.5, 34.7, 34.8, 35.0, 35.2, 35.3, 35.4, 35.6, 35.8, 35.9, 36.1, 36.2, 36.4, 36.6, 36.7, 36.8, 37.0, 37.2, 37.3, 37.5, 37.6, 37.8, 38.0, 38.1, 38.3, 38.4, 38.6, 38.7, 38.9, 39.0, 39.2, 39.4, 39.5, 39.7, 39.8, 40.0, 40.1, 40.3

  5%|▌         | 200/4000 [05:58<1:55:00,  1.82s/it]

(B) Put down the paper/notebook.
GT: (B) Put down the paper/notebook.
Part  Acc: 83.50%
Total Acc: 83.50%
-------------------------------------------------- Action Sequence --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.3 sec

  5%|▌         | 201/4000 [06:00<1:50:25,  1.74s/it]

(B) Open the door.
GT: (A) Put down the pillow.
Part  Acc: 0.00%
Total Acc: 83.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.2, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.1, 13.

  5%|▌         | 202/4000 [06:01<1:45:08,  1.66s/it]

(C) Take the book.
GT: (D) Open the book.
Part  Acc: 0.00%
Total Acc: 82.67%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.7, 5.8 seconds. Carefully 

  5%|▌         | 203/4000 [06:03<1:40:37,  1.59s/it]

(B) Take the book.
GT: (B) Take the book.
Part  Acc: 33.33%
Total Acc: 82.76%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.8, 1.0, 1.3, 1.5, 1.7, 2.0, 2.2, 2.4, 2.6, 2.9, 3.1, 3.4, 3.6, 3.8, 4.1, 4.3, 4.6, 4.8, 5.0, 5.3, 5.5, 5.7, 6.0, 6.2, 6.4, 6.7, 6.9, 7.2, 7.4, 7.6, 7.9, 8.1, 8.4, 8.6, 8.8, 9.1, 9.3, 9.6, 9.7, 10.0, 10.2, 10.5, 10.7, 10.9, 11.2, 11.4, 11.7, 11.9, 12.1, 12.4, 12.6, 12.9, 13.1, 13.3, 13.5, 13.8, 14.0, 14.2, 14.5, 14.7, 15.0, 15.2, 15.4, 15.7, 15.9, 16.2, 16.4, 16.7, 16.9, 17.1, 17.3, 17.6, 17.8, 18.0, 18.3, 18.5, 18.8, 19.0, 19.2, 19.5, 19.7, 20.0, 20.2, 20.4, 20.7, 20.9, 21.1, 21.3, 21.6, 21.8, 22.1, 22.3, 22.5, 22.8, 23.0, 23.3, 23.5, 23.7, 24.0, 24.2, 24.5, 24.7, 24.9, 25.1, 25.4, 25.6, 25.8, 26.1, 26.3, 26.6, 26.8, 27.0, 27.3, 27.5

  5%|▌         | 204/4000 [06:04<1:42:17,  1.62s/it]

(B) Put down the clothes.
GT: (D) Throw the blanket.
Part  Acc: 25.00%
Total Acc: 82.35%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.6, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 15.9, 16.0, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.9, 18.0, 18.1, 18.2, 18.3, 18.5, 18.6, 18.7, 18.7, 18.8, 19.0, 19.1, 19.2, 19.3, 19.4, 19.6, 19.7, 19.8, 19.9, 20.1, 20.1, 20.2, 20.3, 20.4, 20.6, 20.7, 20.8, 20.9, 21.0, 21.2, 21.3, 21.4, 21.5, 21.5,

  5%|▌         | 205/4000 [06:06<1:39:35,  1.57s/it]

(B) Open the box.
GT: (C) Tidy up the closet/cabinet.
Part  Acc: 20.00%
Total Acc: 81.95%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.5 seconds. Carefully 

  5%|▌         | 206/4000 [06:07<1:37:00,  1.53s/it]

(A) Take the shoe.
GT: (C) Take the paper/notebook.
Part  Acc: 16.67%
Total Acc: 81.55%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.1, 14.2, 14.4, 14.6, 14.7, 14.9, 15.0, 15.2, 15.4, 15.5, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.7, 17.8, 18.0, 18.2, 18.3, 18.5, 18.6, 18.8, 19.0, 19.1, 19.3, 19.5, 19.6, 19.8, 19.9, 20.1, 20.3, 20.4, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.6, 21.7, 21.9, 22.0, 22.2, 22.4, 22.5, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.7, 23.8, 24.0, 24.2, 24.3, 24.5, 24.6, 24.8, 25.0, 25.1, 25.3, 25.5, 25.6, 25.8, 25.9, 26.1, 26.3, 26.4, 26.6, 26.8, 26.9, 27.1, 27.2, 27.4, 27.6, 27.7, 27.9, 28.1, 28.2, 28.4, 28.5, 28.7, 28.9, 29.0, 29.2, 29.4, 29.5, 29.7, 29.8, 30.0, 30.2, 30.3, 30.5, 30.6, 30.8, 31.0, 31.1, 31.3, 31.5, 31.6, 31.8

  5%|▌         | 207/4000 [06:09<1:40:10,  1.58s/it]

(B) Sit on.
GT: (D) Lie on.
Part  Acc: 14.29%
Total Acc: 81.16%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.4, 0.7, 0.9, 1.2, 1.5, 1.7, 2.0, 2.3, 2.5, 2.9, 3.1, 3.3, 3.7, 3.9, 4.2, 4.5, 4.7, 5.0, 5.3, 5.5, 5.8, 6.1, 6.3, 6.6, 6.9, 7.1, 7.4, 7.7, 7.9, 8.3, 8.5, 8.8, 9.1, 9.3, 9.6, 9.9, 10.1, 10.4, 10.7, 10.9, 11.2, 11.5, 11.7, 12.0, 12.3, 12.6, 12.8, 13.1, 13.4, 13.6, 13.9, 14.2, 14.5, 14.7, 15.0, 15.3, 15.5, 15.8, 16.1, 16.4, 16.6, 16.9, 17.2, 17.4, 17.7, 18.0, 18.2, 18.5, 18.8, 19.0, 19.4, 19.6, 19.9, 20.2, 20.4, 20.7, 21.0, 21.2, 21.5, 21.8, 22.0, 22.3, 22.6, 22.8, 23.1, 23.4, 23.6, 23.9, 24.2, 24.4, 24.8, 25.0, 25.2, 25.6, 25.8, 26.1, 26.4, 26.6, 26.9, 27.2, 27.4, 27.7, 28.0, 28.2, 28.5, 28.8, 29.0, 29.3, 29.6, 29.9, 30.1, 30.4, 30.7, 31.0, 31.2,

  5%|▌         | 208/4000 [06:11<1:40:02,  1.58s/it]

(D) Take the book.
GT: (A) Take the clothes.
Part  Acc: 12.50%
Total Acc: 80.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.1, 13.2, 13.3, 13.4, 13.5, 13.5, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.1, 1

  5%|▌         | 209/4000 [06:12<1:40:01,  1.58s/it]

(B) Put down the laptop.
GT: (B) Put down the laptop.
Part  Acc: 22.22%
Total Acc: 80.86%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.8, 2.1, 2.4, 2.7, 3.1, 3.4, 3.7, 4.0, 4.3, 4.6, 4.9, 5.2, 5.5, 5.8, 6.1, 6.4, 6.7, 7.0, 7.3, 7.6, 7.9, 8.2, 8.5, 8.8, 9.1, 9.5, 9.8, 10.1, 10.4, 10.7, 11.0, 11.3, 11.6, 11.9, 12.2, 12.5, 12.8, 13.1, 13.4, 13.7, 14.0, 14.3, 14.6, 14.9, 15.2, 15.5, 15.8, 16.2, 16.5, 16.8, 17.1, 17.4, 17.7, 18.0, 18.3, 18.6, 18.9, 19.2, 19.5, 19.8, 20.1, 20.5, 20.8, 21.1, 21.4, 21.7, 22.0, 22.3, 22.6, 22.9, 23.2, 23.5, 23.8, 24.1, 24.4, 24.7, 25.0, 25.3, 25.6, 25.9, 26.2, 26.5, 26.9, 27.2, 27.5, 27.8, 28.1, 28.4, 28.7, 29.0, 29.3, 29.6, 29.9, 30.2, 30.5, 30.8, 31.1, 31.4, 31.7, 32.0, 32.3, 32.6, 32.9, 33.2, 33.6, 33.9, 34.2, 34.5, 34.8, 35.1, 35.4, 35.7, 36.0, 36.3, 3

  5%|▌         | 210/4000 [06:14<1:48:52,  1.72s/it]

(D) Take the bag.
GT: (D) Take the bag.
Part  Acc: 30.00%
Total Acc: 80.95%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2 se

  5%|▌         | 211/4000 [06:16<1:44:54,  1.66s/it]

(A) Put down the bag.
GT: (A) Put down the bag.
Part  Acc: 36.36%
Total Acc: 81.04%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 1

  5%|▌         | 212/4000 [06:17<1:42:52,  1.63s/it]

(A) Take the picture.
GT: (A) Take the picture.
Part  Acc: 41.67%
Total Acc: 81.13%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.6, 14.7, 14.7, 14.8, 14.8, 14.9, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.4, 15.4, 15.5, 15.5, 15.6, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.2, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.0, 17.1, 17.1, 17.2, 17.2, 17.3, 17.3, 17.4, 17.4, 17.5, 17.5, 17.6, 17.7, 17.7, 17.8, 17.8, 17.9, 17.9, 18.0, 18.0, 18.1, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.5, 18.6, 18.6, 18.7, 18.7, 18.8, 18.8, 18.9, 18.9, 19.0, 19.1, 19.1, 19.2, 19.2, 19.3, 19.3, 19.4, 19.4, 19.5, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 19.9, 20.0, 20.0, 20.1, 20.1, 20.2, 20.2, 20.3, 20.3, 20.4, 20.5, 20.5

  5%|▌         | 213/4000 [06:19<1:45:25,  1.67s/it]

(D) Put down the paper/notebook.
GT: (D) Put down the paper/notebook.
Part  Acc: 46.15%
Total Acc: 81.22%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.8, 0.9, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.1, 9.2 seconds. Carefully 

  5%|▌         | 214/4000 [06:21<1:44:15,  1.65s/it]

(B) Wash the window.
GT: (B) Wash the window.
Part  Acc: 50.00%
Total Acc: 81.31%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 1

  5%|▌         | 215/4000 [06:22<1:44:19,  1.65s/it]

(B) Take the phone/camera.
GT: (C) Take the blanket.
Part  Acc: 46.67%
Total Acc: 80.93%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.1, 6.2, 6.4, 6.6, 6.7, 6.9, 7.1, 7.3, 7.4, 7.6, 7.8, 7.9, 8.1, 8.3, 8.4, 8.6, 8.8, 9.0, 9.1, 9.3, 9.5, 9.6, 9.8, 10.0, 10.2, 10.4, 10.5, 10.7, 10.9, 11.0, 11.2, 11.4, 11.5, 11.7, 11.9, 12.1, 12.2, 12.4, 12.6, 12.7, 12.9, 13.1, 13.2, 13.4, 13.6, 13.8, 13.9, 14.1, 14.3, 14.4, 14.6, 14.8, 14.9, 15.1, 15.3, 15.5, 15.6, 15.8, 16.0, 16.1, 16.3, 16.5, 16.7, 16.8, 17.0, 17.2, 17.4, 17.5, 17.7, 17.9, 18.0, 18.2, 18.4, 18.6, 18.7, 18.9, 19.1, 19.2, 19.4, 19.6, 19.8, 19.9, 20.1, 20.3, 20.4, 20.6, 20.8, 20.9, 21.1, 21.3, 21.5, 21.6, 21.8, 22.0, 22.1, 22.3, 22.5, 22.6, 22.8, 23.0, 23.2, 23.3, 23.5, 23.7, 23.8, 24.0, 24.2, 24.4, 24.6, 24.7, 24.9, 25.1, 25.2, 25.

  5%|▌         | 216/4000 [06:24<1:46:20,  1.69s/it]

(D) Open the box.
GT: (D) Open the box.
Part  Acc: 50.00%
Total Acc: 81.02%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.9, 14.0, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.4, 15.4, 15.5, 15.5, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.1, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.0, 17.1, 17.1, 17.2, 17.3, 17.3, 17.4, 17.5, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 17.9, 18.0, 18.1, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.5, 18.6, 18.7, 18.7, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.2, 19.3, 19.3, 19.4, 19.5, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 19.9, 20.0, 20.1, 20.1, 20.2, 20.2, 20.3, 20.4, 20.4, 20.5, 20.5

  5%|▌         | 217/4000 [06:26<1:43:22,  1.64s/it]

(C) Close the door.
GT: (B) Close the laptop.
Part  Acc: 47.06%
Total Acc: 80.65%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9 seconds. Carefully 

  5%|▌         | 218/4000 [06:27<1:38:41,  1.57s/it]

(D) Take.
GT: (C) Put down.
Part  Acc: 44.44%
Total Acc: 80.28%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 11.7, 11.8, 12.0, 12.1, 12.3, 12.4, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.4, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.6, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.5, 18.7, 18.8, 19.0, 19.1, 19.3, 19.4, 19.6, 19.7, 19.9, 20.0, 20.2, 20.3, 20.5, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.5, 21.7, 21.8, 22.0, 22.1, 22.3, 22.4, 22.6, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.6, 23.8, 23.9, 24.1, 24.2, 24.4, 24.5, 24.7, 24.8, 25.0, 25.1, 25.3, 25.4, 25.6, 25.7, 25.9, 26.0, 26.2, 26.3, 26.5, 26.6, 26.8, 26.9, 27.1, 27.2, 27.4, 27.5, 27.7, 27.8, 28.0

  5%|▌         | 219/4000 [06:29<1:39:24,  1.58s/it]

(A) Open the door.
GT: (B) Sit at the table.
Part  Acc: 42.11%
Total Acc: 79.91%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.8, 12.9

  6%|▌         | 220/4000 [06:30<1:40:53,  1.60s/it]

(D) Put down the book.
GT: (C) Open the box.
Part  Acc: 40.00%
Total Acc: 79.55%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.1, 3.2, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.0, 4.1, 4.3, 4.4, 4.6, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.2, 6.4, 6.5, 6.6, 6.7, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 16.9, 17.1, 17.2, 17.3, 17.4, 17.6, 

  6%|▌         | 221/4000 [06:32<1:39:35,  1.58s/it]

(D) Take the box.
GT: (A) Wash the table.
Part  Acc: 38.10%
Total Acc: 79.19%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.8, 9.9, 10.0, 10.1, 10.2, 10.4, 10.6, 10.7, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.6, 13.7, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 15.0, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.3, 18.4, 18.6, 18.7, 18.8, 18.9, 19.1, 19.3, 19.4, 19.5, 19.7, 19.8, 19.9, 20.0, 20.1, 20.3, 20.5, 20.6, 20.7, 20.9, 21.0, 21.1, 21.2, 21.5, 21.6, 21.7, 21.8, 21.9, 22.1, 22.2, 22.3, 22.4, 22.7, 22.8, 22.9, 23.0, 23.2, 23.3, 23.4, 23.6, 23.8, 23.9, 

  6%|▌         | 222/4000 [06:33<1:38:41,  1.57s/it]

(C) Take the shoe.
GT: (C) Take the shoe.
Part  Acc: 40.91%
Total Acc: 79.28%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.7, 2.9, 3.1, 3.3, 3.5, 3.8, 4.0, 4.2, 4.4, 4.6, 4.8, 5.0, 5.2, 5.5, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.2, 7.4, 7.6, 7.8, 8.0, 8.2, 8.4, 8.6, 8.9, 9.1, 9.3, 9.5, 9.7, 10.0, 10.2, 10.4, 10.6, 10.8, 11.0, 11.2, 11.4, 11.7, 11.9, 12.1, 12.3, 12.5, 12.7, 12.9, 13.1, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.6, 14.8, 15.1, 15.3, 15.5, 15.7, 15.9, 16.1, 16.3, 16.5, 16.8, 17.0, 17.2, 17.4, 17.6, 17.9, 18.1, 18.3, 18.5, 18.7, 18.9, 19.1, 19.3, 19.6, 19.8, 20.0, 20.2, 20.4, 20.6, 20.8, 21.1, 21.3, 21.5, 21.7, 21.9, 22.1, 22.3, 22.5, 22.8, 23.0, 23.2, 23.4, 23.6, 23.8, 24.1, 24.3, 24.5, 24.7, 24.9, 25

  6%|▌         | 223/4000 [06:35<1:40:41,  1.60s/it]

(B) Put down the towel.
GT: (B) Put down the towel.
Part  Acc: 43.48%
Total Acc: 79.37%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.8, 7.0, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 8.0, 8.0, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.8, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.8, 11.0, 11.1, 11.2, 11.3, 11.4, 11.6, 11.6, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.6, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.6, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.4, 19.5, 19

  6%|▌         | 224/4000 [06:37<1:43:14,  1.64s/it]

(A) Open the laptop.
GT: (A) Open the laptop.
Part  Acc: 45.83%
Total Acc: 79.46%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18

  6%|▌         | 225/4000 [06:38<1:43:46,  1.65s/it]

(B) Put down the phone/camera.
GT: (B) Put down the phone/camera.
Part  Acc: 48.00%
Total Acc: 79.56%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 

  6%|▌         | 226/4000 [06:40<1:42:02,  1.62s/it]

(B) Put down the food.
GT: (D) Open the bag.
Part  Acc: 46.15%
Total Acc: 79.20%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.4, 16.5, 16.6, 16.8, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.6, 19.7, 19.8, 19.9

  6%|▌         | 227/4000 [06:42<1:44:26,  1.66s/it]

(A) Open the closet/cabinet.
GT: (C) Close the closet/cabinet.
Part  Acc: 44.44%
Total Acc: 78.85%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.9, 11.1, 11.3, 11.6, 11.8, 12.0, 12.2, 12.4, 12.6, 12.8, 13.0, 13.2, 13.4, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.2, 17.4, 17.6, 17.9, 18.1, 18.3, 18.5, 18.7, 18.9, 19.1, 19.3, 19.5, 19.8, 20.0, 20.2, 20.4, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.9, 22.1, 22.3, 22.5, 22.7, 22.9, 23.1, 23.3, 23.5, 23.7, 24.0, 24.2, 24.4, 24.6, 24.8, 25.0, 25.2, 25.4, 25.6, 25.8, 26.1, 26.3, 26.5, 26.7, 26.9, 27.1, 27.3, 27.5, 27.7, 27.9, 28.2, 28.4, 28.6, 28.8, 29.0, 29.2, 29.4, 29.6, 29.8, 30.0, 30.3, 30.5, 30.7, 30.9, 31.1, 31.3, 31.5, 31.7, 31.9, 32.2, 32.4, 32.6, 32.8, 33.0, 33.2, 33.4, 33.6, 33.8

  6%|▌         | 228/4000 [06:43<1:42:38,  1.63s/it]

(B) Open the refrigerator.
GT: (A) Close the refrigerator.
Part  Acc: 42.86%
Total Acc: 78.51%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.5, 0.5, 0.7, 0.8, 0.9, 1.0, 1.0, 1.2, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.5, 6.6, 6.8, 6.8, 7.0, 7.0, 7.2, 7.2, 7.4, 7.5, 7.5, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 

  6%|▌         | 229/4000 [06:45<1:39:31,  1.58s/it]

(B) Put down.
GT: (B) Put down.
Part  Acc: 44.83%
Total Acc: 78.60%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.1, 14.2, 14.4, 14.6, 14.7, 14.9, 15.0, 15.2, 15.4, 15.5, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.7, 17.8, 18.0, 18.2, 18.3, 18.5, 18.6, 18.8, 19.0, 19.1, 19.3, 19.5, 19.6, 19.8, 19.9, 20.1, 20.3, 20.4, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.6, 21.7, 21.9, 22.0, 22.2, 22.4, 22.5, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.7, 23.8, 24.0, 24.2, 24.3, 24.5, 24.6, 24.8, 25.0, 25.1, 25.3, 25.5, 25.6, 25.8, 25.9, 26.1, 26.3, 26.4, 26.6, 26.8, 26.9, 27.1, 27.2, 27.4, 27.6, 27.7, 27.9, 28.1, 28.2, 28.4, 28.5, 28.7, 28.9, 29.0, 29.2, 29.4, 29.5, 29.7, 29.8, 30.0, 30.2, 30.3, 30.5, 30.6, 30.8, 31.0, 31.1, 31.3, 31.5, 31.6, 31.8

  6%|▌         | 230/4000 [06:46<1:40:51,  1.61s/it]

(D) Open the door.
GT: (C) Lie on the sofa/couch.
Part  Acc: 43.33%
Total Acc: 78.26%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.5, 6.5, 6.7, 6.7, 6.9, 6.9, 7.0, 7.0, 7.2, 7.2, 7.4, 7.4, 7.5, 7.5, 7.7, 7.7, 7.8, 7.8, 7.8, 8.0, 8.0, 8.2, 8.2, 8.3, 8.3, 8.5, 8.5, 8.7, 8.7, 8.8, 8.8, 9.0, 9.0, 9.0, 9.2, 9.2, 9.3, 9.3, 9.5, 9.5, 9.6, 9.6, 9.8, 9.8, 10.0, 10.0, 10.1, 10.1, 10.1, 10.3, 10.3, 10.5, 10.5, 10.6, 10.6, 10.8, 10.8, 11.0, 11.0, 11.1, 11.1, 11.3, 11.3, 11.4, 11.4, 11.4, 11.6, 11.6, 11.8, 11.8, 11.9, 11.9, 12.1, 12.1, 12.3, 12.3, 12.4, 12.4, 12.6, 12.6, 12.7, 12.7, 12.7, 12.9, 12.9, 13.1, 13.1, 13.2, 13.2, 13.4, 13.4, 13.6, 13.6, 13.7, 13.7, 13.9, 13.9, 13.9, 14.1, 14.1, 14.2, 14.2, 14.4, 14.4, 14.5, 14.5, 14.7, 14.7, 14.9, 14.9, 15.0, 15.0, 15.0, 15.2, 15.2, 15.4, 15.4, 

  6%|▌         | 231/4000 [06:50<2:09:27,  2.06s/it]

(A) Put down the bag.
GT: (B) Put down the shoe.
Part  Acc: 41.94%
Total Acc: 77.92%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.4, 0.6, 0.8, 1.0, 1.2, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.6, 2.8, 3.0, 3.2, 3.4, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.8, 5.0, 5.2, 5.4, 5.6, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.0, 7.2, 7.4, 7.6, 7.8, 7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.2, 9.4, 9.6, 9.8, 10.0, 10.1, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.4, 11.6, 11.8, 12.0, 12.2, 12.3, 12.5, 12.7, 12.9, 13.1, 13.3, 13.5, 13.6, 13.8, 14.0, 14.2, 14.4, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.7, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.2, 17.3, 17.5, 17.7, 17.9, 18.1, 18.2, 18.4, 18.6, 18.8, 19.0, 19.2, 19.4, 19.5, 19.7, 19.9, 20.1, 20.3, 20.4, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.7

  6%|▌         | 232/4000 [06:51<2:06:24,  2.01s/it]

(C) Sit on the sofa/couch.
GT: (C) Sit on the sofa/couch.
Part  Acc: 43.75%
Total Acc: 78.02%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.2, 0.6, 1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 3.5, 3.9, 4.3, 4.7, 5.1, 5.5, 5.9, 6.3, 6.7, 7.2, 7.5, 8.0, 8.4, 8.8, 9.2, 9.6, 10.0, 10.4, 10.8, 11.2, 11.7, 12.1, 12.5, 12.9, 13.3, 13.7, 14.1, 14.5, 14.9, 15.4, 15.8, 16.2, 16.6, 17.0, 17.4, 17.8, 18.2, 18.7, 19.1, 19.4, 19.9, 20.3, 20.7, 21.1, 21.5, 21.9, 22.3, 22.7, 23.1, 23.6, 24.0, 24.4, 24.8, 25.2, 25.6, 26.0, 26.4, 26.9, 27.3, 27.7, 28.1, 28.5, 28.9, 29.3, 29.7, 30.1, 30.6, 30.9, 31.3, 31.8, 32.2, 32.6, 33.0, 33.4, 33.8, 34.2, 34.6, 35.1, 35.5, 35.9, 36.3, 36.7, 37.1, 37.5, 37.9, 38.3, 38.8, 39.2, 39.6, 40.0, 40.4, 40.8, 41.2, 41.6, 42.0, 42.5, 42.8, 43.3, 43.7, 44.1, 44.5, 44.9, 45.3, 45.7, 46.1, 46.5

  6%|▌         | 233/4000 [06:53<2:05:03,  1.99s/it]

(B) Put down the paper/notebook.
GT: (B) Put down the paper/notebook.
Part  Acc: 45.45%
Total Acc: 78.11%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2 seconds. Carefu

  6%|▌         | 234/4000 [06:55<1:55:29,  1.84s/it]

(A) Open the door.
GT: (B) Take the laptop.
Part  Acc: 44.12%
Total Acc: 77.78%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.5, 8.6, 8.8, 8.9, 9.1, 9.2, 9.4, 9.5, 9.7, 9.8, 10.0, 10.1, 10.3, 10.4, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.9, 15.0, 15.2, 15.3, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.8, 17.0, 17.1, 17.3, 17.4, 17.6, 17.7, 17.9, 18.0, 18.2, 18.3, 18.5, 18.6, 18.8, 18.9, 19.1, 19.2, 19.4, 19.5, 19.7, 19.8, 20.0, 20.1, 20.3, 20.4, 20.6, 20.7, 20.9, 21.0, 21.2, 21.3, 21.5, 21.6, 21.8, 21.9, 22.1, 22.2, 22.4, 22.5, 22.7, 22.8, 22.9, 23.1, 23.2, 23.4, 23.5, 23.7, 23.8, 24.0, 24.1, 24.3, 24.4, 24.6, 24.7, 24.9, 25

  6%|▌         | 235/4000 [06:56<1:50:54,  1.77s/it]

(C) Open the bag.
GT: (C) Open the bag.
Part  Acc: 45.71%
Total Acc: 77.87%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8 seconds.

  6%|▌         | 236/4000 [06:58<1:46:42,  1.70s/it]

(B) Open the bag.
GT: (A) Sit on the floor.
Part  Acc: 44.44%
Total Acc: 77.54%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 1

  6%|▌         | 237/4000 [07:00<1:45:46,  1.69s/it]

(A) Close the laptop.
GT: (A) Close the laptop.
Part  Acc: 45.95%
Total Acc: 77.64%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 19.7, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.4, 20.5, 20.5, 20.6, 20.7, 20.8, 20.9, 20.9, 21.0, 21.1, 21.2, 21.3, 21.3, 21.4, 21.5, 21.6, 21.7, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.2, 22.3, 22.4, 22.5, 22.5, 22.6, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.3, 23.3, 23.4, 23.5, 23.6, 23.7, 23.7, 23.8, 23.9, 24.0, 24.1, 24.1, 24.2, 24.3, 24.4, 24.5, 24.5, 24.6, 24.7, 24.8, 24.9, 24.9, 25.0, 25.1, 25.2, 25.3, 25.3, 25.4, 25.5, 25.6, 25.7, 25.7, 25.8, 25.9, 26.0, 26.1, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.6, 26.7, 26.8, 26.9, 27.0, 27.0, 27.1, 27.2, 27.3, 27.4, 27.4, 27.5, 27.6, 27.7, 27.8, 27.8, 27.9, 28.0, 28.1, 28.2, 28.2, 28.3, 28.4, 28.5

  6%|▌         | 238/4000 [07:01<1:43:42,  1.65s/it]

(D) Open the window.
GT: (C) Close the window.
Part  Acc: 44.74%
Total Acc: 77.31%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 11.0, 11.1, 11.1, 11.2, 11.3, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.7, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.2

  6%|▌         | 239/4000 [07:03<1:42:17,  1.63s/it]

(A) Throw the bag.
GT: (A) Throw the bag.
Part  Acc: 46.15%
Total Acc: 77.41%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.6, 11.7, 11

  6%|▌         | 240/4000 [07:04<1:41:19,  1.62s/it]

(B) Tidy up the blanket.
GT: (C) Take the pillow.
Part  Acc: 45.00%
Total Acc: 77.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.5, 4.6, 4.8, 5.0, 5.2, 5.4, 5.5, 5.7, 5.9, 6.1, 6.2, 6.4, 6.6, 6.8, 6.9, 7.1, 7.3, 7.5, 7.6, 7.8, 8.0, 8.2, 8.4, 8.5, 8.7, 8.9, 9.1, 9.2, 9.4, 9.6, 9.8, 9.9, 10.1, 10.3, 10.5, 10.7, 10.8, 11.0, 11.2, 11.4, 11.5, 11.7, 11.9, 12.1, 12.2, 12.4, 12.6, 12.8, 13.0, 13.1, 13.3, 13.5, 13.7, 13.8, 14.0, 14.2, 14.4, 14.5, 14.7, 14.9, 15.1, 15.3, 15.4, 15.6, 15.8, 16.0, 16.1, 16.3, 16.5, 16.7, 16.9, 17.1, 17.2, 17.4, 17.6, 17.8, 17.9, 18.1, 18.3, 18.5, 18.6, 18.8, 19.0, 19.2, 19.4, 19.5, 19.7, 19.9, 20.1, 20.2, 20.4, 20.6, 20.8, 20.9, 21.1, 21.3, 21.5, 21.7, 21.8, 22.0, 22.2, 22.4, 22.5, 22.7, 22.9, 23.1, 23.2, 23.4, 23.6, 23.8, 24.0, 24.1, 24.3, 24.5, 24.7, 

  6%|▌         | 241/4000 [07:06<1:41:35,  1.62s/it]

(B) Take the shoe.
GT: (B) Take the shoe.
Part  Acc: 46.34%
Total Acc: 77.18%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.8, 12.9, 

  6%|▌         | 242/4000 [07:08<1:39:08,  1.58s/it]

(D) Put down the box.
GT: (A) Throw the blanket.
Part  Acc: 45.24%
Total Acc: 76.86%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.

  6%|▌         | 243/4000 [07:09<1:40:35,  1.61s/it]

(C) Throw the pillow.
GT: (C) Throw the pillow.
Part  Acc: 46.51%
Total Acc: 76.95%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.8, 17.0, 17.1, 17.2, 17.3, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.5, 18.6, 18.7, 18.8, 19.0, 19.1, 19.2, 19.3, 19.5, 19.6, 19.7, 19.8, 20.0, 20.1, 20.2, 20.4, 20.5, 20.6, 20.7, 20.9, 21.0, 21.1, 21.2, 21.4, 21.5, 21.6, 21.7, 21.9, 22.0, 22.1, 22.2, 22.4, 22.5, 22.6, 22.7, 22.9, 23.0, 23.1, 23.2, 23.4, 23.5, 23.6,

  6%|▌         | 244/4000 [07:11<1:49:51,  1.75s/it]

(D) Put down the cup/glass/bottle.
GT: (D) Put down the cup/glass/bottle.
Part  Acc: 47.73%
Total Acc: 77.05%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.7, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.1, 18.3, 18.4, 18.5, 18.7, 18.8, 18.9, 19.0, 19.2, 19.3, 19.4, 19.6, 19.7, 19.8, 20.0, 20.1, 20.2, 20.3, 20.5, 20.6, 20.7, 20.9, 21.0, 21.1, 21.3, 21.4, 21.5, 21.7, 21.8, 21.9, 22.1, 22.2, 22.3, 22.4, 22.6, 22.7, 22.8, 23.0, 23.1, 23.2, 23.4, 23.5, 23.6, 23.8, 23.9, 24.0, 24.1, 24.3, 24.4, 24.5, 24.7, 24.8, 24.9, 25.0, 25.2, 25.3, 25.4, 25.6, 25.7, 25.8, 25.9, 26.1, 26.2, 26.3, 26.5, 26.6, 26.7, 26.8, 27.0, 27.1, 27.2, 27.4, 27.5, 27.6, 27.8, 27.9

  6%|▌         | 245/4000 [07:13<1:48:32,  1.73s/it]

(A) Put down.
GT: (A) Put down.
Part  Acc: 48.89%
Total Acc: 77.14%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 

  6%|▌         | 246/4000 [07:14<1:42:05,  1.63s/it]

(D) Take.
GT: (B) Tidy up.
Part  Acc: 47.83%
Total Acc: 76.83%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.3, 2.4, 2.5, 2.7, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.5, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.9, 16

  6%|▌         | 247/4000 [07:16<1:46:57,  1.71s/it]

(D) Lie on the sofa/couch.
GT: (D) Lie on the sofa/couch.
Part  Acc: 48.94%
Total Acc: 76.92%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.7, 7.8, 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.2, 8.3, 8.4, 8.4, 8.5, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.1 seconds. Carefu

  6%|▌         | 248/4000 [07:18<1:43:16,  1.65s/it]

(C) Put down the blanket.
GT: (C) Put down the blanket.
Part  Acc: 50.00%
Total Acc: 77.02%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0, 1.1, 1.2, 1.4, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.2, 2.3, 2.5, 2.6, 2.7, 2.8, 2.9, 3.1, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 14.9, 15

  6%|▌         | 249/4000 [07:19<1:42:30,  1.64s/it]

(A) Close the window.
GT: (A) Close the window.
Part  Acc: 51.02%
Total Acc: 77.11%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.2, 1

  6%|▋         | 250/4000 [07:21<1:43:10,  1.65s/it]

(A) Lie on the sofa/couch.
GT: (A) Lie on the sofa/couch.
Part  Acc: 52.00%
Total Acc: 77.20%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0, 1.2, 1.3, 1.5, 1.7, 1.8, 2.0, 2.1, 2.3, 2.5, 2.6, 2.8, 2.9, 3.1, 3.3, 3.4, 3.6, 3.7, 3.9, 4.1, 4.2, 4.4, 4.5, 4.7, 4.9, 5.0, 5.2, 5.3, 5.5, 5.7, 5.8, 6.0, 6.1, 6.3, 6.5, 6.6, 6.8, 6.9, 7.1, 7.3, 7.4, 7.6, 7.7, 7.9, 8.1, 8.2, 8.4, 8.5, 8.7, 8.9, 9.0, 9.2, 9.3, 9.5, 9.7, 9.8, 10.0, 10.1, 10.3, 10.5, 10.6, 10.8, 10.9, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.1, 12.2, 12.4, 12.5, 12.7, 12.9, 13.0, 13.2, 13.3, 13.5, 13.7, 13.8, 14.0, 14.1, 14.3, 14.5, 14.6, 14.8, 14.9, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.7, 17.8, 18.0, 18.1, 18.3, 18.5, 18.6, 18.8, 18.9, 19.1, 

  6%|▋         | 251/4000 [07:23<1:43:12,  1.65s/it]

(A) Take the picture.
GT: (A) Take the picture.
Part  Acc: 52.94%
Total Acc: 77.29%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.6, 13.7, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.5, 14.6, 14.6, 14.7, 14.8, 14.8, 14.9, 14.9, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.3, 15.4, 15.4, 15.5, 15.6, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9, 15.9, 16.0, 16.1, 16.1, 16.2, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.5, 16.6, 16.7, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.3, 17.3, 17.4, 17.5, 17.5, 17.6, 17.6, 17.7, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.1, 18.2, 18.2, 18.3, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.7, 18.7, 18.8, 18.8, 18.9, 18.9, 19.0, 19.1, 19.1, 19.2, 19.2, 19.3, 19.3, 19.4, 19.4, 19.5, 19.5, 19.6, 19.7

  6%|▋         | 252/4000 [07:24<1:39:50,  1.60s/it]

(C) Throw the book.
GT: (C) Throw the book.
Part  Acc: 53.85%
Total Acc: 77.38%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.4, 2.6, 2.8, 3.0, 3.2, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.4, 4.6, 4.8, 4.9, 5.1, 5.3, 5.5, 5.7, 5.9, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.1, 7.3, 7.5, 7.7, 7.9, 8.0, 8.2, 8.4, 8.6, 8.8, 9.0, 9.1, 9.3, 9.5, 9.6, 9.9, 10.0, 10.2, 10.4, 10.6, 10.7, 10.9, 11.1, 11.3, 11.5, 11.7, 11.8, 12.0, 12.2, 12.4, 12.6, 12.7, 12.9, 13.1, 13.3, 13.5, 13.7, 13.8, 14.0, 14.2, 14.4, 14.6, 14.7, 14.9, 15.1, 15.3, 15.4, 15.6, 15.8, 16.0, 16.2, 16.3, 16.5, 16.7, 16.9, 17.1, 17.3, 17.4, 17.6, 17.8, 18.0, 18.2, 18.4, 18.5, 18.7, 18.9, 19.1, 19.3, 19.4, 19.6, 19.8, 20.0, 20.1, 20.4, 20.5, 20.7, 20.9, 21.0, 21.2, 21.4,

  6%|▋         | 253/4000 [07:26<1:41:57,  1.63s/it]

(A) Take the pillow.
GT: (C) Close the window.
Part  Acc: 52.83%
Total Acc: 77.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6 seconds. Carefully 

  6%|▋         | 254/4000 [07:29<2:15:31,  2.17s/it]

(D) Open the book.
GT: (A) Throw the towel.
Part  Acc: 51.85%
Total Acc: 76.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.9, 1.1, 1.3, 1.5, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.4, 4.6, 4.8, 5.0, 5.2, 5.4, 5.6, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.2, 7.4, 7.6, 7.8, 8.0, 8.2, 8.4, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 10.0, 10.2, 10.4, 10.6, 10.8, 11.0, 11.2, 11.5, 11.7, 11.9, 12.1, 12.3, 12.5, 12.8, 13.0, 13.2, 13.4, 13.6, 13.8, 14.1, 14.3, 14.5, 14.7, 14.9, 15.1, 15.3, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.9, 17.1, 17.3, 17.5, 17.8, 18.0, 18.2, 18.4, 18.6, 18.8, 19.0, 19.3, 19.5, 19.7, 19.9, 20.1, 20.3, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.8, 22.1, 22.3, 22.5, 22.7, 22.9, 23.1, 23.4, 23.6, 23.8, 24.0, 24.2, 24.4, 24.7, 24.9, 25.1, 25.3, 25.5, 25.7, 25.9

  6%|▋         | 255/4000 [07:31<2:07:46,  2.05s/it]

(A) Put down the laptop.
GT: (A) Put down the laptop.
Part  Acc: 52.73%
Total Acc: 76.86%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.2, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.0, 8.2, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.2, 9.4, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 11.9, 12.0

  6%|▋         | 256/4000 [07:33<1:57:02,  1.88s/it]

(A) Take the box.
GT: (A) Take the box.
Part  Acc: 53.57%
Total Acc: 76.95%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1.1, 1.3, 1.4, 1.6, 1.8, 1.9, 2.1, 2.3, 2.5, 2.6, 2.8, 3.0, 3.1, 3.3, 3.5, 3.7, 3.8, 4.0, 4.2, 4.3, 4.5, 4.7, 4.9, 5.0, 5.2, 5.4, 5.5, 5.7, 5.9, 6.1, 6.2, 6.4, 6.6, 6.7, 6.9, 7.1, 7.3, 7.4, 7.6, 7.8, 7.9, 8.1, 8.3, 8.5, 8.6, 8.8, 9.0, 9.1, 9.3, 9.5, 9.6, 9.8, 10.0, 10.2, 10.3, 10.5, 10.7, 10.8, 11.0, 11.2, 11.4, 11.5, 11.7, 11.9, 12.0, 12.2, 12.4, 12.6, 12.7, 12.9, 13.1, 13.2, 13.4, 13.6, 13.8, 13.9, 14.1, 14.3, 14.4, 14.6, 14.8, 15.0, 15.1, 15.3, 15.5, 15.6, 15.8, 16.0, 16.1, 16.3, 16.5, 16.7, 16.9, 17.0, 17.2, 17.4, 17.6, 17.7, 17.9, 18.1, 18.2, 18.4, 18.6, 18.7, 18.9, 19.1, 19.3, 19.4, 19.6, 19.8, 19.9, 20.1, 20.3, 20

  6%|▋         | 257/4000 [07:34<1:56:20,  1.86s/it]

(A) Tidy up the clothes.
GT: (D) Throw the pillow.
Part  Acc: 52.63%
Total Acc: 76.65%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.2, 17.3, 17.4, 17.5, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.5, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.4, 20.5, 20.5, 20.6, 20.7, 20.8, 20.9, 21.1, 21.2, 21.3, 21.4, 21.5, 21.5, 21.6, 21.7, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.5, 23.5, 23.6, 23.7, 23.8, 23.9, 24.0, 24.2, 24.3, 24.4, 24.5, 24.5, 24.6, 24.7, 24.8

  6%|▋         | 258/4000 [07:36<1:51:57,  1.80s/it]

(A) Eat the medicine.
GT: (A) Eat the medicine.
Part  Acc: 53.45%
Total Acc: 76.74%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0 seconds. Carefully

  6%|▋         | 259/4000 [07:38<1:46:10,  1.70s/it]

(C) Close the door.
GT: (B) Take the phone/camera.
Part  Acc: 52.54%
Total Acc: 76.45%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3 seconds. Carefully 

  6%|▋         | 260/4000 [07:39<1:42:07,  1.64s/it]

(D) Take the food.
GT: (B) Take the bag.
Part  Acc: 51.67%
Total Acc: 76.15%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.6, 15.6, 15.7, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.3, 16.3, 16.4, 16.5, 16.5, 16.7, 16.7, 16.8, 16.9, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.4, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 18.0, 18.1, 18.1, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.8, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.3, 19.3, 19.4, 19.5, 19.5, 19.7, 19.7, 19.8, 19.9, 19.9, 20.0, 20.1, 20.2, 20.2, 20.3, 20.4, 20.4, 20.6, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.1, 21.1, 21.2, 21.3, 21.4, 21.5, 21.5, 21.6, 21.7, 21.8, 21.8, 21.9, 22.0, 22.0, 22.1, 22.2, 22.3, 22.4, 22.4, 22.5, 22.6, 22.7, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.2, 23.2, 23.3, 23.4, 23.5, 23.6, 23.6, 23.7, 23.8

  7%|▋         | 261/4000 [07:41<1:42:00,  1.64s/it]

(C) Put down the towel.
GT: (C) Put down the towel.
Part  Acc: 52.46%
Total Acc: 76.25%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 26.0, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.7, 26.8, 26.9, 27.0, 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.8, 27.9, 28.0, 28.1, 28.2, 28.3, 28.4, 28.5, 28.6, 28.7, 28.8, 28.9, 29.0, 29.1, 29.2, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.9, 30.0, 30.1, 30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.8, 30.9, 31.0, 31.1, 31.2, 31.3, 31.4, 31.5, 31.6, 31.7, 31.8, 31.9, 32.0, 32.1, 32.2, 32.3, 32.5, 32.6, 32.7, 32.8, 32.9, 33.0, 33.1, 33.2, 33.3, 33.4, 33.5, 33.6, 33.7, 33.8, 33.9, 34.0, 34.1, 34.2, 34.3, 34.4, 34.5, 34.6, 34.7, 34.8, 34.9, 35.0, 35.1, 35.2, 35.3, 35.4, 35.5, 35.6, 35.7, 35.8, 35.9, 36.0, 36.1, 36.2, 36.3, 36.4, 36.5, 36.6, 36.7, 36.8, 36.9, 37.1

  7%|▋         | 262/4000 [07:42<1:40:25,  1.61s/it]

(D) Take the blanket.
GT: (D) Take the blanket.
Part  Acc: 53.23%
Total Acc: 76.34%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.3, 6.4, 6.6, 6.7, 6.9, 7.0, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.8, 8.9, 9.1, 9.2, 9.4, 9.5, 9.7, 9.8, 10.0, 10.2, 10.3, 10.5, 10.6, 10.8, 10.9, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.5, 12.7, 12.8, 13.0, 13.1, 13.3, 13.4, 13.6, 13.7, 13.9, 14.1, 14.2, 14.4, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.9, 17.0, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.3, 18.4, 18.6, 18.7, 18.9, 19.0, 19.2, 19.3, 19.5, 19.7, 19.8, 20.0, 20.1, 20.3, 20.4, 20.6, 20.7, 20.9, 21.1, 21.2, 21.4, 21.5, 21.7, 21.8, 22.0, 22.2, 22.3, 22.5, 22.6, 22.8, 22.9, 23.1, 23.2, 23.4, 23.6, 23.7, 23.9

  7%|▋         | 263/4000 [07:44<1:43:13,  1.66s/it]

(C) Take the book.
GT: (B) Take the box.
Part  Acc: 52.38%
Total Acc: 76.05%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.1, 2.2, 2.4, 2.5, 2.6, 2.7, 2.9, 3.0, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.3, 4.4, 4.6, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.7, 7.7, 7.9, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.9, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.

  7%|▋         | 264/4000 [07:46<1:43:49,  1.67s/it]

(A) Put down the book.
GT: (A) Put down the book.
Part  Acc: 53.12%
Total Acc: 76.14%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.5, 2.6, 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.4, 15.6

  7%|▋         | 265/4000 [07:47<1:42:22,  1.64s/it]

(A) Take the phone/camera.
GT: (C) Take the towel.
Part  Acc: 52.31%
Total Acc: 75.85%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.

  7%|▋         | 266/4000 [07:49<1:44:32,  1.68s/it]

(D) Eat the sandwich.
GT: (D) Eat the sandwich.
Part  Acc: 53.03%
Total Acc: 75.94%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.7, 9.7, 9.8, 9.9 seconds. Carefully 

  7%|▋         | 267/4000 [07:51<1:45:24,  1.69s/it]

(D) Open the closet/cabinet.
GT: (D) Open the closet/cabinet.
Part  Acc: 53.73%
Total Acc: 76.03%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.4, 1.5, 1.6, 1.8, 1.9, 2.1, 2.2, 2.3, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.4, 6.5, 6.6, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.6, 12.7, 12.9, 13.0, 13.1, 13.3, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.4, 

  7%|▋         | 268/4000 [07:52<1:45:10,  1.69s/it]

(C) Put down the towel.
GT: (C) Put down the towel.
Part  Acc: 54.41%
Total Acc: 76.12%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.2, 7.3, 7.3, 7.5, 7.7, 7.7, 7.8, 8.0, 8.2, 8.2, 8.3, 8.5, 8.5, 8.7, 8.8, 8.8, 9.0, 9.1, 9.1, 9.3, 9.5, 9.5, 9.6, 9.8, 9.8, 10.0, 10.1, 10.3, 10.3, 10.4, 10.6, 10.6, 10.8, 10.9, 10.9, 11.1, 11.3, 11.3, 11.4, 11.6, 11.8, 11.8, 11.9, 12.1, 12.1, 12.2, 12.4, 12.4, 12.6, 12.7, 12.7, 12.9, 13.1, 13.1, 13.2, 13.4, 13.4, 13.6, 13.7, 13.9, 13.9, 14.0, 14.2, 14.2, 14.4, 14.5, 14.5, 14.7, 14.9, 14.9, 15.0, 15.2, 15.3, 15.3, 15.5, 15.7, 15.7, 15.8, 16.0, 16.0, 16.2, 16.3, 16.3, 16.5, 16.7, 16.7, 16.8, 17.0, 17.0, 17.1, 17.3, 17.5, 17.5, 17.6, 17.8, 17.8, 18.0, 18.1, 18.1, 18.3, 18.4, 18.4, 18.6, 18.8, 18.9, 18.9, 19.1, 19.3, 19.3, 19.4, 19.6, 19.6, 19.8, 19.9,

  7%|▋         | 269/4000 [07:56<2:27:16,  2.37s/it]

(A) Open the closet/cabinet.
GT: (D) Put down the shoe.
Part  Acc: 53.62%
Total Acc: 75.84%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4, 2.5, 2.7, 2.8, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.8, 14.

  7%|▋         | 270/4000 [07:58<2:09:31,  2.08s/it]

(D) Take the bag.
GT: (D) Take the bag.
Part  Acc: 54.29%
Total Acc: 75.93%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 6.9 seconds. Carefully 

  7%|▋         | 271/4000 [07:59<1:57:04,  1.88s/it]

(A) Put down.
GT: (A) Put down.
Part  Acc: 54.93%
Total Acc: 76.01%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.2, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.4, 1.6, 1.7, 1.8, 1.9, 1.9, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.3, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.1, 5.2, 5.3, 5.4, 5.4, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.6, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.6, 8.7, 8.8, 8.8, 8.9, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.9, 11.0, 11.0, 11.1, 11.2, 11.3, 11.5, 11.5, 11.6, 11.7, 11.8, 11.8, 11

  7%|▋         | 272/4000 [08:02<2:17:42,  2.22s/it]

(D) Put down the towel.
GT: (D) Put down the towel.
Part  Acc: 55.56%
Total Acc: 76.10%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.7, 1.9, 2.2, 2.4, 2.7, 3.0, 3.2, 3.5, 3.7, 3.9, 4.2, 4.5, 4.7, 5.0, 5.2, 5.4, 5.7, 5.9, 6.2, 6.5, 6.7, 7.0, 7.2, 7.4, 7.7, 8.0, 8.2, 8.5, 8.7, 8.9, 9.2, 9.4, 9.7, 10.0, 10.2, 10.5, 10.7, 10.9, 11.2, 11.5, 11.7, 12.0, 12.2, 12.4, 12.7, 13.0, 13.2, 13.5, 13.7, 14.0, 14.2, 14.4, 14.7, 15.0, 15.2, 15.5, 15.7, 15.9, 16.2, 16.5, 16.7, 17.0, 17.2, 17.5, 17.7, 17.9, 18.2, 18.5, 18.7, 19.0, 19.2, 19.5, 19.7, 20.0, 20.2, 20.5, 20.7, 21.0, 21.2, 21.4, 21.7, 22.0, 22.2, 22.5, 22.7, 23.0, 23.2, 23.5, 23.7, 24.0, 24.2, 24.5, 24.7, 24.9, 25.2, 25.5, 25.7, 26.0, 26.2, 26.5, 26.7, 27.0, 27.2, 27.5, 27.7, 28.0, 28.2, 28.4, 28.7, 29.0, 29.2, 29.5, 29.7, 30.0, 30.2, 3

  7%|▋         | 273/4000 [08:04<2:04:17,  2.00s/it]

(B) Put down.
GT: (B) Put down.
Part  Acc: 56.16%
Total Acc: 76.19%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9

  7%|▋         | 274/4000 [08:05<1:57:13,  1.89s/it]

(B) Tidy up the clothes.
GT: (C) Take the box.
Part  Acc: 55.41%
Total Acc: 75.91%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12

  7%|▋         | 275/4000 [08:07<1:51:20,  1.79s/it]

(D) Put down the bag.
GT: (C) Put down the blanket.
Part  Acc: 54.67%
Total Acc: 75.64%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.6, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 1

  7%|▋         | 276/4000 [08:08<1:43:43,  1.67s/it]

(B) Take.
GT: (D) Put down.
Part  Acc: 53.95%
Total Acc: 75.36%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.4, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.7, 14.8

  7%|▋         | 277/4000 [08:10<1:40:55,  1.63s/it]

(D) Put down the bag.
GT: (C) Close the refrigerator.
Part  Acc: 53.25%
Total Acc: 75.09%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.7, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.3, 2.5, 2.7, 2.9, 3.1, 3.2, 3.4, 3.6, 3.8, 4.0, 4.2, 4.4, 4.6, 4.7, 4.9, 5.1, 5.3, 5.5, 5.7, 5.8, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.1, 7.3, 7.5, 7.7, 7.9, 8.1, 8.2, 8.4, 8.6, 8.8, 9.0, 9.2, 9.4, 9.6, 9.7, 9.9, 10.1, 10.3, 10.5, 10.6, 10.8, 11.0, 11.2, 11.4, 11.6, 11.8, 12.0, 12.1, 12.3, 12.5, 12.7, 12.9, 13.0, 13.2, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.6, 14.7, 14.9, 15.1, 15.3, 15.5, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.1, 17.3, 17.5, 17.7, 17.9, 18.0, 18.2, 18.4, 18.6, 18.8, 19.0, 19.2, 19.4, 19.5, 19.7, 19.9, 20.1, 20.3, 20.5, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.8, 21.9, 22.1, 22

  7%|▋         | 278/4000 [08:12<1:42:25,  1.65s/it]

(C) Sit at.
GT: (C) Sit at.
Part  Acc: 53.85%
Total Acc: 75.18%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.7, 0.7, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 3.0, 3.0, 3.2, 3.2, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 5.0, 5.0, 5.2, 5.2, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.5, 9.5, 9.7, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.5, 11.7, 11.7, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5

  7%|▋         | 279/4000 [08:13<1:39:59,  1.61s/it]

(A) Close the box.
GT: (A) Close the box.
Part  Acc: 54.43%
Total Acc: 75.27%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.2, 2.3, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.3, 4.4, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.4, 6.5, 6.5, 6.6, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.8, 10.9, 10.9, 11.0, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.

  7%|▋         | 280/4000 [08:15<1:40:27,  1.62s/it]

(D) Sit on the bed.
GT: (D) Sit on the bed.
Part  Acc: 55.00%
Total Acc: 75.36%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.4, 0.6, 0.8, 1.0, 1.2, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.6, 2.8, 3.0, 3.2, 3.4, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.8, 5.0, 5.2, 5.4, 5.6, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.0, 7.2, 7.4, 7.6, 7.8, 7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.2, 9.4, 9.6, 9.8, 10.0, 10.1, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.4, 11.6, 11.8, 12.0, 12.2, 12.3, 12.5, 12.7, 12.9, 13.1, 13.3, 13.5, 13.6, 13.8, 14.0, 14.2, 14.4, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.7, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.2, 17.3, 17.5, 17.7, 17.9, 18.1, 18.2, 18.4, 18.6, 18.8, 19.0, 19.2, 19.4, 19.5, 19.7, 19.9, 20.1, 20.3, 20.4, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.7

  7%|▋         | 281/4000 [08:16<1:40:59,  1.63s/it]

(D) Sit on.
GT: (D) Sit on.
Part  Acc: 55.56%
Total Acc: 75.44%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.2, 1.2, 1.2, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.8, 4.9, 5.0, 5.0, 5.0, 5.1, 5.2, 5.2, 5.2, 5.3, 5.3, 5.4, 5.5, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.8, 5.9, 6.0, 6.0, 6.0, 6.1, 6.2, 6.2, 6.2, 6.3, 6.4, 6.4 seconds. Carefully 

  7%|▋         | 282/4000 [08:18<1:39:52,  1.61s/it]

(D) Put down the box.
GT: (D) Put down the box.
Part  Acc: 56.10%
Total Acc: 75.53%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.5, 0.7, 0.8, 1.0, 1.1, 1.3, 1.4, 1.6, 1.7, 1.9, 2.0, 2.2, 2.3, 2.5, 2.6, 2.8, 2.9, 3.1, 3.2, 3.3, 3.5, 3.6, 3.8, 3.9, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.4, 5.6, 5.7, 5.9, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.1, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.2, 9.3, 9.5, 9.6, 9.8, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.4, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.6, 16.8, 17.0, 17.1, 17.3, 17.4, 17.6, 17.7, 17.9, 18.0,

  7%|▋         | 283/4000 [08:19<1:39:11,  1.60s/it]

(A) Close the refrigerator.
GT: (A) Close the refrigerator.
Part  Acc: 56.63%
Total Acc: 75.62%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.7, 7.8, 7.8, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12.2, 12.2, 12.3, 12.4, 12.4, 12.5, 12.6, 12.6, 12.7, 12.7, 12.8, 12.9, 12.9, 13.0, 13.1, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.8, 14.9, 14.9

  7%|▋         | 284/4000 [08:21<1:40:06,  1.62s/it]

(A) Open the book.
GT: (A) Open the book.
Part  Acc: 57.14%
Total Acc: 75.70%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.6, 11.7, 11.7, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 12.6, 12.7, 12.7, 12.8, 12.8, 12.9, 12.9, 13.0, 13.1, 13.1, 13.2, 13.2, 13.3, 13.3, 13.4, 13.4, 13.5, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.0, 14.1, 14.1, 14.2, 14.2, 14.3, 14.4, 14.4, 14.5, 14.5, 14.6, 14.6, 14.7, 14.7, 14.8, 14.8, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.3, 15.4, 15.4, 15.5, 15.6, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9, 15.9, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.1, 17.1, 17.2

  7%|▋         | 285/4000 [08:23<1:40:36,  1.62s/it]

(C) Put down the paper/notebook.
GT: (C) Put down the paper/notebook.
Part  Acc: 57.65%
Total Acc: 75.79%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.3, 17.5, 17.6, 17.7, 17.8, 17.9, 18.1, 18.1, 18.3, 18.4, 18.5, 18.7, 18.7, 18.9, 19.0, 19.1, 19.2, 19.3, 19.5, 19.6, 19.7, 19.8, 19.9, 20.1, 20.2, 20.3, 20.4, 20.5, 20.7, 20.7, 20.9, 21.0, 21.1, 21.3, 21.3, 21.5, 21.6, 21.7, 21.8, 21.9, 22.1, 22.2, 22.3, 22.4, 22.5, 22.7, 22.8, 22.8, 23.0, 23.1, 23.2, 23.3, 23.4, 23.6, 23.7, 23.8, 23.9, 24.0, 24.2, 24.3, 24.4, 24.5, 24.6, 24.8, 24.9, 25.0, 25.1, 25.2, 25.4, 25.4, 25.6, 25.7, 25.8, 25.9, 26.0, 26.2, 26.3, 26.4, 26.5, 26.6, 26.8, 26.9, 27.0, 27.1, 27.2, 27.4, 27.5, 27.6, 27.7, 27.8, 28.0, 28.0, 28.2, 28.3, 28.4, 28.5, 28.6, 28.8, 28.9, 29.0, 29.1, 29.2, 29.4, 29.5, 29.6, 29.7, 29.8, 30.0, 30.1, 30.2

  7%|▋         | 286/4000 [08:24<1:40:05,  1.62s/it]

(D) Open the laptop.
GT: (D) Open the laptop.
Part  Acc: 58.14%
Total Acc: 75.87%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.0, 10.0, 10.1, 10.1, 10.2, 10.2, 10.3, 10.3, 10.4, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.7, 10.8, 10.8, 10.9, 10.9, 10.9, 11.0, 11.0, 11.1, 11.1, 11.2, 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.6, 11.7, 11.7, 11.8, 11.8, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.4, 12.5, 12.5, 12.6, 12.6, 12.7, 12.7, 12.7, 12.8, 12.8, 12.9, 12.9, 13.0, 13.0, 13.1, 13.1, 13.2, 13.2, 13.3, 13.3, 13.4, 13.4, 13.5, 13.5, 13.6, 13.6, 13.6, 13.7, 13.7, 13.8, 13.8, 13.9, 13.9, 14.0, 14.0, 14.1, 14.1, 14.2, 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.5, 14.5, 14.6, 14.6, 14.7, 14.7, 14.8, 14.8, 14.9, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2

  7%|▋         | 287/4000 [08:26<1:36:34,  1.56s/it]

(B) Sit at.
GT: (B) Sit at.
Part  Acc: 58.62%
Total Acc: 75.96%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1

  7%|▋         | 288/4000 [08:28<1:40:10,  1.62s/it]

(A) Sit at the table.
GT: (D) Eat the sandwich.
Part  Acc: 57.95%
Total Acc: 75.69%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6 seconds. Carefully 

  7%|▋         | 289/4000 [08:29<1:40:53,  1.63s/it]

(D) Tidy up the blanket.
GT: (D) Tidy up the blanket.
Part  Acc: 58.43%
Total Acc: 75.78%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.5, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.1, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.4, 3.4, 3.5, 3.7, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.7, 4.8, 4.8, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.1, 6.1, 6.3, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.4, 7.4, 7.5, 7.7, 7.7, 7.9, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.7, 8.8, 8.8, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.3, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.4, 11.6, 11.6, 11.7, 11.

  7%|▋         | 290/4000 [08:31<1:39:41,  1.61s/it]

(D) Put down the towel.
GT: (D) Put down the towel.
Part  Acc: 58.89%
Total Acc: 75.86%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.1, 2.2, 2.3, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.9, 15.0, 

  7%|▋         | 291/4000 [08:33<1:42:27,  1.66s/it]

(A) Tidy up the clothes.
GT: (C) Take the laptop.
Part  Acc: 58.24%
Total Acc: 75.60%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 16.1, 16.2, 16.3, 16.5, 16.5, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.5, 18.5, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.8, 24.8, 25.0, 25.0, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 26.0, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.8, 26.8, 27.0, 27.0, 27.2, 27.2, 27.4, 27.5, 27.6

  7%|▋         | 292/4000 [08:34<1:39:04,  1.60s/it]

(C) Put down.
GT: (C) Put down.
Part  Acc: 58.70%
Total Acc: 75.68%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.4, 0.5, 0.6, 0.8, 0.9, 1.1, 1.2, 1.4, 1.5, 1.6, 1.8, 1.9, 2.1, 2.2, 2.3, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.5, 3.6, 3.7, 3.9, 4.0, 4.2, 4.3, 4.5, 4.6, 4.7, 4.9, 5.0, 5.2, 5.3, 5.4, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.6, 6.7, 6.8, 7.0, 7.1, 7.3, 7.4, 7.6, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.5, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.7, 9.8, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.4, 11.5, 11.6, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.1, 17.3, 1

  7%|▋         | 293/4000 [08:36<1:37:11,  1.57s/it]

(C) Throw the clothes.
GT: (C) Throw the clothes.
Part  Acc: 59.14%
Total Acc: 75.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 12.9, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.6, 13.7, 13.8, 13.8, 13.9, 14.0, 14.0, 14.1, 14.2, 14.2, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.8, 14.9, 14.9, 15.0, 15.1, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.6, 15.6, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 

  7%|▋         | 294/4000 [08:37<1:34:37,  1.53s/it]

(B) Open the bag.
GT: (C) Put down the blanket.
Part  Acc: 58.51%
Total Acc: 75.51%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.0, 4.2, 4.3, 4.4, 4.6, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.3, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.6, 15.8, 15.9, 16.0, 16.2, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.1, 17.2, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.2, 18.3, 18.4, 18.5, 18.7, 18.8, 18.9, 19.1, 19.2, 19

  7%|▋         | 295/4000 [08:39<1:35:19,  1.54s/it]

(B) Open the bag.
GT: (B) Open the bag.
Part  Acc: 58.95%
Total Acc: 75.59%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.9, 11.0, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.0, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.9, 13.0, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.6, 13.7, 13.7, 13.8, 13.9, 13.9, 14.0, 14.1, 14.1, 14.2, 14.3, 14.4, 14.4, 14.5, 14.6, 14.6, 14.7, 14.8, 14.9, 14.9, 15

  7%|▋         | 296/4000 [08:40<1:34:46,  1.54s/it]

(B) Take the book.
GT: (A) Take the pillow.
Part  Acc: 58.33%
Total Acc: 75.34%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.6, 4.8, 5.0, 5.2, 5.4, 5.6, 5.8, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.2, 7.5, 7.7, 7.9, 8.1, 8.3, 8.4, 8.6, 8.8, 9.0, 9.2, 9.4, 9.6, 9.8, 10.0, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.5, 11.7, 11.9, 12.1, 12.3, 12.5, 12.7, 12.9, 13.1, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.6, 14.8, 15.0, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.2, 17.4, 17.6, 17.8, 18.0, 18.2, 18.4, 18.6, 18.8, 19.1, 19.3, 19.5, 19.7, 19.9, 20.1, 20.3, 20.5, 20.7, 20.9, 21.1, 21.3, 21.5, 21.7, 22.0, 22.2, 22.4, 22.6, 22.8, 23.0, 23.2, 23.4, 23.6, 23.8, 24.0,

  7%|▋         | 297/4000 [08:42<1:36:04,  1.56s/it]

(A) Put down the book.
GT: (B) Eat the sandwich.
Part  Acc: 57.73%
Total Acc: 75.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.5, 0.7, 0.8, 1.0, 1.1, 1.3, 1.4, 1.6, 1.7, 1.9, 2.0, 2.2, 2.3, 2.5, 2.6, 2.8, 2.9, 3.1, 3.2, 3.3, 3.5, 3.6, 3.8, 3.9, 4.1, 4.2, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.4, 5.6, 5.7, 5.9, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.1, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.2, 9.3, 9.5, 9.6, 9.8, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.4, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.4, 17.5, 17.7, 17.8, 18.0,

  7%|▋         | 298/4000 [08:44<1:43:19,  1.67s/it]

(C) Take the cup/glass/bottle.
GT: (A) Take the sandwich.
Part  Acc: 57.14%
Total Acc: 74.83%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3 seconds. Carefully 

  7%|▋         | 299/4000 [08:45<1:39:33,  1.61s/it]

(D) Take the dish.
GT: (C) Eat the sandwich.
Part  Acc: 56.57%
Total Acc: 74.58%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.0, 3.2, 3.4, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.1, 5.3, 5.5, 5.7, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.2, 7.4, 7.6, 7.8, 8.0, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9, 10.1, 10.3, 10.6, 10.8, 11.0, 11.2, 11.4, 11.6, 11.8, 12.0, 12.2, 12.4, 12.6, 12.9, 13.1, 13.3, 13.5, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.9, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.9, 17.1, 17.3, 17.5, 17.7, 17.9, 18.1, 18.3, 18.5, 18.7, 19.0, 19.2, 19.4, 19.6, 19.8, 20.0, 20.2, 20.4, 20.6, 20.8, 21.0, 21.3, 21.5, 21.7, 21.9, 22.1, 22.3, 22.5, 22.7, 22.9, 23.1, 23.3, 23.6, 23.8, 24.0, 24.2, 24.4, 24.6

  8%|▊         | 300/4000 [08:47<1:46:23,  1.73s/it]

(D) Lie on the sofa/couch.
GT: (A) Close the laptop.
Part  Acc: 56.00%
Total Acc: 74.33%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.0, 15.1, 15.2, 15.3, 15.3, 15.4, 15.5, 15.5, 15.6, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.3, 16.3, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.9, 17.9, 18.0, 18.1, 18.1, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.7, 18.7, 18.8, 18.9, 18.9, 19.0, 19.1, 19.2, 19.2, 19.3, 19.4, 19.4, 19.5, 19.6, 19.7, 19.7, 19.8, 19.9, 20.0, 20.0, 20.1, 20.2, 20.2, 20.3, 20.4, 20.5, 20.5, 20.6, 20.7, 20.8, 20.8, 20.9, 21.0, 21.0, 21.1, 21.2, 21.3, 21.3, 21.4, 21.5, 21.5, 21.6, 21.7, 21.8, 21.8, 21.9, 22.0, 22.1, 22.1, 22.2, 22.3, 22.3, 22.4, 22.5, 22.6, 22.6, 22.7, 22.8, 22.9, 22.9

  8%|▊         | 301/4000 [08:49<1:45:45,  1.72s/it]

(A) Open the book.
GT: (A) Open the book.
Part  Acc: 56.44%
Total Acc: 74.42%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.2, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.7, 18.8, 18.9, 19.0, 19.1, 19.2, 19.4, 19.5, 19.6, 

  8%|▊         | 302/4000 [08:50<1:43:14,  1.67s/it]

(B) Open the refrigerator.
GT: (B) Open the refrigerator.
Part  Acc: 56.86%
Total Acc: 74.50%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1.1, 1.3, 1.5, 1.6, 1.8, 2.0, 2.2, 2.3, 2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.5, 3.7, 3.9, 4.1, 4.2, 4.4, 4.6, 4.8, 4.9, 5.1, 5.3, 5.5, 5.6, 5.8, 6.0, 6.1, 6.3, 6.5, 6.7, 6.8, 7.0, 7.2, 7.4, 7.5, 7.7, 7.9, 8.1, 8.2, 8.4, 8.6, 8.8, 8.9, 9.1, 9.3, 9.4, 9.6, 9.8, 10.0, 10.1, 10.3, 10.5, 10.7, 10.8, 11.0, 11.2, 11.4, 11.5, 11.7, 11.9, 12.1, 12.2, 12.4, 12.6, 12.7, 12.9, 13.1, 13.3, 13.4, 13.6, 13.8, 14.0, 14.1, 14.3, 14.5, 14.7, 14.8, 15.0, 15.2, 15.4, 15.5, 15.7, 15.9, 16.0, 16.2, 16.4, 16.6, 16.7, 16.9, 17.1, 17.3, 17.4, 17.6, 17.8, 18.0, 18.1, 18.3, 18.5, 18.7, 18.8, 19.0, 19.2, 19.3, 19.5, 19.7, 19.9, 20.0, 20.2, 20.4, 20.6, 2

  8%|▊         | 303/4000 [08:52<1:42:23,  1.66s/it]

(D) Open the laptop.
GT: (D) Open the laptop.
Part  Acc: 57.28%
Total Acc: 74.59%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.5, 8.6, 8.7, 8.7, 8.8, 8.8, 8.9, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.7, 9.8, 9.8, 9.9, 9.9, 10.0, 10.0, 10.1, 10.1, 10.2, 10.2, 10.3, 10.3, 10.4, 10.5, 10.5, 10.6, 10.6, 10.7, 10.7, 10.8, 10.8, 10.9, 10.9, 11.0, 11.0, 11.1, 11.1, 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 11.9, 12.0, 12.1, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.4, 12.5, 12.5, 12.6, 12.6, 12.7, 12.7, 12.8, 12.8, 12.9, 13.0, 13.0, 13.1, 13.1, 13.2, 13.2, 13.3, 13.3, 13.4, 13.4, 13.5, 13.5, 13.6, 13.6, 13.7, 13.7, 13.8, 13.9, 13.9, 14.

  8%|▊         | 304/4000 [08:54<1:42:47,  1.67s/it]

(C) Close the laptop.
GT: (C) Close the laptop.
Part  Acc: 57.69%
Total Acc: 74.67%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1 seconds. Carefully 

  8%|▊         | 305/4000 [08:55<1:40:39,  1.63s/it]

(C) Put down the box.
GT: (C) Put down the box.
Part  Acc: 58.10%
Total Acc: 74.75%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.9, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.7, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.8, 16.0, 16.1, 16.2, 16.4, 16.5, 16

  8%|▊         | 306/4000 [08:57<1:38:49,  1.61s/it]

(B) Take the book.
GT: (B) Take the book.
Part  Acc: 58.49%
Total Acc: 74.84%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.4, 0.5, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4, 1.5, 1.7, 1.8, 2.0, 2.1, 2.3, 2.4, 2.6, 2.7, 2.8, 3.0, 3.1, 3.3, 3.4, 3.6, 3.7, 3.9, 4.0, 4.2, 4.3, 4.5, 4.6, 4.8, 4.9, 5.0, 5.2, 5.3, 5.5, 5.6, 5.8, 5.9, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.8, 10.9, 11.0, 11.2, 11.3, 11.5, 11.6, 11.8, 11.9, 12.1, 12.2, 12.4, 12.5, 12.7, 12.8, 13.0, 13.1, 13.2, 13.4, 13.5, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.2, 16.3, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9

  8%|▊         | 307/4000 [08:58<1:40:15,  1.63s/it]

(A) Take the sandwich.
GT: (A) Take the sandwich.
Part  Acc: 58.88%
Total Acc: 74.92%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.6, 6.7, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.8, 10.9, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.7, 17.8, 18.0, 18.1, 18.2, 18.4, 18.5, 18.6, 18.8, 18.9, 19.0, 19.2, 19.3

  8%|▊         | 308/4000 [09:00<1:41:54,  1.66s/it]

(B) Open the refrigerator.
GT: (B) Open the refrigerator.
Part  Acc: 59.26%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.6, 7.7, 7.9, 8.0, 8.1, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.3, 12.4, 12.6, 12.7, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.2, 16.3, 16.4, 16.6, 16.7, 16.9, 17.0, 17.2, 17.3, 17.4, 17.6, 17.7, 17.9, 18.0, 18.2, 18.3, 18.5, 18.6, 18.7, 18.9, 19.0, 19.2, 19.3, 19.5, 19.6, 19.7, 19.9, 20.0, 20.2, 20.3, 20.5, 20.6, 20.7, 20.9, 21.0, 21.2, 21.3, 21.5, 21.6, 21.7, 21.9, 22.0, 22.2, 22.3, 22.5, 22.6, 22.7, 22.9, 23.0, 23.2, 23.3, 23.5, 23.

  8%|▊         | 309/4000 [09:02<1:40:25,  1.63s/it]

(B) Take the food.
GT: (B) Take the food.
Part  Acc: 59.63%
Total Acc: 75.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9,

  8%|▊         | 310/4000 [09:03<1:37:38,  1.59s/it]

(A) Open the box.
GT: (A) Open the box.
Part  Acc: 60.00%
Total Acc: 75.16%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0, 1.2, 1.3, 1.4, 1.5, 1.7, 1.8, 1.9, 2.0, 2.1, 2.3, 2.4, 2.5, 2.6, 2.7, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.6, 3.8, 3.9, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 1

  8%|▊         | 311/4000 [09:05<1:37:54,  1.59s/it]

(B) Throw the clothes.
GT: (B) Throw the clothes.
Part  Acc: 60.36%
Total Acc: 75.24%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.6, 1.9, 2.1, 2.4, 2.7, 2.9, 3.2, 3.4, 3.7, 3.9, 4.2, 4.4, 4.7, 5.0, 5.2, 5.4, 5.7, 6.0, 6.2, 6.5, 6.8, 7.0, 7.2, 7.5, 7.8, 8.0, 8.3, 8.6, 8.8, 9.0, 9.3, 9.6, 9.8, 10.1, 10.3, 10.6, 10.8, 11.1, 11.3, 11.6, 11.9, 12.1, 12.4, 12.6, 12.9, 13.1, 13.4, 13.7, 13.9, 14.2, 14.4, 14.7, 14.9, 15.2, 15.4, 15.7, 16.0, 16.2, 16.5, 16.7, 17.0, 17.2, 17.5, 17.8, 18.0, 18.3, 18.5, 18.8, 19.0, 19.3, 19.6, 19.8, 20.1, 20.3, 20.6, 20.8, 21.1, 21.3, 21.6, 21.9, 22.1, 22.4, 22.6, 22.9, 23.1, 23.4, 23.7, 23.9, 24.2, 24.4, 24.7, 24.9, 25.2, 25.4, 25.7, 26.0, 26.2, 26.5, 26.7, 27.0, 27.2, 27.5, 27.8, 28.0, 28.3, 28.5, 28.8, 29.0, 29.3, 29.6, 29.8, 30.1, 30.3, 30.6, 30.8, 3

  8%|▊         | 312/4000 [09:07<1:46:17,  1.73s/it]

(C) Tidy up the blanket.
GT: (D) Open the bag.
Part  Acc: 59.82%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.6, 1.7, 2.0, 2.2, 2.4, 2.6, 2.8, 3.1, 3.2, 3.5, 3.7, 3.9, 4.1, 4.3, 4.6, 4.7, 5.0, 5.2, 5.4, 5.6, 5.8, 6.1, 6.2, 6.5, 6.7, 6.9, 7.1, 7.3, 7.5, 7.8, 8.0, 8.2, 8.4, 8.6, 8.8, 9.0, 9.3, 9.4, 9.7, 9.9, 10.1, 10.3, 10.5, 10.8, 10.9, 11.2, 11.4, 11.6, 11.8, 12.0, 12.3, 12.4, 12.7, 12.9, 13.1, 13.3, 13.5, 13.8, 13.9, 14.2, 14.4, 14.6, 14.8, 15.0, 15.2, 15.4, 15.6, 15.9, 16.1, 16.3, 16.5, 16.7, 16.9, 17.1, 17.4, 17.5, 17.8, 18.0, 18.2, 18.4, 18.6, 18.9, 19.0, 19.3, 19.5, 19.7, 20.0, 20.1, 20.4, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.9, 22.1, 22.3, 22.5, 22.7, 22.9, 23.1, 23.3, 23.6, 23.8, 24.0, 24.2, 24.4, 24.6, 24.8, 25.

  8%|▊         | 313/4000 [09:08<1:43:51,  1.69s/it]

(C) Throw the clothes.
GT: (C) Throw the clothes.
Part  Acc: 60.18%
Total Acc: 75.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.1, 5.2, 5.4, 5.6, 5.8, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.2, 7.4, 7.6, 7.7, 7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9, 10.0, 10.2, 10.4, 10.6, 10.8, 11.0, 11.2, 11.4, 11.6, 11.8, 12.0, 12.2, 12.3, 12.5, 12.7, 12.9, 13.1, 13.3, 13.5, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.8, 15.0, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.1, 17.3, 17.5, 17.7, 17.9, 18.1, 18.3, 18.5, 18.7, 18.9, 19.1, 19.3, 19.5, 19.6, 19.8, 20.0, 20.2, 20.4, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.8, 21.9, 22.1, 22.3, 22.5, 22

  8%|▊         | 314/4000 [09:10<1:42:31,  1.67s/it]

(D) Take the food.
GT: (C) Throw the pillow.
Part  Acc: 59.65%
Total Acc: 74.84%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.3, 1.5, 1.7, 1.9, 2.1, 2.2, 2.4, 2.6, 2.8, 3.0, 3.1, 3.3, 3.5, 3.7, 3.9, 4.0, 4.2, 4.4, 4.6, 4.8, 4.9, 5.1, 5.3, 5.5, 5.7, 5.9, 6.0, 6.2, 6.4, 6.6, 6.8, 6.9, 7.1, 7.3, 7.5, 7.7, 7.8, 8.0, 8.2, 8.4, 8.6, 8.7, 8.9, 9.1, 9.3, 9.5, 9.6, 9.8, 10.0, 10.2, 10.4, 10.5, 10.7, 10.9, 11.1, 11.3, 11.5, 11.6, 11.8, 12.0, 12.2, 12.4, 12.5, 12.7, 12.9, 13.1, 13.3, 13.4, 13.6, 13.8, 14.0, 14.2, 14.3, 14.5, 14.7, 14.9, 15.1, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.3, 16.5, 16.7, 16.9, 17.1, 17.3, 17.4, 17.6, 17.8, 18.0, 18.2, 18.4, 18.5, 18.7, 18.9, 19.1, 19.3, 19.4, 19.6, 19.8, 20.0, 20.2, 20.3, 20.5, 20.7, 20.9, 21.1, 21.2, 21.4,

  8%|▊         | 315/4000 [09:12<1:43:53,  1.69s/it]

(A) Take the bag.
GT: (C) Throw the blanket.
Part  Acc: 59.13%
Total Acc: 74.60%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.2, 0.2, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9 seconds. Carefully 

  8%|▊         | 316/4000 [09:13<1:40:46,  1.64s/it]

(A) Put down the sandwich.
GT: (A) Put down the sandwich.
Part  Acc: 59.48%
Total Acc: 74.68%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0, 1.2, 1.3, 1.5, 1.7, 1.8, 2.0, 2.2, 2.3, 2.5, 2.6, 2.8, 3.0, 3.1, 3.3, 3.4, 3.6, 3.8, 3.9, 4.1, 4.3, 4.4, 4.6, 4.7, 4.9, 5.1, 5.2, 5.4, 5.5, 5.7, 5.9, 6.0, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.2, 7.3, 7.5, 7.6, 7.8, 8.0, 8.1, 8.3, 8.5, 8.6, 8.8, 8.9, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.5, 11.6, 11.8, 11.9, 12.1, 12.3, 12.4, 12.6, 12.8, 12.9, 13.1, 13.2, 13.4, 13.6, 13.7, 13.9, 14.0, 14.2, 14.4, 14.5, 14.7, 14.9, 15.0, 15.2, 15.3, 15.5, 15.7, 15.8, 16.0, 16.1, 16.3, 16.5, 16.6, 16.8, 17.0, 17.1, 17.3, 17.4, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.6, 18.7, 18.9, 19.1, 19.2, 

  8%|▊         | 317/4000 [09:15<1:42:25,  1.67s/it]

(B) Put down the phone/camera.
GT: (B) Put down the phone/camera.
Part  Acc: 59.83%
Total Acc: 74.76%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5, 0.7, 0.7, 0.8, 1.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.2, 2.3, 2.3, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.7, 3.7, 3.8, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.2, 5.3, 5.3, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.5, 6.7, 6.7, 6.9, 7.0, 7.0, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.9, 10.0, 10.0, 10.2, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.5, 11.7, 11.7, 11.8, 11

  8%|▊         | 318/4000 [09:17<1:39:09,  1.62s/it]

(D) Open the door.
GT: (D) Open the door.
Part  Acc: 60.17%
Total Acc: 74.84%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.7, 0.9, 1.1, 1.3, 1.4, 1.6, 1.8, 1.9, 2.1, 2.3, 2.5, 2.6, 2.8, 3.0, 3.1, 3.3, 3.5, 3.7, 3.8, 4.0, 4.2, 4.3, 4.5, 4.7, 4.9, 5.0, 5.2, 5.4, 5.5, 5.7, 5.9, 6.0, 6.2, 6.4, 6.6, 6.7, 6.9, 7.1, 7.2, 7.4, 7.6, 7.8, 7.9, 8.1, 8.3, 8.4, 8.6, 8.8, 8.9, 9.1, 9.3, 9.5, 9.6, 9.8, 10.0, 10.1, 10.3, 10.5, 10.7, 10.8, 11.0, 11.2, 11.3, 11.5, 11.7, 11.9, 12.0, 12.2, 12.4, 12.5, 12.7, 12.9, 13.1, 13.2, 13.4, 13.6, 13.7, 13.9, 14.1, 14.2, 14.4, 14.6, 14.8, 14.9, 15.1, 15.3, 15.4, 15.6, 15.8, 16.0, 16.1, 16.3, 16.5, 16.6, 16.8, 17.0, 17.2, 17.4, 17.5, 17.7, 17.9, 18.0, 18.2, 18.4, 18.6, 18.7, 18.9, 19.1, 19.2, 19.4, 19.6, 19.7, 19.9, 20.1, 20.3, 20

  8%|▊         | 319/4000 [09:18<1:39:47,  1.63s/it]

(A) Take the book.
GT: (A) Take the book.
Part  Acc: 60.50%
Total Acc: 74.92%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13

  8%|▊         | 320/4000 [09:20<1:38:54,  1.61s/it]

(A) Put down the shoe.
GT: (A) Put down the shoe.
Part  Acc: 60.83%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7 seconds. 

  8%|▊         | 321/4000 [09:21<1:37:16,  1.59s/it]

(C) Sit on.
GT: (C) Sit on.
Part  Acc: 61.16%
Total Acc: 75.08%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3 seconds. Carefully 

  8%|▊         | 322/4000 [09:23<1:34:56,  1.55s/it]

(B) Take the clothes.
GT: (B) Take the clothes.
Part  Acc: 61.48%
Total Acc: 75.16%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 15.7, 15.7, 15.8, 15.9, 15.9, 16.0, 16.0, 16.0, 16.1, 16.2, 16.2, 16.3, 16.3, 16.4, 16.5, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.1, 17.1, 17.1, 17.2, 17.2, 17.3, 17.4, 17.4, 17.5, 17.5, 17.6, 17.7, 17.7, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.1, 18.1, 18.2, 18.3, 18.3, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.7, 18.8, 18.8, 18.9, 18.9, 18.9, 19.0, 19.1, 19.1, 19.2, 19.2, 19.3, 19.4, 19.4, 19.5, 19.5, 19.5, 19.6, 19.7, 19.7, 19.8, 19.8, 19.9, 20.0, 20.0, 20.0, 20.1, 20.1, 20.2, 20.3, 20.3, 20.4, 20.4, 20.5, 20.6, 20.6, 20.6, 20.7, 20.7, 20.8, 20.9, 20.9, 21.0, 21.0, 21.1, 21.2, 21.2, 21.2, 21.3, 21.3, 21.4, 21.5, 21.5, 21.6, 21.6

  8%|▊         | 323/4000 [09:25<1:43:15,  1.68s/it]

(D) Take the paper/notebook.
GT: (D) Take the paper/notebook.
Part  Acc: 61.79%
Total Acc: 75.23%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 16.3, 16.4, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 17.0, 17.1, 17.1, 17.2, 17.3, 17.4, 17.4, 17.5, 17.6, 17.6, 17.7, 17.8, 17.9, 17.9, 18.0, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.6, 18.7, 18.7, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.3, 19.3, 19.4, 19.5, 19.5, 19.6, 19.7, 19.8, 19.8, 19.9, 20.0, 20.1, 20.1, 20.2, 20.3, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 20.9, 21.0, 21.1, 21.1, 21.2, 21.3, 21.4, 21.4, 21.5, 21.6, 21.7, 21.7, 21.8, 21.9, 22.0, 22.1, 22.1, 22.2, 22.3, 22.3, 22.4, 22.5, 22.6, 22.6, 22.7, 22.8, 22.9, 22.9, 23.0, 23.1, 23.1, 23.2, 23.3, 23.4, 23.4, 23.5, 23.6, 23.7, 23.7, 23.8, 23.9, 24.0, 24.0, 24.1, 24.2, 24.2, 24.3

  8%|▊         | 324/4000 [09:26<1:39:37,  1.63s/it]

(C) Take the food.
GT: (D) Throw the towel.
Part  Acc: 61.29%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.6, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 4.2, 4.4, 4.6, 4.8, 5.0, 5.2, 5.4, 5.6, 5.8, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.2, 7.4, 7.7, 7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9, 10.1, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.5, 11.7, 11.9, 12.1, 12.4, 12.6, 12.8, 13.0, 13.2, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.6, 14.8, 15.0, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.2, 17.5, 17.7, 17.9, 18.1, 18.3, 18.5, 18.7, 18.9, 19.1, 19.3, 19.5, 19.7, 19.9, 20.1, 20.3, 20.5, 20.7, 20.9, 21.1, 21.3, 21.5, 21.7, 21.9, 22.2, 22.4, 22.6, 22.8, 23.0, 23.2, 23.4, 23.6, 23.8, 24.0,

  8%|▊         | 325/4000 [09:28<1:42:12,  1.67s/it]

(D) Take.
GT: (C) Put down.
Part  Acc: 60.80%
Total Acc: 74.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.3, 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6 seconds. Carefully 

  8%|▊         | 326/4000 [09:30<1:39:10,  1.62s/it]

(A) Put down.
GT: (D) Tidy up.
Part  Acc: 60.32%
Total Acc: 74.54%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 13.9, 14.0, 14.1, 14.1, 14.1, 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.7, 14.8, 14.9, 14.9, 15.0, 15.0, 15.0, 15.1, 15.2, 15.2, 15.3, 15.3, 15.4, 15.4, 15.5, 15.6, 15.6, 15.6, 15.7, 15.8, 15.8, 15.9, 15.9, 16.0, 16.0, 16.1, 16.2, 16.2, 16.2, 16.3, 16.4, 16.4, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.8, 16.9, 17.0, 17.0, 17.1, 17.1, 17.2, 17.2, 17.3, 17.4, 17.4, 17.4, 17.5, 17.6, 17.6, 17.7, 17.7, 17.8, 17.8, 17.9, 18.0, 18.0, 18.0, 18.1, 18.2, 18.2, 18.3, 18.3, 18.4, 18.4, 18.5, 18.6, 18.6, 18.6, 18.7, 18.8, 18.8, 18.9, 18.9, 19.0, 19.0, 19.1, 19.2, 19.2, 19.2, 19.3, 19.4, 19.4, 19.5, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.8

  8%|▊         | 327/4000 [09:32<1:57:17,  1.92s/it]

(B) Put down the pillow.
GT: (B) Put down the pillow.
Part  Acc: 60.63%
Total Acc: 74.62%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.7, 0.8, 1.0, 1.3, 1.5, 1.6, 1.8, 2.0, 2.1, 2.4, 2.6, 2.8, 2.9, 3.1, 3.3, 3.6, 3.7, 3.9, 4.1, 4.2, 4.4, 4.7, 4.9, 5.1, 5.2, 5.4, 5.5, 5.9, 6.0, 6.2, 6.4, 6.5, 6.7, 7.0, 7.2, 7.3, 7.5, 7.7, 7.8, 8.1, 8.3, 8.5, 8.6, 8.8, 9.0, 9.3, 9.5, 9.6, 9.8, 9.9, 10.1, 10.4, 10.6, 10.8, 10.9, 11.1, 11.2, 11.6, 11.7, 11.9, 12.1, 12.2, 12.4, 12.6, 12.9, 13.0, 13.2, 13.4, 13.5, 13.7, 14.0, 14.2, 14.3, 14.5, 14.7, 14.8, 15.2, 15.3, 15.5, 15.6, 15.8, 16.0, 16.3, 16.5, 16.6, 16.8, 17.0, 17.1, 17.4, 17.6, 17.8, 17.9, 18.1, 18.3, 18.6, 18.7, 18.9, 19.1, 19.2, 19.4, 19.7, 19.9, 20.0, 20.2, 20.4, 20.5, 20.9, 21.0, 21.2, 21.4, 21.5, 21.7, 22.0, 22.2, 22.3, 22.5, 22.7, 2

  8%|▊         | 328/4000 [09:34<1:50:33,  1.81s/it]

(B) Take the clothes.
GT: (B) Take the clothes.
Part  Acc: 60.94%
Total Acc: 74.70%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.4, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.9, 6.0, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.

  8%|▊         | 329/4000 [09:35<1:45:42,  1.73s/it]

(B) Put down.
GT: (B) Put down.
Part  Acc: 61.24%
Total Acc: 74.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.6, 1.7, 1.9, 2.1, 2.3, 2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.0, 5.2, 5.4, 5.6, 5.8, 6.0, 6.1, 6.3, 6.5, 6.7, 6.9, 7.1, 7.2, 7.4, 7.6, 7.8, 8.0, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.2, 9.4, 9.6, 9.8, 10.0, 10.2, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.4, 11.6, 11.8, 12.0, 12.2, 12.4, 12.5, 12.7, 12.9, 13.1, 13.3, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.1, 17.3, 17.5, 17.7, 17.9, 18.1, 18.2, 18.4, 18.6, 18.8, 19.0, 19.1, 19.3, 19.5, 19.7, 19.9, 20.1, 20.2, 20.4, 20.6, 20.8, 21.0, 21.2, 21.3, 21.5, 21.7, 21.9, 22.1, 22.3, 22.4, 22.6, 22.8, 23

  8%|▊         | 330/4000 [09:37<1:44:53,  1.71s/it]

(A) Put down.
GT: (A) Put down.
Part  Acc: 61.54%
Total Acc: 74.85%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.6,

  8%|▊         | 331/4000 [09:38<1:41:08,  1.65s/it]

(B) Take the food.
GT: (C) Close the refrigerator.
Part  Acc: 61.07%
Total Acc: 74.62%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.2 seco

  8%|▊         | 332/4000 [09:40<1:39:00,  1.62s/it]

(B) Take the blanket.
GT: (B) Take the blanket.
Part  Acc: 61.36%
Total Acc: 74.70%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.4, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.1 seconds. Careful

  8%|▊         | 333/4000 [09:42<1:38:38,  1.61s/it]

(D) Put down the shoe.
GT: (D) Put down the shoe.
Part  Acc: 61.65%
Total Acc: 74.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4, 3.6, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.7, 4.8, 4.8, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.7, 5.9, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.3, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.0, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.3, 10.5, 10.5, 10.6, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.7, 11.9, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 12.9, 13.1, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.8, 14.0,

  8%|▊         | 334/4000 [09:43<1:37:55,  1.60s/it]

(A) Close the refrigerator.
GT: (A) Close the refrigerator.
Part  Acc: 61.94%
Total Acc: 74.85%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 2.5, 2.7, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.4, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.2, 17.4, 17.

  8%|▊         | 335/4000 [09:45<1:37:48,  1.60s/it]

(B) Take the pillow.
GT: (B) Take the pillow.
Part  Acc: 62.22%
Total Acc: 74.93%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.2, 6.4, 6.5, 6.7, 6.9, 7.1, 7.3, 7.5, 7.6, 7.8, 8.0, 8.2, 8.4, 8.6, 8.8, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9, 10.0, 10.2, 10.4, 10.6, 10.8, 11.0, 11.2, 11.3, 11.5, 11.7, 11.9, 12.1, 12.3, 12.5, 12.6, 12.8, 13.0, 13.2, 13.4, 13.6, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.9, 15.0, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.3, 16.5, 16.7, 16.9, 17.1, 17.3, 17.4, 17.6, 17.8, 18.0, 18.2, 18.4, 18.6, 18.7, 18.9, 19.1, 19.3, 19.5, 19.7, 19.8, 20.0, 20.2, 20.4, 20.6, 20.8, 21.0, 21.1, 21.3, 21.5, 21.7, 21.9, 22.1, 22.3, 22.4, 22.6, 22.8, 23.0, 23.2, 23.4, 23.5, 23.7, 23.9, 24.1, 24.3, 24.5, 24.7, 24.8, 25.0, 25.2, 25.4, 25.6, 25.8, 26.0, 26.1, 26.3, 26.5, 26.7, 26.9, 2

  8%|▊         | 336/4000 [09:47<1:42:22,  1.68s/it]

(D) Put down the sandwich.
GT: (D) Put down the sandwich.
Part  Acc: 62.50%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.7, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.3, 2.5, 2.7, 2.9, 3.1, 3.2, 3.4, 3.6, 3.8, 4.0, 4.2, 4.4, 4.6, 4.7, 4.9, 5.1, 5.3, 5.5, 5.7, 5.8, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.1, 7.3, 7.5, 7.7, 7.9, 8.1, 8.2, 8.4, 8.6, 8.8, 9.0, 9.2, 9.4, 9.6, 9.7, 9.9, 10.1, 10.3, 10.5, 10.6, 10.8, 11.0, 11.2, 11.4, 11.6, 11.8, 12.0, 12.1, 12.3, 12.5, 12.7, 12.9, 13.0, 13.2, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.6, 14.7, 14.9, 15.1, 15.3, 15.5, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.8, 17.0, 17.1, 17.3, 17.5, 17.7, 17.9, 18.0, 18.2, 18.4, 18.6, 18.8, 19.0, 19.2, 19.4, 19.5, 19.7, 19.9, 20.1, 20.3, 20.5, 20.6, 20.8, 21.0, 21.2, 21.4, 21.6, 21.8, 21.9, 22.1, 22

  8%|▊         | 337/4000 [09:48<1:44:34,  1.71s/it]

(B) Sit at the table.
GT: (B) Sit at the table.
Part  Acc: 62.77%
Total Acc: 75.07%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.6 seconds. Carefully 

  8%|▊         | 338/4000 [09:50<1:38:36,  1.62s/it]

(D) Put down.
GT: (D) Put down.
Part  Acc: 63.04%
Total Acc: 75.15%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 22.1, 22.2, 22.4, 22.5, 22.6, 22.7, 22.8, 23.0, 23.1, 23.3, 23.4, 23.5, 23.6, 23.7, 23.9, 24.0, 24.1, 24.3, 24.4, 24.5, 24.6, 24.8, 24.9, 25.0, 25.2, 25.3, 25.5, 25.5, 25.6, 25.8, 25.9, 26.1, 26.2, 26.3, 26.5, 26.5, 26.7, 26.8, 26.9, 27.1, 27.2, 27.4, 27.5, 27.5, 27.7, 27.8, 28.0, 28.1, 28.3, 28.4, 28.5, 28.6, 28.7, 28.9, 29.0, 29.1, 29.3, 29.4, 29.5, 29.6, 29.7, 29.9, 30.0, 30.2, 30.3, 30.4, 30.5, 30.6, 30.8, 30.9, 31.0, 31.2, 31.3, 31.5, 31.5, 31.6, 31.8, 31.9, 32.1, 32.2, 32.3, 32.5, 32.5, 32.7, 32.8, 33.0, 33.1, 33.2, 33.4, 33.5, 33.6, 33.7, 33.8, 34.0, 34.1, 34.3, 34.4, 34.5, 34.6, 34.7, 34.9, 35.0, 35.1, 35.3, 35.4, 35.5, 35.6, 35.7, 35.9, 36.0

  8%|▊         | 339/4000 [09:51<1:37:58,  1.61s/it]

(A) Put down the laptop.
GT: (A) Put down the laptop.
Part  Acc: 63.31%
Total Acc: 75.22%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 

  8%|▊         | 340/4000 [09:53<1:36:04,  1.57s/it]

(D) Put down the pillow.
GT: (D) Put down the pillow.
Part  Acc: 63.57%
Total Acc: 75.29%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.1, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.4, 10.4, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.3, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17

  9%|▊         | 341/4000 [09:55<1:37:23,  1.60s/it]

(D) Take the dish.
GT: (D) Take the dish.
Part  Acc: 63.83%
Total Acc: 75.37%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 4.3, 4.5, 4.7, 4.9, 5.1, 5.3, 5.5, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.2, 7.4, 7.6, 7.8, 8.0, 8.2, 8.4, 8.6, 8.8, 9.0, 9.2, 9.4, 9.6, 9.9, 10.1, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.5, 11.7, 11.9, 12.1, 12.3, 12.6, 12.8, 13.0, 13.2, 13.4, 13.6, 13.8, 14.0, 14.2, 14.4, 14.6, 14.8, 15.0, 15.3, 15.5, 15.7, 15.9, 16.1, 16.3, 16.5, 16.7, 16.9, 17.2, 17.4, 17.6, 17.8, 18.0, 18.2, 18.4, 18.6, 18.8, 19.0, 19.2, 19.4, 19.6, 19.9, 20.1, 20.3, 20.5, 20.7, 20.9, 21.1, 21.3, 21.5, 21.7, 21.9, 22.1, 22.3, 22.5, 22.8, 23.0, 23.2, 23.4, 23.6, 23.8, 24.0, 24.2, 24.4

  9%|▊         | 342/4000 [09:56<1:38:42,  1.62s/it]

(D) Put down the phone/camera.
GT: (D) Put down the phone/camera.
Part  Acc: 64.08%
Total Acc: 75.44%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.2, 7.3, 7.5, 7.6, 7.8, 7.9, 8.1, 8.3, 8.4, 8.6, 8.7, 8.9, 9.0, 9.2, 9.3, 9.5, 9.7, 9.8, 10.0, 10.1, 10.3, 10.4, 10.6, 10.8, 10.9, 11.1, 11.2, 11.4, 11.5, 11.7, 11.8, 12.0, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.1, 13.3, 13.4, 13.6, 13.7, 13.9, 14.0, 14.2, 14.3, 14.5, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.6, 15.8, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.8, 17.0, 17.2, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.3, 18.4, 18.6, 18.7, 18.9, 19.0, 19.2, 19.3, 19.5, 19.7, 19.8, 20.0, 20.1, 20.3, 20.4, 20.6, 20.8, 20.9, 21.1, 21.2, 21.4, 21.5, 21.7, 21.8, 22.0, 22.2, 22.3, 22.5, 22.6, 22.8, 22.9, 23.1, 23.3, 23.4, 23.6, 23.7,

  9%|▊         | 343/4000 [09:58<1:41:33,  1.67s/it]

(C) Wash the mirror.
GT: (C) Wash the mirror.
Part  Acc: 64.34%
Total Acc: 75.51%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.7, 0.9, 1.0, 1.2, 1.4, 1.5, 1.7, 1.9, 2.0, 2.2, 2.4, 2.5, 2.7, 2.8, 3.0, 3.2, 3.3, 3.5, 3.7, 3.8, 4.0, 4.2, 4.4, 4.5, 4.7, 4.8, 5.0, 5.2, 5.3, 5.5, 5.7, 5.8, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 7.0, 7.1, 7.3, 7.5, 7.6, 7.8, 8.0, 8.1, 8.3, 8.4, 8.6, 8.8, 8.9, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.1, 10.2, 10.4, 10.6, 10.7, 10.9, 11.1, 11.2, 11.4, 11.5, 11.7, 11.9, 12.0, 12.2, 12.4, 12.5, 12.7, 12.9, 13.1, 13.2, 13.4, 13.5, 13.7, 13.9, 14.0, 14.2, 14.4, 14.5, 14.7, 14.9, 15.0, 15.2, 15.3, 15.5, 15.7, 15.8, 16.0, 16.2, 16.3, 16.5, 16.7, 16.8, 17.0, 17.1, 17.3, 17.5, 17.6, 17.8, 18.0, 18.1, 18.3, 18.5, 18.6, 18.8, 18.9, 19.1, 19.3, 19.4, 19.6,

  9%|▊         | 344/4000 [10:00<1:42:48,  1.69s/it]

(D) Put down the bag.
GT: (D) Put down the bag.
Part  Acc: 64.58%
Total Acc: 75.58%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.1, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.3, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7 seconds. 

  9%|▊         | 345/4000 [10:01<1:41:36,  1.67s/it]

(C) Sit on the bed.
GT: (C) Sit on the bed.
Part  Acc: 64.83%
Total Acc: 75.65%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.3, 0.5, 0.7, 0.8, 0.8, 1.0, 1.1, 1.3, 1.5, 1.5, 1.6, 1.8, 2.0, 2.0, 2.1, 2.3, 2.5, 2.5, 2.6, 2.8, 2.9, 3.1, 3.1, 3.3, 3.4, 3.6, 3.6, 3.8, 3.9, 4.1, 4.3, 4.3, 4.4, 4.6, 4.8, 4.8, 4.9, 5.1, 5.2, 5.2, 5.4, 5.6, 5.7, 5.9, 5.9, 6.1, 6.2, 6.4, 6.4, 6.6, 6.7, 6.9, 7.0, 7.0, 7.2, 7.4, 7.5, 7.5, 7.7, 7.9, 8.0, 8.2, 8.2, 8.4, 8.5, 8.7, 8.7, 8.8, 9.0, 9.2, 9.2, 9.3, 9.5, 9.7, 9.8, 9.8, 10.0, 10.2, 10.3, 10.3, 10.5, 10.6, 10.8, 11.0, 11.0, 11.1, 11.3, 11.5, 11.5, 11.6, 11.8, 12.0, 12.0, 12.1, 12.3, 12.5, 12.6, 12.6, 12.8, 12.9, 13.1, 13.1, 13.3, 13.4, 13.6, 13.8, 13.8, 13.9, 14.1, 14.3, 14.3, 14.4, 14.6, 14.7, 14.7, 14.9, 15.1, 15.2, 15.4, 15.4,

  9%|▊         | 346/4000 [10:05<2:12:59,  2.18s/it]

(A) Throw the clothes.
GT: (A) Throw the clothes.
Part  Acc: 65.07%
Total Acc: 75.72%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.0, 19.2, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 19.9, 20.1, 20.2, 20.2, 20.4, 20.4, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.3, 21.5, 21.6, 21.6, 21.8, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.5, 22.7, 22.7, 22.8, 23.0, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.7, 23.9, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 24.9, 25.1, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.8, 26.0, 26.1, 26.1, 26.3, 26.3, 26.4, 26.5, 26.6, 26.7, 26.8, 26.9, 27.0, 27.0, 27.2, 27.2, 27.3, 27.5, 27.5, 27.6, 27.7, 27.8, 27.9, 28.0, 28.1, 28.2, 28.2, 28.4, 28.4, 28.5

  9%|▊         | 347/4000 [10:06<2:01:25,  1.99s/it]

(C) Put down the picture.
GT: (C) Put down the picture.
Part  Acc: 65.31%
Total Acc: 75.79%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.6, 3.8, 4.0, 4.2, 4.5, 4.7, 4.9, 5.1, 5.3, 5.5, 5.7, 5.9, 6.2, 6.4, 6.6, 6.8, 7.0, 7.2, 7.4, 7.7, 7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.4, 9.6, 9.8, 10.0, 10.2, 10.4, 10.6, 10.9, 11.1, 11.3, 11.5, 11.7, 11.9, 12.1, 12.3, 12.6, 12.8, 13.0, 13.2, 13.4, 13.6, 13.8, 14.1, 14.3, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.8, 16.0, 16.2, 16.4, 16.6, 16.9, 17.1, 17.3, 17.5, 17.7, 17.9, 18.1, 18.3, 18.6, 18.8, 19.0, 19.2, 19.4, 19.6, 19.8, 20.1, 20.3, 20.5, 20.7, 20.9, 21.1, 21.3, 21.5, 21.8, 22.0, 22.2, 22.4, 22.6, 22.8, 23.0, 23.3, 23.5, 23.7, 23.9, 24.1, 24.3, 24.5, 24.7, 25.0, 25.2, 25.4, 25.6, 25.8, 26.0, 26.2, 26.5, 26.7, 26.9, 27.1, 27.3, 27.5, 27.7, 27.9

  9%|▊         | 348/4000 [10:08<1:56:33,  1.92s/it]

(B) Open the bag.
GT: (B) Open the bag.
Part  Acc: 65.54%
Total Acc: 75.86%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.1, 7.2, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.5, 7.6, 7.6, 7.7, 7.7, 7.8, 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.1, 8.2, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.2, 9.3, 9.3, 9.4, 9.4, 9.5, 9.5, 9.6, 9.6, 9.7, 9.7, 9.8, 9.8, 9.9, 9.9, 10.0, 10.0, 10.1, 10.1, 10.2, 10.2, 10.3, 10.3, 10.4, 10.4, 10.5, 10.5, 10.6, 10.7, 10.7, 10.8, 10.8, 10.9, 10.9, 11.0, 11.0, 11.1, 11.1, 11.2, 11.2, 11.3, 11.3, 11.4, 11.4, 11.5, 11.5, 11.6, 11.6, 11.7, 11.7, 11.8, 11.8, 11.9, 11.9, 12.0, 12.0, 12.1, 12.1, 12.2, 12.2, 12.3, 12.3, 12.4, 12.4, 12.5, 12.5, 12.6, 12.7, 12.7, 12.8, 12.8, 12

  9%|▊         | 349/4000 [10:10<1:48:56,  1.79s/it]

(D) Open the bag.
GT: (B) Tidy up the clothes.
Part  Acc: 65.10%
Total Acc: 75.64%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.3, 2.4, 2.5, 2.7, 2.8, 2.9, 3.1, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.9, 15.0, 15.1, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16

  9%|▉         | 350/4000 [10:11<1:47:04,  1.76s/it]

(B) Take the dish.
GT: (A) Put down the sandwich.
Part  Acc: 64.67%
Total Acc: 75.43%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.9, 1.0, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7, 7.8, 7.9, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1 seconds. Carefully 

  9%|▉         | 351/4000 [10:13<1:40:40,  1.66s/it]

(C) Take the book.
GT: (C) Take the book.
Part  Acc: 64.90%
Total Acc: 75.50%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.3, 1.4, 1.5, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.5, 3.6, 3.8, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.0, 6.2, 6.3, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.5, 7.7, 7.8, 8.0, 8.1, 8.3, 8.4, 8.5, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.5, 9.6, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.8, 12.9, 13.1, 13.2, 13.3, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.3, 16.5, 16.6,

  9%|▉         | 352/4000 [10:14<1:39:39,  1.64s/it]

(C) Take the shoe.
GT: (C) Take the shoe.
Part  Acc: 65.13%
Total Acc: 75.57%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.6, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 1

  9%|▉         | 353/4000 [10:16<1:36:32,  1.59s/it]

(D) Take the book.
GT: (B) Put down the shoe.
Part  Acc: 64.71%
Total Acc: 75.35%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.9, 2.1, 2.3, 2.5, 2.8, 3.0, 3.2, 3.4, 3.6, 3.9, 4.1, 4.3, 4.5, 4.8, 5.0, 5.2, 5.4, 5.6, 5.9, 6.1, 6.3, 6.5, 6.8, 7.0, 7.2, 7.4, 7.6, 7.8, 8.1, 8.3, 8.5, 8.8, 9.0, 9.2, 9.4, 9.6, 9.8, 10.1, 10.3, 10.5, 10.7, 11.0, 11.2, 11.4, 11.6, 11.8, 12.0, 12.3, 12.5, 12.7, 13.0, 13.2, 13.4, 13.6, 13.8, 14.0, 14.3, 14.5, 14.7, 14.9, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.5, 16.7, 16.9, 17.2, 17.4, 17.6, 17.8, 18.0, 18.2, 18.5, 18.7, 18.9, 19.1, 19.4, 19.6, 19.8, 20.0, 20.2, 20.5, 20.7, 20.9, 21.1, 21.4, 21.6, 21.8, 22.0, 22.2, 22.5, 22.7, 22.9, 23.1, 23.3, 23.6, 23.8, 24.0, 24.2, 24.5, 24.7, 24.9, 25.1, 25.3, 25.5, 25.8, 26.0, 26.2, 26.5, 26.7, 26.9, 27.1, 27.3,

  9%|▉         | 354/4000 [10:18<1:42:06,  1.68s/it]

(C) Close the book.
GT: (C) Close the book.
Part  Acc: 64.94%
Total Acc: 75.42%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.8, 15.

  9%|▉         | 355/4000 [10:19<1:40:29,  1.65s/it]

(B) Take the bag.
GT: (C) Lie on the bed.
Part  Acc: 64.52%
Total Acc: 75.21%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.5, 0.6, 0.8, 0.9, 1.0, 1.2, 1.3, 1.5, 1.6, 1.7, 1.9, 2.0, 2.2, 2.3, 2.4, 2.6, 2.7, 2.9, 3.0, 3.1, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.3, 4.4, 4.5, 4.7, 4.8, 5.0, 5.1, 5.2, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.4, 6.5, 6.6, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.8, 7.9, 8.0, 8.2, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 12.0, 12.1, 12.2, 12.4, 12.5, 12.7, 12.8, 12.9, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.8, 14.9, 15.0, 15.2, 15.3, 15.4, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.6, 16.7, 16.8, 17.

  9%|▉         | 356/4000 [10:21<1:40:22,  1.65s/it]

(A) Take the shoe.
GT: (A) Take the shoe.
Part  Acc: 64.74%
Total Acc: 75.28%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1.0, 1.1, 1.2, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.5, 2.7, 2.8, 2.9, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.3, 4.4, 4.5, 4.7, 4.8, 4.9, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.9, 6.0, 6.1, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.4, 10.5, 10.6, 10.7, 10.9, 11.0, 11.1, 11.3, 11.4, 11.5, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.5, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 15.9, 16.0, 1

  9%|▉         | 357/4000 [10:23<1:46:09,  1.75s/it]

(D) Sit on the sofa/couch.
GT: (C) Take the bag.
Part  Acc: 64.33%
Total Acc: 75.07%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.5, 4.7, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2,

  9%|▉         | 358/4000 [10:24<1:42:52,  1.69s/it]

(D) Take the book.
GT: (D) Take the book.
Part  Acc: 64.56%
Total Acc: 75.14%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.9, 3.0, 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.2, 4.3, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.5, 6.6, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.5, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.3, 14.4, 14.5,

  9%|▉         | 359/4000 [10:26<1:44:05,  1.72s/it]

(D) Sit on the sofa/couch.
GT: (D) Sit on the sofa/couch.
Part  Acc: 64.78%
Total Acc: 75.21%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 17.2, 17.3, 17.4, 17.5, 17.6, 17.7, 17.7, 17.9, 18.0, 18.1, 18.1, 18.2, 18.4, 18.5, 18.5, 18.6, 18.7, 18.9, 18.9, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.3, 20.5, 20.6, 20.7, 20.7, 20.8, 21.0, 21.1, 21.1, 21.2, 21.3, 21.5, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.4, 23.6, 23.7, 23.7, 23.8, 23.9, 24.1, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 25.0, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 26.0, 26.0, 26.2, 26.3, 26.4, 26.4, 26.5, 26.7, 26.7, 26.8, 26.9, 27.0, 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7

  9%|▉         | 360/4000 [10:28<1:40:37,  1.66s/it]

(B) Put down the bag.
GT: (B) Put down the bag.
Part  Acc: 65.00%
Total Acc: 75.28%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.6, 6.7, 6.8, 6.8, 6.9, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.3, 7.4, 7.4, 7.5, 7.6, 7.6, 7.7 seconds. Carefully 

  9%|▉         | 361/4000 [10:29<1:40:31,  1.66s/it]

(A) Put down the food.
GT: (A) Put down the food.
Part  Acc: 65.22%
Total Acc: 75.35%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.1, 1.3, 1.5, 1.7, 1.8, 2.0, 2.2, 2.4, 2.5, 2.7, 2.9, 3.1, 3.3, 3.4, 3.6, 3.8, 4.0, 4.1, 4.3, 4.5, 4.7, 4.8, 5.0, 5.2, 5.4, 5.6, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.8, 7.0, 7.2, 7.4, 7.5, 7.7, 7.9, 8.1, 8.2, 8.4, 8.6, 8.8, 9.0, 9.1, 9.3, 9.5, 9.7, 9.8, 10.0, 10.2, 10.4, 10.5, 10.7, 10.9, 11.1, 11.3, 11.4, 11.6, 11.8, 12.0, 12.1, 12.3, 12.5, 12.7, 12.8, 13.0, 13.2, 13.4, 13.6, 13.7, 13.9, 14.1, 14.3, 14.4, 14.6, 14.8, 15.0, 15.2, 15.3, 15.5, 15.7, 15.9, 16.0, 16.2, 16.4, 16.6, 16.7, 16.9, 17.1, 17.3, 17.5, 17.6, 17.8, 18.0, 18.2, 18.4, 18.5, 18.7, 18.9, 19.1, 19.3, 19.4, 19.6, 19.8, 20.0, 20.1, 20.3, 20.5, 20.7, 20.9, 21.0, 

  9%|▉         | 362/4000 [10:31<1:40:32,  1.66s/it]

(B) Take the clothes.
GT: (B) Take the clothes.
Part  Acc: 65.43%
Total Acc: 75.41%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8 seconds.

  9%|▉         | 363/4000 [10:32<1:37:51,  1.61s/it]

(B) Sit on.
GT: (B) Sit on.
Part  Acc: 65.64%
Total Acc: 75.48%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.7 seconds. Carefully 

  9%|▉         | 364/4000 [10:34<1:34:41,  1.56s/it]

(B) Take the box.
GT: (C) Put down the broom.
Part  Acc: 65.24%
Total Acc: 75.27%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.4, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.9, 4.0, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.3, 4.4, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.6, 4.7, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.2, 5.3, 5.4, 5.4, 5.4, 5.5, 5.5 seconds. Carefully 

  9%|▉         | 365/4000 [10:36<1:51:45,  1.84s/it]

(D) Sit on.
GT: (C) Lie on.
Part  Acc: 64.85%
Total Acc: 75.07%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.8, 4.0, 4.2, 4.2, 4.4, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.4, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.4, 6.5, 6.6, 6.8, 7.0, 7.1, 7.2, 7.4, 7.5, 7.6, 7.8, 8.0, 8.0, 8.2, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.2, 9.4, 9.4, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.6, 10.8, 10.9, 11.0, 11.2, 11.3, 11.4, 11.6, 11.8, 11.8, 12.0, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 13.0, 13.2, 13.2, 13.4, 13.6, 13.7, 13.8, 14.0, 14.1, 14.2, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.8, 16.0, 16.1, 16.2, 16.4, 16.5, 16.6, 16.8, 17.0, 17.0, 17.2, 17.4, 17.5, 17.6, 17.8, 17.9, 18.0, 18.2, 18.4, 18.4, 18.6, 18.8, 18.9, 19.0, 19.2, 19.3, 19.4, 19.6, 19.8, 19.9, 20.0, 20.2, 

  9%|▉         | 366/4000 [10:38<1:48:12,  1.79s/it]

(D) Open the refrigerator.
GT: (D) Open the refrigerator.
Part  Acc: 65.06%
Total Acc: 75.14%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.0, 3.1, 3.3, 3.4, 3.6, 3.7, 3.9, 4.0, 4.2, 4.3, 4.5, 4.7, 4.8, 5.0, 5.1, 5.3, 5.5, 5.6, 5.8, 5.9, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 7.0, 7.1, 7.3, 7.4, 7.6, 7.8, 7.9, 8.1, 8.2, 8.4, 8.5, 8.7, 8.8, 9.0, 9.1, 9.3, 9.4, 9.6, 9.7, 9.9, 10.1, 10.2, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.0, 12.2, 12.4, 12.5, 12.7, 12.8, 13.0, 13.1, 13.3, 13.4, 13.6, 13.7, 13.9, 14.1, 14.2, 14.4, 14.5, 14.7, 14.8, 15.0, 15.2, 15.3, 15.5, 15.6, 15.8, 15.9, 16.1, 16.2, 16.4, 16.5, 16.7, 16.8, 17.0, 17.1, 17.3, 17.5, 17.6, 17.8, 17.9, 18.1, 18.2, 18.4, 18.5, 18.7, 18.8, 19.0, 19.1, 19.3, 19.4, 19.6, 19.8, 19.9, 20.1, 20.2, 20.4, 20.5, 20.7, 20.8, 21

  9%|▉         | 367/4000 [10:40<1:47:18,  1.77s/it]

(C) Throw the blanket.
GT: (C) Throw the blanket.
Part  Acc: 65.27%
Total Acc: 75.20%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.0, 3.2, 3.4, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.1, 5.3, 5.5, 5.7, 6.0, 6.2, 6.4, 6.6, 6.8, 7.0, 7.2, 7.4, 7.6, 7.8, 8.0, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9, 10.1, 10.3, 10.6, 10.8, 11.0, 11.2, 11.4, 11.6, 11.8, 12.0, 12.2, 12.4, 12.6, 12.9, 13.1, 13.3, 13.5, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.9, 15.2, 15.4, 15.6, 15.8, 16.0, 16.2, 16.4, 16.6, 16.9, 17.1, 17.3, 17.5, 17.7, 17.9, 18.1, 18.3, 18.5, 18.7, 19.0, 19.2, 19.4, 19.6, 19.8, 20.0, 20.2, 20.4, 20.6, 20.8, 21.0, 21.3, 21.5, 21.7, 21.9, 22.1, 22.3, 22.5, 22.7, 22.9, 23.1, 23.3, 23.6, 23.8, 24.0, 24.2, 24.4, 24.6

  9%|▉         | 368/4000 [10:41<1:44:58,  1.73s/it]

(C) Close the refrigerator.
GT: (A) Take the phone/camera.
Part  Acc: 64.88%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.2, 14.3, 14.3, 14.4, 14.4, 14.5, 14.5, 14.6, 14.6, 14.7, 14.7, 14.8, 14.9, 14.9, 15.0, 15.0, 15.1, 15.1, 15.2, 15.2, 15.3, 15.3, 15.4, 15.4, 15.5, 15.6, 15.6, 15.7, 15.7, 15.8, 15.8, 15.9, 15.9, 16.0, 16.0, 16.1, 16.1, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.5, 16.6, 16.6, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.0, 17.1, 17.2, 17.2, 17.3, 17.3, 17.4, 17.4, 17.5, 17.5, 17.6, 17.6, 17.7, 17.8, 17.8, 17.9, 17.9, 18.0, 18.0, 18.1, 18.1, 18.2, 18.2, 18.3, 18.4, 18.4, 18.5, 18.5, 18.6, 18.6, 18.7, 18.7, 18.8, 18.8, 18.9, 18.9, 19.0, 19.1, 19.1, 19.2, 19.2, 19.3, 19.3, 19.4, 19.4, 19.5, 19.5, 19.6, 19.6, 19.7, 19.8, 19.8, 19.9, 19.9, 20.0, 20.0, 20.1, 20.1

  9%|▉         | 369/4000 [10:43<1:42:25,  1.69s/it]

(D) Put down the sandwich.
GT: (D) Put down the sandwich.
Part  Acc: 65.09%
Total Acc: 75.07%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 14.3, 14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7, 15.7, 15.8, 15.9, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.8, 16.9, 17.0, 17.1, 17.2, 17.3, 17.3, 17.4, 17.5, 17.6, 17.7, 17.8, 17.9, 18.0, 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19.0, 19.0, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20.0, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.6, 20.7, 20.8, 20.9, 21.0, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22.0, 22.1, 22.2, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23.0, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.8, 23.9, 24.0, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6

  9%|▉         | 370/4000 [10:45<1:42:32,  1.70s/it]

(C) Put down the clothes.
GT: (C) Put down the clothes.
Part  Acc: 65.29%
Total Acc: 75.14%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.4, 11.6, 11.7, 11.8, 11.9, 12.1, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 12.9, 13.0, 13.2, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.0, 14.1, 14.3, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.2, 16.3, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.2, 17.4, 17.5, 17.6, 17.7, 17.8, 18.0, 18.1, 18.2, 18.3, 18.5, 18.6, 18.7, 18.8, 18.9, 19.1, 19.2, 19.3, 19.4, 19.6, 19.7, 19.8, 19.9, 20.0, 20.2, 20.3, 20.4, 20.5, 20.7, 20.8, 20.9, 21.0, 21.2, 21.3, 21.4, 21.5, 21.6, 21.8, 21.9, 22.0, 22.1, 22.

  9%|▉         | 371/4000 [10:46<1:40:36,  1.66s/it]

(D) Open the laptop.
GT: (D) Open the laptop.
Part  Acc: 65.50%
Total Acc: 75.20%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.7, 4.7, 4.9, 4.9, 5.1, 5.1, 5.2, 5.2, 5.4, 5.4, 5.6, 5.6, 5.7, 5.7, 5.9, 5.9, 6.0, 6.0, 6.2, 6.2, 6.4, 6.4, 6.5, 6.5, 6.7, 6.7, 6.9, 6.9, 7.0, 7.0, 7.2, 7.2, 7.4, 7.4, 7.4, 7.5, 7.5, 7.7, 7.7, 7.8, 7.8, 8.0, 8.0, 8.2, 8.2, 8.3, 8.3, 8.5, 8.5, 8.7, 8.7, 8.8, 8.8, 9.0, 9.0, 9.2, 9.2, 9.3, 9.3, 9.5, 9.5, 9.6, 9.6, 9.8, 9.8, 9.8, 10.0, 10.0, 10.1, 10.1, 10.3, 10.3, 10.5, 10.5, 10.6, 10.6, 10.8, 10.8, 10.9, 10.9, 11.1, 11.1, 11.3, 11.3, 11.4, 11.4, 11.6, 11.6, 11.8, 11.8, 11.9, 11.9, 12.1, 12.1, 12.3, 12.3, 12.3, 12.4, 12.4, 12.6, 12.6, 12.7, 12.7, 12.9, 12.9, 13.1, 13.1, 13.2, 13.2, 13.4, 13.4, 13.6, 13.6, 13.7, 13.7, 13.9, 13.9, 14.1, 14.1, 14.2, 14.2

  9%|▉         | 372/4000 [10:49<2:03:21,  2.04s/it]

(C) Close the book.
GT: (C) Close the book.
Part  Acc: 65.70%
Total Acc: 75.27%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.8, 1.8, 1.9, 2.0, 2.1, 2.2, 2.2, 2.3, 2.5, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.5, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.2, 10.3, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.0, 11.2, 11.2, 11.3 sec

  9%|▉         | 373/4000 [10:51<1:57:02,  1.94s/it]

(B) Take the paper/notebook.
GT: (B) Take the paper/notebook.
Part  Acc: 65.90%
Total Acc: 75.34%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.5, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.1, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.7, 10.7 seconds. 

  9%|▉         | 374/4000 [10:52<1:48:21,  1.79s/it]

(D) Take the dish.
GT: (C) Throw the clothes.
Part  Acc: 65.52%
Total Acc: 75.13%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 27.9, 28.0, 28.2, 28.4, 28.6, 28.7, 28.9, 29.1, 29.2, 29.4, 29.6, 29.8, 29.9, 30.1, 30.3, 30.4, 30.6, 30.8, 31.0, 31.1, 31.3, 31.5, 31.7, 31.8, 32.0, 32.2, 32.3, 32.5, 32.7, 32.9, 33.0, 33.2, 33.4, 33.5, 33.7, 33.9, 34.1, 34.2, 34.4, 34.6, 34.8, 34.9, 35.1, 35.3, 35.4, 35.6, 35.8, 36.0, 36.1, 36.3, 36.5, 36.6, 36.8, 37.0, 37.2, 37.3, 37.5, 37.7, 37.9, 38.0, 38.2, 38.4, 38.5, 38.7, 38.9, 39.1, 39.2, 39.4, 39.6, 39.7, 39.9, 40.1, 40.3, 40.4, 40.6, 40.8, 40.9, 41.1, 41.3, 41.5, 41.6, 41.8, 42.0, 42.2, 42.3, 42.5, 42.7, 42.8, 43.0, 43.2, 43.4, 43.5, 43.7, 43.9, 44.0, 44.2, 44.4, 44.6, 44.7, 44.9, 45.1, 45.3, 45.4, 45.6, 45.8, 45.9, 46.1, 46.3, 46.5, 46.6

  9%|▉         | 375/4000 [10:54<1:45:39,  1.75s/it]

(B) Close the refrigerator.
GT: (B) Close the refrigerator.
Part  Acc: 65.71%
Total Acc: 75.20%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.9, 1.9, 2.0, 2.2, 2.2, 2.3, 2.4, 2.6, 2.7, 2.7, 2.9, 3.0, 3.0, 3.1, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.6, 7.7, 7.9, 7.9, 8.0, 8.2, 8.3, 8.4, 8.4, 8.6, 8.7, 8.7, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.5, 12.5, 12.6, 12.8, 

  9%|▉         | 376/4000 [10:56<1:43:38,  1.72s/it]

(A) Take the paper/notebook.
GT: (A) Take the paper/notebook.
Part  Acc: 65.91%
Total Acc: 75.27%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.3, 4.4, 4.5, 4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.1, 5.2, 5.3, 5.4, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.0, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 8.9, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.0, 10.1, 10.2, 10.3, 10.3, 10.4, 10.5, 10.6, 10.6, 10.7, 10.8, 10.9, 10.9, 11.0, 11.1, 11.2, 11.2, 11.3, 11.4, 11.5, 11.5, 11.6, 11.7, 11.8, 11.8, 11.9, 12.0, 12.1, 12.1, 12.2, 12.3, 12.3, 12.4, 12.5, 12.6, 12.6, 12.7, 12.8, 12.9, 12.9, 13.0, 13.1, 13.2, 13.2, 13.3, 13

  9%|▉         | 377/4000 [10:57<1:41:25,  1.68s/it]

(C) Take the clothes.
GT: (C) Take the clothes.
Part  Acc: 66.10%
Total Acc: 75.33%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.4, 11.5, 11.6, 11.7, 11.8, 11.

  9%|▉         | 378/4000 [10:59<1:39:42,  1.65s/it]

(A) Wash the mirror.
GT: (D) Take the sandwich.
Part  Acc: 65.73%
Total Acc: 75.13%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.5, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.0, 6.1, 6.1, 6.2, 6.3, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.3, 8.4, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.5 seconds. Carefully 

  9%|▉         | 379/4000 [11:00<1:37:24,  1.61s/it]

(C) Open the refrigerator.
GT: (D) Sit at the table.
Part  Acc: 65.36%
Total Acc: 74.93%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.3, 5.3, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.8, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.1, 11.1, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.3, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.0, 14.1, 14.2, 14.3, 14.4, 14.6, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.8, 15.8, 16.0, 16.1, 16.2, 16.3, 16.4, 16.5, 16.6, 16.7, 16.9, 16.9, 17.1, 17.

 10%|▉         | 380/4000 [11:02<1:38:30,  1.63s/it]

(C) Put down the phone/camera.
GT: (C) Put down the phone/camera.
Part  Acc: 65.56%
Total Acc: 75.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7, 13.9, 14.0, 14

 10%|▉         | 381/4000 [11:04<1:38:48,  1.64s/it]

(B) Put down.
GT: (B) Put down.
Part  Acc: 65.75%
Total Acc: 75.07%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12

 10%|▉         | 382/4000 [11:05<1:36:24,  1.60s/it]

(D) Put down.
GT: (D) Put down.
Part  Acc: 65.93%
Total Acc: 75.13%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6 s

 10%|▉         | 383/4000 [11:07<1:33:49,  1.56s/it]

(D) Close the window.
GT: (D) Close the window.
Part  Acc: 66.12%
Total Acc: 75.20%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0, 1.2, 1.3, 1.4, 1.5, 1.7, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.5, 2.7, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.0, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.5, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.8, 7.0, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.8, 8.0, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.8, 9.0, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8, 10.0, 10.1, 10.2, 10.3, 10.5, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.1, 14.2, 14.4, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, 15.2, 15.4,

 10%|▉         | 384/4000 [11:08<1:35:34,  1.59s/it]

(B) Take the laptop.
GT: (B) Take the laptop.
Part  Acc: 66.30%
Total Acc: 75.26%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.9, 1.1, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4, 2.6, 2.7, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.7, 3.8, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.8, 5.0, 5.1, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.4, 6.6, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.4, 7.6, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.4, 8.6, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.4, 9.6, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.5, 12.6, 12.7, 12.8, 12.9, 13.1, 13.2, 13.3, 13.4, 13.6, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.4, 15.6, 15.7, 15.8, 15.9, 16.1,

 10%|▉         | 385/4000 [11:10<1:33:49,  1.56s/it]

(A) Put down.
GT: (A) Put down.
Part  Acc: 66.49%
Total Acc: 75.32%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 25.2, 25.4, 25.5, 25.6, 25.8, 25.9, 26.0, 26.1, 26.3, 26.4, 26.5, 26.7, 26.8, 26.9, 27.1, 27.2, 27.3, 27.4, 27.6, 27.7, 27.8, 28.0, 28.1, 28.2, 28.4, 28.5, 28.6, 28.7, 28.9, 29.0, 29.1, 29.3, 29.4, 29.5, 29.7, 29.8, 29.9, 30.1, 30.2, 30.3, 30.4, 30.6, 30.7, 30.8, 31.0, 31.1, 31.2, 31.4, 31.5, 31.6, 31.7, 31.9, 32.0, 32.1, 32.3, 32.4, 32.5, 32.7, 32.8, 32.9, 33.0, 33.2, 33.3, 33.4, 33.6, 33.7, 33.8, 34.0, 34.1, 34.2, 34.3, 34.5, 34.6, 34.7, 34.9, 35.0, 35.1, 35.3, 35.4, 35.5, 35.6, 35.8, 35.9, 36.0, 36.2, 36.3, 36.4, 36.6, 36.7, 36.8, 37.0, 37.1, 37.2, 37.3, 37.5, 37.6, 37.7, 37.9, 38.0, 38.1, 38.3, 38.4, 38.5, 38.6, 38.8, 38.9, 39.0, 39.2, 39.3, 39.4

 10%|▉         | 386/4000 [11:12<1:37:48,  1.62s/it]

(C) Put down the broom.
GT: (C) Put down the broom.
Part  Acc: 66.67%
Total Acc: 75.39%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 12.4, 12.5, 12.5, 12.6, 12.7, 12.8, 12.8, 12.9, 12.9, 13.0, 13.1, 13.2, 13.2, 13.3, 13.4, 13.4, 13.5, 13.5, 13.7, 13.7, 13.8, 13.8, 13.9, 14.0, 14.0, 14.2, 14.2, 14.3, 14.3, 14.4, 14.5, 14.5, 14.6, 14.7, 14.8, 14.8, 14.9, 14.9, 15.0, 15.1, 15.2, 15.2, 15.3, 15.4, 15.4, 15.5, 15.5, 15.7, 15.7, 15.8, 15.8, 15.9, 16.0, 16.0, 16.1, 16.2, 16.3, 16.3, 16.4, 16.4, 16.5, 16.6, 16.7, 16.8, 16.8, 16.9, 16.9, 17.0, 17.1, 17.2, 17.2, 17.3, 17.4, 17.4, 17.5, 17.6, 17.7, 17.7, 17.8, 17.8, 17.9, 18.0, 18.1, 18.1, 18.2, 18.3, 18.3, 18.4, 18.4, 18.6, 18.6, 18.7, 18.7, 18.8, 18.9, 18.9, 19.0, 19.1, 19.2, 19.2, 19.3, 19.4, 19.4, 19.5, 19.6, 19.7, 19.7, 19.8, 19.8, 19.9

 10%|▉         | 387/4000 [11:13<1:37:44,  1.62s/it]

(A) Open the box.
GT: (A) Open the box.
Part  Acc: 66.84%
Total Acc: 75.45%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 1

 10%|▉         | 388/4000 [11:15<1:37:09,  1.61s/it]

(A) Put down the towel.
GT: (A) Put down the towel.
Part  Acc: 67.02%
Total Acc: 75.52%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.3, 0.6, 0.8, 1.0, 1.2, 1.5, 1.7, 1.9, 2.1, 2.4, 2.6, 2.8, 3.0, 3.3, 3.5, 3.7, 4.0, 4.2, 4.4, 4.6, 4.9, 5.1, 5.3, 5.5, 5.8, 6.0, 6.2, 6.5, 6.7, 6.9, 7.1, 7.4, 7.6, 7.8, 8.0, 8.3, 8.5, 8.7, 8.9, 9.2, 9.4, 9.6, 9.9, 10.1, 10.3, 10.5, 10.8, 11.0, 11.2, 11.4, 11.7, 11.9, 12.1, 12.3, 12.6, 12.8, 13.0, 13.3, 13.5, 13.7, 13.9, 14.2, 14.4, 14.6, 14.8, 15.1, 15.3, 15.5, 15.8, 16.0, 16.2, 16.4, 16.7, 16.9, 17.1, 17.3, 17.6, 17.8, 18.0, 18.2, 18.5, 18.7, 18.9, 19.2, 19.4, 19.6, 19.8, 20.1, 20.3, 20.5, 20.7, 21.0, 21.2, 21.4, 21.6, 21.9, 22.1, 22.3, 22.6, 22.8, 23.0, 23.2, 23.5, 23.7, 23.9, 24.1, 24.4, 24.6, 24.8, 25.1, 25.3, 25.5, 25.7, 26.0, 26.2, 26.4, 

 10%|▉         | 389/4000 [11:17<1:39:52,  1.66s/it]

(C) Sit on.
GT: (C) Sit on.
Part  Acc: 67.20%
Total Acc: 75.58%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.4, 0.5, 0.6, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.5, 1.6, 1.7, 1.8, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.2, 2.3, 2.4, 2.5, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.8, 4.9, 5.0, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.2, 6.2, 6.3, 6.4, 6.5, 6.5, 6.6, 6.7, 6.8, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.1, 8.2, 8.3, 8.4, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.2, 9.3, 9.4, 9.5, 9.5, 9.6, 9.7, 9.8 seconds. Carefully 

 10%|▉         | 390/4000 [11:18<1:35:48,  1.59s/it]

(D) Put down.
GT: (D) Put down.
Part  Acc: 67.37%
Total Acc: 75.64%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 7.0, 7.2, 7.4, 7.5, 7.6, 7.8, 8.0, 8.2, 8.3, 8.4, 8.6, 8.8, 8.9, 9.1, 9.2, 9.4, 9.6, 9.7, 9.9, 10.0, 10.2, 10.3, 10.5, 10.6, 10.8, 11.0, 11.1, 11.3, 11.4, 11.6, 11.7, 11.9, 12.1, 12.2, 12.4, 12.5, 12.7, 12.9, 13.0, 13.1, 13.3, 13.5, 13.6, 13.8, 13.9, 14.1, 14.3, 14.4, 14.6, 14.7, 14.9, 15.1, 15.2, 15.3, 15.5, 15.7, 15.8, 16.0, 16.1, 16.3, 16.5, 16.6, 16.8, 16.9, 17.1, 17.2, 17.4, 17.6, 17.7, 17.9, 18.0, 18.2, 18.3, 18.5, 18.6, 18.8, 19.0, 19.1, 19.3, 19.4, 19.6, 19.8, 19.9, 20.0, 20.2, 20.4, 20.6, 20.7, 20.8, 21.0, 21.2, 21.3, 21.5, 21.6, 21.8, 22.0, 22.1, 22.3, 22.4, 22.6, 22.7, 22.9, 23.0, 23.2, 23.4, 23.5, 23.7, 23.8, 24.0, 24.1, 24.3, 24.5, 24.6,

 10%|▉         | 391/4000 [11:20<1:39:15,  1.65s/it]

(A) Put down the book.
GT: (A) Put down the book.
Part  Acc: 67.54%
Total Acc: 75.70%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 3.1, 3.2, 3.3, 3.5, 3.6, 3.7, 3.9, 4.0, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.1, 6.3, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.3, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.3, 9.4, 9.5, 9.7, 9.8, 9.9, 10.1, 10.2, 10.3, 10.4, 10.5, 10.7, 10.8, 10.9, 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, 11.9, 12.0, 12.1, 12.3, 12.4, 12.5, 12.6, 12.7, 12.9, 13.0, 13.1, 13.3, 13.4, 13.5, 13.7, 13.8, 13.9, 14.1, 14.2, 14.3, 14.4, 14.5, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.5, 16.6, 16.7, 16.9, 17.0, 17.1, 17.3, 17.4, 17.5, 17.7, 17.8, 17.9, 18.1, 18.2, 18.3

 10%|▉         | 392/4000 [11:21<1:37:24,  1.62s/it]

(D) Throw the book.
GT: (D) Throw the book.
Part  Acc: 67.71%
Total Acc: 75.77%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.4, 0.6, 0.9, 1.1, 1.4, 1.6, 1.9, 2.1, 2.4, 2.6, 2.9, 3.1, 3.4, 3.6, 3.9, 4.1, 4.4, 4.6, 4.9, 5.1, 5.4, 5.6, 5.9, 6.2, 6.4, 6.7, 6.9, 7.2, 7.4, 7.7, 7.9, 8.2, 8.4, 8.7, 8.9, 9.2, 9.4, 9.7, 9.9, 10.2, 10.4, 10.7, 10.9, 11.2, 11.4, 11.7, 11.9, 12.2, 12.5, 12.7, 13.0, 13.2, 13.5, 13.7, 14.0, 14.2, 14.5, 14.7, 15.0, 15.2, 15.5, 15.7, 16.0, 16.2, 16.5, 16.7, 17.0, 17.2, 17.5, 17.7, 18.0, 18.2, 18.5, 18.8, 19.0, 19.3, 19.5, 19.8, 20.0, 20.3, 20.5, 20.8, 21.0, 21.3, 21.5, 21.8, 22.0, 22.3, 22.5, 22.8, 23.0, 23.3, 23.5, 23.8, 24.0, 24.3, 24.6, 24.8, 25.1, 25.3, 25.6, 25.8, 26.1, 26.3, 26.6, 26.8, 27.1, 27.3, 27.6, 27.8, 28.1, 28.3, 28.6, 28.8, 29.1, 29

 10%|▉         | 393/4000 [11:23<1:43:53,  1.73s/it]

(B) Take the bag.
GT: (B) Take the bag.
Part  Acc: 67.88%
Total Acc: 75.83%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.5, 5.5, 5.6, 5.7, 5.7, 5.8, 5.8, 5.9, 5.9, 6.0, 6.0, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.6, 6.6, 6.7, 6.7, 6.8, 6.9, 6.9, 7.0, 7.0, 7.1, 7.1 seconds. Carefully 

 10%|▉         | 394/4000 [11:25<1:42:28,  1.71s/it]

(A) Take the paper/notebook.
GT: (A) Take the paper/notebook.
Part  Acc: 68.04%
Total Acc: 75.89%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8, 2.8, 2.9, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.1, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.7, 3.7, 3.8, 3.8, 3.9, 3.9, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3, 4.3, 4.3 seconds. Carefully 

 10%|▉         | 395/4000 [11:28<2:07:11,  2.12s/it]

(C) Put down the clothes.
GT: (A) Take the dish.
Part  Acc: 67.69%
Total Acc: 75.70%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.5, 0.7, 0.9, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.7, 2.9, 3.1, 3.3, 3.5, 3.8, 4.0, 4.2, 4.4, 4.6, 4.8, 5.0, 5.3, 5.5, 5.7, 5.9, 6.1, 6.4, 6.6, 6.8, 7.0, 7.2, 7.4, 7.6, 7.9, 8.1, 8.3, 8.5, 8.7, 9.0, 9.2, 9.4, 9.6, 9.8, 10.0, 10.2, 10.5, 10.7, 10.9, 11.1, 11.3, 11.6, 11.8, 12.0, 12.2, 12.4, 12.6, 12.8, 13.1, 13.3, 13.5, 13.7, 13.9, 14.2, 14.4, 14.6, 14.8, 15.0, 15.2, 15.4, 15.7, 15.9, 16.1, 16.3, 16.5, 16.8, 17.0, 17.2, 17.4, 17.7, 17.9, 18.1, 18.3, 18.5, 18.7, 18.9, 19.2, 19.4, 19.6, 19.8, 20.0, 20.3, 20.5, 20.7, 20.9, 21.1, 21.3, 21.5, 21.8, 22.0, 22.2, 22.4, 22.6, 22.9, 23.1, 23.3, 23.5, 23.7, 23.9, 24.1, 24.4, 24.6, 24.8, 25.0, 25.2, 25.5, 25.7, 

 10%|▉         | 396/4000 [11:30<2:02:01,  2.03s/it]

(D) Put down the book.
GT: (D) Put down the book.
Part  Acc: 67.86%
Total Acc: 75.76%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.8, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.6, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.2, 5.4, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.2, 6.4, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.8, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.8, 10.0, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.8, 10.9, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5 s

 10%|▉         | 397/4000 [11:32<1:55:59,  1.93s/it]

(C) Put down the broom.
GT: (C) Put down the broom.
Part  Acc: 68.02%
Total Acc: 75.82%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.6, 0.8, 0.9, 1.0, 1.2, 1.3, 1.4, 1.6, 1.7, 1.8, 2.0, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.5, 3.7, 3.8, 3.9, 4.1, 4.2, 4.3, 4.5, 4.6, 4.7, 4.9, 5.0, 5.1, 5.3, 5.4, 5.5, 5.7, 5.8, 5.9, 6.1, 6.2, 6.3, 6.5, 6.6, 6.7, 6.9, 7.0, 7.1, 7.3, 7.4, 7.5, 7.7, 7.8, 7.9, 8.1, 8.2, 8.3, 8.5, 8.6, 8.7, 8.9, 9.0, 9.1, 9.2, 9.4, 9.5, 9.6, 9.8, 9.9, 10.0, 10.2, 10.3, 10.4, 10.6, 10.7, 10.8, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 14.9, 15.1, 15.2, 15.3, 15.5, 15.6, 15.7, 15.9, 16.0, 16.1, 16.3, 16.4, 16.5, 16.

 10%|▉         | 398/4000 [11:33<1:50:24,  1.84s/it]

(C) Put down the blanket.
GT: (C) Put down the blanket.
Part  Acc: 68.18%
Total Acc: 75.88%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.3, 1.4, 1.5, 1.6, 1.6, 1.7, 1.8, 1.9, 2.0, 2.0, 2.1, 2.2, 2.3, 2.4, 2.4, 2.5, 2.6, 2.7, 2.8, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.3, 3.4, 3.5, 3.6, 3.7, 3.7, 3.8, 3.9, 4.0, 4.1, 4.1, 4.2, 4.3, 4.4, 4.5, 4.5, 4.6, 4.7, 4.8, 4.9, 4.9, 5.0, 5.1, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.7, 5.7, 5.8, 5.9, 6.0, 6.1, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.6, 6.7, 6.8, 6.9, 7.0, 7.0, 7.1, 7.2, 7.3, 7.4, 7.4, 7.5, 7.6, 7.7, 7.8, 7.8, 7.9, 8.0, 8.1, 8.2, 8.2, 8.3, 8.4, 8.5, 8.6, 8.6, 8.7, 8.8, 8.9, 9.0, 9.0, 9.1, 9.2, 9.3, 9.4, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.9, 10.0, 10.1, 10.2, 10.3 seconds. Carefu

 10%|▉         | 399/4000 [11:35<1:43:44,  1.73s/it]

(A) Take the clothes.
GT: (A) Take the clothes.
Part  Acc: 68.34%
Total Acc: 75.94%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0, 1.1, 1.3, 1.4, 1.5, 1.7, 1.8, 1.9, 2.1, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.2, 3.3, 3.4, 3.6, 3.7, 3.8, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.8, 4.9, 5.0, 5.2, 5.3, 5.4, 5.6, 5.7, 5.8, 6.0, 6.1, 6.2, 6.4, 6.5, 6.7, 6.8, 6.9, 7.1, 7.2, 7.3, 7.5, 7.6, 7.7, 7.9, 8.0, 8.1, 8.3, 8.4, 8.5, 8.7, 8.8, 8.9, 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.9, 10.0, 10.1, 10.3, 10.4, 10.5, 10.7, 10.8, 11.0, 11.1, 11.2, 11.4, 11.5, 11.6, 11.8, 11.9, 12.0, 12.2, 12.3, 12.4, 12.6, 12.7, 12.8, 13.0, 13.1, 13.2, 13.4, 13.5, 13.6, 13.8, 13.9, 14.0, 14.2, 14.3, 14.4, 14.6, 14.7, 14.8, 15.0, 15.1, 15.3, 15.4, 15.5, 15.7, 15.8, 15.9, 16.1, 16.2, 16.3, 

 10%|█         | 400/4000 [11:37<1:44:55,  1.75s/it]

(A) Tidy up the clothes.
GT: (A) Tidy up the clothes.
Part  Acc: 68.50%
Total Acc: 76.00%
-------------------------------------------------- Action Prediction --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1 seconds. Carefully 

 10%|█         | 401/4000 [11:39<1:51:39,  1.86s/it]

(B) Scattering something down.
GT: (C) Piling something up
Part  Acc: 0.00%
Total Acc: 75.81%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.0, 3.1 seconds. Carefully 

 10%|█         | 402/4000 [11:41<2:06:39,  2.11s/it]

(C) Not sure.
GT: (B) Showing something on top of something
Part  Acc: 0.00%
Total Acc: 75.62%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6 seconds. Carefully 

 10%|█         | 403/4000 [11:44<2:13:03,  2.22s/it]

(C) Moving away from something with your camera
GT: (C) Moving away from something with your camera
Part  Acc: 33.33%
Total Acc: 75.68%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8 seconds. Carefully 

 10%|█         | 404/4000 [11:46<2:14:51,  2.25s/it]

(B) Throwing something in the air and catching it
GT: (B) Throwing something in the air and catching it
Part  Acc: 50.00%
Total Acc: 75.74%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.7, 3.8 seconds. Carefully 

 10%|█         | 405/4000 [11:48<2:15:36,  2.26s/it]

(B) Spinning something so it discontinues spinning.
GT: (C) Spinning something so it continues spinning
Part  Acc: 40.00%
Total Acc: 75.56%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9 seconds. Carefully 

 10%|█         | 406/4000 [11:51<2:25:37,  2.43s/it]

(C) Piling something up
GT: (C) Piling something up
Part  Acc: 50.00%
Total Acc: 75.62%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8 seconds. Carefully 

 10%|█         | 407/4000 [11:54<2:28:42,  2.48s/it]

(C) Holding something next to something
GT: (C) Holding something next to something
Part  Acc: 57.14%
Total Acc: 75.68%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7 seconds. Carefully 

 10%|█         | 408/4000 [11:56<2:27:43,  2.47s/it]

(C) Pulling something from left to right
GT: (C) Pulling something from left to right
Part  Acc: 62.50%
Total Acc: 75.74%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.1, 3.2 seconds. Carefully 

 10%|█         | 409/4000 [11:59<2:26:30,  2.45s/it]

(A) Dropping something onto something
GT: (A) Dropping something onto something
Part  Acc: 66.67%
Total Acc: 75.79%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 2.9, 3.0 seconds. Carefully 

 10%|█         | 410/4000 [12:01<2:28:39,  2.48s/it]

(A) Pretending to throw something
GT: (A) Pretending to throw something
Part  Acc: 70.00%
Total Acc: 75.85%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.7, 3.8 seconds. Carefully 

 10%|█         | 411/4000 [12:05<2:53:45,  2.90s/it]

(B) Lifting up one end of something, then letting it drop down
GT: (B) Lifting up one end of something, then letting it drop down
Part  Acc: 72.73%
Total Acc: 75.91%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6 seconds. Carefully 

 10%|█         | 412/4000 [12:09<3:06:51,  3.12s/it]

(B) Pulling two ends of something so that it separates into two pieces
GT: (B) Pulling two ends of something so that it separates into two pieces
Part  Acc: 75.00%
Total Acc: 75.97%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5 seconds. Carefully 

 10%|█         | 413/4000 [12:11<2:54:25,  2.92s/it]

(A) Putting something in front of something
GT: (A) Putting something in front of something
Part  Acc: 76.92%
Total Acc: 76.03%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.0, 4.1, 4.1, 4.2 seconds. Carefully 

 10%|█         | 414/4000 [12:14<2:55:43,  2.94s/it]

(A) Moving something and something closer to each other
GT: (A) Moving something and something closer to each other
Part  Acc: 78.57%
Total Acc: 76.09%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7 seconds. Carefully 

 10%|█         | 415/4000 [12:17<2:50:28,  2.85s/it]

(C) Pushing something from left to right
GT: (C) Pushing something from left to right
Part  Acc: 80.00%
Total Acc: 76.14%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7 seconds. Carefully 

 10%|█         | 416/4000 [12:19<2:39:54,  2.68s/it]

(C) Moving away from something with your camera
GT: (C) Moving away from something with your camera
Part  Acc: 81.25%
Total Acc: 76.20%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2 seconds. Carefully 

 10%|█         | 417/4000 [12:21<2:34:13,  2.58s/it]

(C) Not sure
GT: (A) Twisting something
Part  Acc: 76.47%
Total Acc: 76.02%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8 seconds. Carefully 

 10%|█         | 418/4000 [12:24<2:31:03,  2.53s/it]

(B) Taking something out of something
GT: (B) Taking something out of something
Part  Acc: 77.78%
Total Acc: 76.08%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9 seconds. Carefully 

 10%|█         | 419/4000 [12:26<2:25:50,  2.44s/it]

(B) Uncovering something
GT: (B) Uncovering something
Part  Acc: 78.95%
Total Acc: 76.13%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.2 seconds. Carefully 

 10%|█         | 420/4000 [12:28<2:23:55,  2.41s/it]

(C) Putting something onto something
GT: (C) Putting something onto something
Part  Acc: 80.00%
Total Acc: 76.19%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9 seconds. Carefully 

 11%|█         | 421/4000 [12:31<2:19:35,  2.34s/it]

(A) Pushing something off of something
GT: (A) Pushing something off of something
Part  Acc: 80.95%
Total Acc: 76.25%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.2, 5.2, 5.3, 5.3 seconds. Carefully 

 11%|█         | 422/4000 [12:33<2:21:47,  2.38s/it]

(B) Pouring something into something
GT: (B) Pouring something into something
Part  Acc: 81.82%
Total Acc: 76.30%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0 seconds. Carefully 

 11%|█         | 423/4000 [12:35<2:18:20,  2.32s/it]

(C) Plugging something into something
GT: (C) Plugging something into something
Part  Acc: 82.61%
Total Acc: 76.36%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.0, 3.1 seconds. Carefully 

 11%|█         | 424/4000 [12:38<2:16:14,  2.29s/it]

(C) Covering something with something
GT: (C) Covering something with something
Part  Acc: 83.33%
Total Acc: 76.42%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3 seconds. Carefully 

 11%|█         | 425/4000 [12:40<2:25:59,  2.45s/it]

(A) Putting something that can't roll onto a slanted surface, so it stays where it is
GT: (A) Putting something that can't roll onto a slanted surface, so it stays where it is
Part  Acc: 84.00%
Total Acc: 76.47%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.2, 5.2, 5.3, 5.3, 5.4, 5.4 seconds. Carefully 

 11%|█         | 426/4000 [12:43<2:24:16,  2.42s/it]

(B) Folding something
GT: (B) Folding something
Part  Acc: 84.62%
Total Acc: 76.53%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8 seconds. Carefully 

 11%|█         | 427/4000 [12:45<2:29:27,  2.51s/it]

(C) Turning something upside down
GT: (C) Turning something upside down
Part  Acc: 85.19%
Total Acc: 76.58%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6 seconds. Carefully 

 11%|█         | 428/4000 [12:48<2:23:15,  2.41s/it]

(A) Plugging something into something
GT: (A) Plugging something into something
Part  Acc: 85.71%
Total Acc: 76.64%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9 seconds. Carefully 

 11%|█         | 429/4000 [12:50<2:29:05,  2.51s/it]

(A) Tilting something with something on it slightly so it doesn't fall down
GT: (A) Tilting something with something on it slightly so it doesn't fall down
Part  Acc: 86.21%
Total Acc: 76.69%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1 seconds. Carefully 

 11%|█         | 430/4000 [12:53<2:31:06,  2.54s/it]

(B) Releasing something in front of something
GT: (A) Holding something in front of something
Part  Acc: 83.33%
Total Acc: 76.51%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0, 5.0, 5.1, 5.1, 5.2, 5.2, 5.2, 5.2, 5.3, 5.3 seconds. Carefully 

 11%|█         | 431/4000 [12:56<2:33:25,  2.58s/it]

(A) Throwing something onto a surface
GT: (A) Throwing something onto a surface
Part  Acc: 83.87%
Total Acc: 76.57%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1 seconds. Carefully 

 11%|█         | 432/4000 [12:58<2:32:03,  2.56s/it]

(A) Closing something
GT: (C) Opening something
Part  Acc: 81.25%
Total Acc: 76.39%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9 seconds. Carefully 

 11%|█         | 433/4000 [13:01<2:45:41,  2.79s/it]

(C) Taking something out of something
GT: (C) Taking something out of something
Part  Acc: 81.82%
Total Acc: 76.44%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.0, 3.1 seconds. Carefully 

 11%|█         | 434/4000 [13:04<2:42:19,  2.73s/it]

(B) Turning the camera downwards while filming something
GT: (B) Turning the camera downwards while filming something
Part  Acc: 82.35%
Total Acc: 76.50%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2 seconds. Carefully 

 11%|█         | 435/4000 [13:06<2:33:48,  2.59s/it]

(A) Turning the camera left while filming something.
GT: (A) Turning the camera left while filming something
Part  Acc: 82.86%
Total Acc: 76.55%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.5 seconds. Carefully 

 11%|█         | 436/4000 [13:09<2:30:40,  2.54s/it]

(A) Stuffing something into something
GT: (A) Stuffing something into something
Part  Acc: 83.33%
Total Acc: 76.61%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8 seconds. Carefully 

 11%|█         | 437/4000 [13:11<2:22:09,  2.39s/it]

(B) Pushing something onto something
GT: (B) Pushing something onto something
Part  Acc: 83.78%
Total Acc: 76.66%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7 seconds. Carefully 

 11%|█         | 438/4000 [13:13<2:26:35,  2.47s/it]

(C) Dropping something onto something
GT: (C) Dropping something onto something
Part  Acc: 84.21%
Total Acc: 76.71%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6 seconds. Carefully 

 11%|█         | 439/4000 [13:15<2:19:38,  2.35s/it]

(A) Showing something next to something
GT: (A) Showing something next to something
Part  Acc: 84.62%
Total Acc: 76.77%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.3, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 5.0 seconds. Carefully 

 11%|█         | 440/4000 [13:18<2:28:59,  2.51s/it]

(A) Throwing something in the air and letting it fall
GT: (A) Throwing something in the air and letting it fall
Part  Acc: 85.00%
Total Acc: 76.82%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5 seconds. Carefully 

 11%|█         | 441/4000 [13:21<2:25:35,  2.45s/it]

(A) Bending something so that it deforms
GT: (A) Bending something so that it deforms
Part  Acc: 85.37%
Total Acc: 76.87%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.2, 4.3 seconds. Carefully 

 11%|█         | 442/4000 [13:23<2:17:46,  2.32s/it]

(A) Putting something in front of something.
GT: (A) Putting something in front of something
Part  Acc: 85.71%
Total Acc: 76.92%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0, 1.1, 1.1, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.1, 2.1, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.5, 2.5, 2.6, 2.6, 2.6, 2.6, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9 seconds. Carefully 

 11%|█         | 443/4000 [13:25<2:15:38,  2.29s/it]

(B) Holding something in front of something
GT: (B) Holding something in front of something
Part  Acc: 86.05%
Total Acc: 76.98%
-------------------------------------------------- Action Antonym --------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


prompt: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.
[INST] <Video><VideoHere></Video> [/INST] [INST] The video contains 128 frames sampled at 0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3, 1.4, 1.4, 1.5, 1.5, 1.6, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1, 2.2, 2.2, 2.2, 2.2, 2.3, 2.3, 2.4, 2.4, 2.4, 2.5, 2.5, 2.6, 2.6, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 2.9, 2.9, 3.0, 3.0, 3.1, 3.1, 3.2, 3.2, 3.2, 3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7, 3.7, 3.8, 3.8, 3.8, 3.8, 3.9, 3.9, 4.0, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4, 4.4, 4.4, 4.5, 4.5, 4.6, 4.6, 4.7, 4.7, 4.8, 4.8, 4.8 seconds. Carefully 

 11%|█         | 444/4000 [13:28<2:33:01,  2.58s/it]

(B) Trying but failing to attach something to something because it doesn't stick
GT: (B) Trying but failing to attach something to something because it doesn't stick
Part  Acc: 86.36%
Total Acc: 77.03%
-------------------------------------------------- Action Antonym --------------------------------------------------


In [None]:
save_path = args.model_dir+"/MVBench_test_"+args.model_pth+"/result"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(f"{save_path}.json", "w") as f:
    json.dump({
        "acc_dict": acc_dict,
        "res_list": res_list
    }, f)

In [None]:
final_res = dict()
correct = 0
total = 0
for k, v in acc_dict.items():
    final_res[k] = v[0] / v[1] * 100
    correct += v[0]
    total += v[1]    
final_res['Avg'] = correct / total * 100

print(final_res)

# with open("upload_leaderboard.json", "w") as f:
#     json.dump(final_res, f)

In [None]:
acc_path = args.model_dir+"/MVBench_test_"+args.model_pth+"/acc"
out = "AS	AP	AA	FA	UA	OE	OI	OS	MD	AL	ST	AC	MC	MA	SC	FP	CO	EN	ER	CI	Avg"
out1 = "AS		AP		AA		FA		UA		OE		OI		OS		MD		AL		ST		AC		MC		MA		SC		FP		CO		EN		ER		CI		Avg"
out2 = ""
correct = 0
total = 0
with open(f"{save_path}.json", "r") as f:
    json_data = json.load(f)
    for k, v in json_data["acc_dict"].items():
        correct += v[0]
        total += v[1]    
        out2 += f"{v[0]/v[1]*100:.2f}\t"
out2 += f"{correct/total*100:.2f}"
print(out)
print(out2)

with open(f"{acc_path}.txt", "w") as f:
    f.write(out1+"\n")
    f.write(out2)



In [None]:
def check_answer_egoschema(pred, qid):
    correct = 0
    answer_content = ans_dict[qid]['content'].lower()
    if answer_content[-1] == ".":
        answer_content = answer_content[:-1]
    if ans_dict[qid]['answer'].lower() in pred.lower():
        flag = True
        for kk in ["(A)", "(B)", "(C)", "(D)", "(E)"]:
            if kk != ans_dict[qid]['answer'].lower() and kk in pred.lower():
                flag = ans_dict
                break
        if flag:
            correct += 1
    elif answer_content in pred.lower():
        correct = 1
    elif answer_content.replace("a ", "") in pred.lower():
        correct = 1
    elif answer_content.replace("an ", "") in pred.lower():
        correct = 1
    return correct

def infer_egoschema(
        data_sample, system="", 
        question_prompt='', # add in the end of question
        answer_prompt=None, # add in the begining of answer
        return_prompt='',  # add in the begining of return message
        system_q=False, # whether add question in the system prompt for QFormer
        print_res=True,
        system_llm=False,
        num_segments=8,
    ):
    vid_path = os.path.join("shdd:s3://egoschema/videos", data_sample['video'])
    print(vid_path)
    video, msg = load_video(vid_path, num_segments=num_segments, return_msg=True)
    TC, H, W = video.shape
    video = video.reshape(1, TC//3, 3, H, W).to(cfg.device)
    
    video_list = []
    with torch.no_grad():
        video_emb = model.encode_long_video(video,[msg,],"")
    video_list.append(video_emb)
#     video_list.append(torch.zeros_like(video_emb))

    chat = EasyDict({
        "system": system,
        "roles": ("[INST]", "[/INST]"),
        "messages": [],
        "sep": ""
    })

    chat.messages.append([chat.roles[0], f"<Video><VideoHere></Video> [/INST]"])
    
    if system_llm:
        prompt = msg + system + data_sample['QA'][0]['q'] + question_prompt
    else:
        prompt = msg + data_sample['QA'][0]['q'] + question_prompt
    
    ask(prompt, chat)

    llm_message = answer(
        conv=chat, model=model, do_sample=False, 
        img_list=video_list, max_new_tokens=100, 
        answer_prompt=answer_prompt, print_res=print_res
    )[0]
    # remove potential explanation
    llm_message = return_prompt + llm_message.strip().split('\n')[0]
    print(llm_message)
    print(f"GT: {data_sample['QA'][0]['a']}")
    return llm_message

In [None]:
import csv
with open("./download/datasets/egoschema/EgoSchema.csv", mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)

    json_data = []
    ans_dict = {}
    
    for idx, msg in enumerate(reader):
        if idx == 0:
            print(msg)
            continue
            
        video = msg[1] + '.mp4'
        input_str = f"Question: {msg[3].capitalize()}\nOptions:\n"
    
        target_index = -1
        for i, candidate in enumerate(msg[5:]):
            option = chr(ord('A') + i)
            input_str += f"({option}) {candidate}\n"
            if candidate == msg[4]:
                target_index = i
            
        assert target_index != -1
        correct = chr(ord('A') + target_index)
        
        json_data.append({
            'video': video,
            "QA": [{
                "i": "",
                "q": input_str.strip(),
                "a": f"Answer: ({correct}) {msg[4]}",
            }]
        })

        ans_dict[idx - 1] = {
            'video': video,
            'answer': f"({correct})",
            'content': msg[4],
        }

In [None]:
#  position embedding
# num_frame = 16
# resolution = 224
# new_pos_emb = get_sinusoid_encoding_table(n_position=(resolution//16)**2*num_frame, cur_frame=num_frame)
# model.vision_encoder.encoder.pos_embed = new_pos_emb

correct = 0
total = 0
total_num = len(json_data)

output = ""

for idx, example in enumerate(tqdm(json_data)):
    start = time.time()
    llm_message = infer_egoschema(
        example, 
        "Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n", 
        question_prompt="\nOnly give the best option.", 
        answer_prompt="Best option:(",
        return_prompt='(',
        system_q=False,
        print_res=True,
        system_llm=False,
        num_segments=tot_frames
    )
    
    duration = time.time() - start
    output += (example["video"] + '\n')
    output += (llm_message + '\n')
    correct += check_answer_egoschema(llm_message, idx)
    total += 1
    print("Acc:", correct / total)
    print('-' * 20, f'{idx+1}/{total_num} done,', f'cost: {duration:.2f}s', '-' * 20)

In [None]:
save_path = args.model_dir+"/Egoschema_test_"+args.model_pth+"/result_subset"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path + ".txt", "a") as f:
    f.writelines(output)
    
acc_path = args.model_dir+"/Egoschema_test_"+args.model_pth+"/acc_subset"
with open(f"{acc_path}.txt", "w") as f:
    f.write("Acc: " + str(correct / total))

In [None]:
with open("./download/datasets/egoschema/questions.json", "r") as f:
    full_data = json.load(f)

full_egoschema = []
for data in full_data:
    video = data['q_uid'] + '.mp4'
    input_str = f"Question: {data['question'].capitalize()}\nOptions:\n"

    for i, candidate in enumerate(['option 0', 'option 1', 'option 2', 'option 3', 'option 4']):
        option = chr(ord('A') + i)
        input_str += f"({option}) {data[candidate]}\n"
    
    full_egoschema.append({
        'q_uid': data['q_uid'],
        'video': video,
        "QA": [{
            "i": "",
            "q": input_str.strip(),
            "a": "",
        }]
    })


def infer_full_egoschema(
        data_sample, system="", 
        question_prompt='', # add in the end of question
        answer_prompt=None, # add in the begining of answer
        return_prompt='',  # add in the begining of return message
        system_q=False, # whether add question in the system prompt for QFormer
        print_res=True,
        system_llm=False,
        num_segments=8,
    ):
    vid_path = os.path.join("shdd:s3://egoschema/videos", data_sample['video'])
    print(vid_path)
    video, msg = load_video(vid_path, num_segments=num_segments, return_msg=True)
    TC, H, W = video.shape
    video = video.reshape(1, TC//3, 3, H, W).to(cfg.device)
    
    video_list = []
    with torch.no_grad():
        video_emb = model.encode_long_video(video,[msg,],"")
    video_list.append(video_emb)

    chat = EasyDict({
        "system": system,
        "roles": ("[INST]", "[/INST]"),
        "messages": [],
        "sep": ""
    })

    chat.messages.append([chat.roles[0], f"<Video><VideoHere></Video> [/INST]"])
    
    if system_llm:
        prompt = msg + system + data_sample['QA'][0]['q'] + question_prompt
    else:
        prompt = msg + data_sample['QA'][0]['q'] + question_prompt
    
    ask(prompt, chat)

    llm_message = answer(
        conv=chat, model=model, do_sample=False, 
        img_list=video_list, max_new_tokens=100, 
        answer_prompt=answer_prompt, print_res=print_res
    )[0]
    # remove potential explanation
    llm_message = return_prompt + llm_message.strip().split('\n')[0]
    print(llm_message)
    return llm_message


#  position embedding
# num_frame = 16
# resolution = 224
# new_pos_emb = get_sinusoid_encoding_table(n_position=(resolution//16)**2*num_frame, cur_frame=num_frame)
# model.vision_encoder.encoder.pos_embed = new_pos_emb


ans_dict = {}

for idx, example in enumerate(tqdm(full_egoschema)):
    start = time.time()
    llm_message = infer_full_egoschema(
        example, 
        "Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n", 
        question_prompt="\nOnly give the best option.", 
        answer_prompt="Best option:(",
        return_prompt='(',
        system_q=False,
        print_res=True,
        system_llm=False,
        num_segments=tot_frames,
    )

    assert llm_message[0] == '(' and llm_message[2] == ')'
    ans = ord(llm_message[1]) - ord('A')
    assert ans in [0, 1, 2, 3, 4]
    ans_dict[example['q_uid']] = ans

In [None]:
save_path = args.model_dir+"/Egoschema_test_"+args.model_pth+"/result"
with open(save_path + ".json", "w") as f:
    json.dump(ans_dict, f)

# Then you can run https://github.com/egoschema/EgoSchema/blob/main/validate.py to get the score
# python3 validate.py --f ./your_prediction.json

In [None]:
import pysubs2
import re
from torchvision import transforms
from torchvision.transforms import PILToTensor

def clean_text(text):
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]\[\]', '', text)
    return cleaned_text

def read_vtt_and_concatenate(file_path, tokenizer, max_len=4096):
    subs = pysubs2.load(file_path, encoding="utf-8")
        
    prev = ""
    subtitles = []
    for caption in subs:
        # Split the caption text into individual lines
        lines = caption.text.split('\n')
        for line in lines:
            # Clean the text and check for repetition
            line = clean_text(line)
            if prev != line and line:
                subtitles.append(line)
                prev = line

    # Join subtitles to check length
    full_text = ' '.join(subtitles)
    tokenized_ids = tokenizer(full_text, add_special_tokens=False).input_ids

    # If the tokenized length is within the limit, return the full text
    if len(tokenized_ids) <= max_len:
        return full_text

    # Otherwise, we need to trim the text to fit within the limit
    # We will keep the first half and the last half
    half_len = max_len // 2
    start_text = ' '.join(subtitles[:half_len])
    end_text = ' '.join(subtitles[-half_len:])
    
    # Re-tokenize to ensure the total length is within the limit
    start_tokenized_ids = tokenizer(start_text, add_special_tokens=False).input_ids
    end_tokenized_ids = tokenizer(end_text, add_special_tokens=False).input_ids

    # Adjust the lengths to fit within the max_len
    while len(start_tokenized_ids) + len(end_tokenized_ids) > max_len:
        if len(start_tokenized_ids) > len(end_tokenized_ids):
            start_tokenized_ids.pop()
        else:
            end_tokenized_ids.pop(0)
    
    # Combine the adjusted parts
    adjusted_text = tokenizer.decode(start_tokenized_ids) + ' ... ' + tokenizer.decode(end_tokenized_ids)
    
    return adjusted_text

In [None]:
class MME_dataset(Dataset):
    def __init__(
        self, 
        data_prefix="shdd:s3://VideoMME_0629/processed_1fps",
        subtitle_prefix="./download/datasets/videomme/subtitle_0629",
        anno_path="./download/datasets/videomme/Video-MME_0629.json",
        frame_dict_path="./download/datasets/videomme/video_mme_1fps.json",
        num_segments=16, 
        stride=0, # if stride >= 1, will return all frames according to FPS (1/stride), else return partial frames
        resolution=224, 
        max_subtitle_len=4096, # max_tokens for subtitle
    ):
        self.data_prefix = data_prefix
        self.subtitle_prefix = subtitle_prefix
        with open(anno_path, 'r') as f:
            self.data_list = json.load(f)
        with open(frame_dict_path, 'r') as f:
            self.frame_dict = json.load(f)
        
        self.num_segments = num_segments
        self.stride = stride
        self.resolution = resolution
        self.max_subtitle_len = max_subtitle_len

        # transform
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        self.transform = transforms.Compose([
            transforms.Lambda(lambda x: x.float().div(255.0)),
            transforms.Normalize(mean, std)
        ])
    
    def __str__(self):
        task_dict = {}
        total = 0
        for data in self.data_list:
            if data['duration_category'] not in ans_dict:
                task_dict[data['duration_category']] = {}
            for q in data['questions']:
                if q['task_type'] not in ans_dict[data['duration_category']]:
                    ans_dict[data['duration_category']][q['task_type']] = 0
                ans_dict[data['duration_category']][q['task_type']] += 1
                total += 1

        res = f"There are {len(self.data_list)} videos.\n"
        res += f"There are {total} QAs.\n"
        for k, v in task_dict.items():
            res += f"------{k}------\n"
            for kk, vv in task_dict.items():
                res += f"{kk}: {vv}\n"
                
        return res.rstrip()
        
    def __len__(self):
        return len(self.data_list)
    
    def get_index(self, max_frame):
        start_idx = 0
        end_idx = max_frame - 1
        seg_size = float(max_frame - start_idx) / self.num_segments
        frame_indices = np.array([
            max(int(start_idx + (seg_size / 2) + np.round(seg_size * idx)), end_idx)
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def get_time_stamp(self, video_path):
        timestamp = video_path.split("_")[-1].split(".jpg")[0]
        minutes, seconds = map(int, timestamp.split(":"))
        total_seconds = minutes * 60 + seconds
        return total_seconds

    def read_frame(self, video_name):
        full_frame_list = []
        for p in self.frame_dict[video_name]:
            full_frame_list.append(os.path.join(self.data_prefix, video_name, 'frames', p))
            
        images_group = list()
        time_list = []
        if self.stride >= 1 and (len(full_frame_list) / self.stride) > self.num_segments:
            frame_list = full_frame_list[::self.stride]
        else:
            # if len(full_frame_list) < self.num_segments: # return all frames if not, seem to be a little lower
            #     frame_list = full_frame_list
            # else:
            frame_indices = get_index(len(full_frame_list), self.num_segments)
            frame_list = [full_frame_list[idx] for idx in frame_indices]
        # print(frame_list)
        
        for frame_path in frame_list:
            time_stamp = self.get_time_stamp(frame_path)
            time_list.append(time_stamp)
            if "s3://" in frame_path:
                img_bytes = client.get(frame_path)
                img = Image.open(io.BytesIO(img_bytes))
            else:
                img = Image.open(frame_path)
            img = img.resize((resolution, resolution))
            img = PILToTensor()(img).unsqueeze(0)
            images_group.append(img)
        torch_imgs = self.transform(torch.vstack(images_group))
        sec = ", ".join(map(str, time_list))
        time_instruction = f"The video contains {len(time_list)} frames sampled at {sec} seconds. "
        print(torch_imgs.shape)
        return torch_imgs, time_instruction

    def qa_template(self, data):
        question = f"Question: {data['question']}\n"
        question += "Options:\n"
        answer = data['answer']
        answer = f"({answer}) {data['options'][ord(answer) - ord('A')][3:]}"
        for idx, c in enumerate(data['options']):
            cur_choice, cur_text = c[0], c[3:]
            question += f"({cur_choice}) {cur_text}\n"
        question = question.rstrip()
        return question, answer

    def __getitem__(self, idx):
        video_name = self.data_list[idx]['videoID']
        torch_imgs, time_instruction = self.read_frame(video_name)
        duration_category = self.data_list[idx]['duration']
        qa_list = []
        for qa in self.data_list[idx]['questions']:
            qa_list.append(self.qa_template(qa))

        subtitle = ""
        try:
            subtitle_path = os.path.join(self.subtitle_prefix, video_name + ".srt")
            if os.path.exists(subtitle_path):
                subtitle = read_vtt_and_concatenate(subtitle_path, model.mistral_tokenizer, self.max_subtitle_len)
        except Exception:
            subtitle = ""
            print(f"Error for {subtitle_path}")
            
        return {
            'subtitle': subtitle,
            'video': torch_imgs, 
            'time_instruction': time_instruction,
            'qa_list': qa_list,
            'duration_category': duration_category
        }

    
def infer_mme(
        data_sample, system="", 
        question_prompt='', # add in the end of question
        answer_prompt=None, # add in the begining of answer
        return_prompt='',  # add in the begining of return message
        system_q=False, # whether add question in the system prompt for QFormer
        print_res=True,
        system_llm=False,
        no_qformer_instruction=False,
        qformer_instruction=None,
        add_subtitle=False,
    ):
    assert system_q == False, "do not support system_q now"
    video = data_sample["video"]
    msg = data_sample["time_instruction"]
    T_, C, H, W = video.shape
    video = video.reshape(1, T_, C, H, W).to(cfg.device)
    
    video_list = []
    with torch.no_grad():
        video_emb = model.encode_long_video(video,[msg,],"")
    video_list.append(video_emb[0].unsqueeze(0))
    print(video_list[0].shape)

    pred_list = []
    gt_list = []
    for idx, qa in enumerate(data_sample['qa_list']):
        print(f"----------qa_{idx}---------", flush=True)
        chat = EasyDict({
            "system": system,
            "roles": ("[INST]", "[/INST]"),
            "messages": [],
            "sep": ""
        })
        
        if add_subtitle and data_sample['subtitle'] != '':
            subtitle = f"This video's subtitles are listed below: {data_sample['subtitle']}"
            chat.messages.append([chat.roles[0], f"{subtitle}\n<Video><VideoHere></Video> [/INST]"])
        else:
            chat.messages.append([chat.roles[0], f"<Video><VideoHere></Video> [/INST]"])
        
        if system_llm:
            prompt = msg + system + qa[0] + question_prompt
        else:
            prompt = msg + qa[0] + question_prompt
        
        ask(prompt, chat)
    
        llm_message = answer(
            conv=chat, model=model, do_sample=False, 
            img_list=video_list, max_new_tokens=256, 
            answer_prompt=answer_prompt, print_res=print_res
        )[0]
        # remove potential explanation
        llm_message = return_prompt + llm_message.strip().split('\n')[0]
        print(f"Pred: {llm_message}", flush=True)
        print(f"GT: {qa[1]}", flush=True)
        pred_list.append(llm_message[1])
        gt_list.append(qa[1][1])
    return pred_list, gt_list

In [None]:
stride = 0
max_subtitle_len=8192
data_prefix = "shdd:s3://VideoMME_0629/processed_1fps"
anno_path = "./download/datasets/videomme/Video-MME_0629.json"
frame_dict_path = "./download/datasets/videomme/video_mme_1fps.json"
dataset = MME_dataset(
    data_prefix=data_prefix, 
    anno_path=anno_path, 
    frame_dict_path=frame_dict_path,
    num_segments=tot_frames, 
    stride=stride,
    resolution=resolution,
    max_subtitle_len=max_subtitle_len,
)

with open(anno_path, 'r') as f:
    res_json_data = json.load(f)
    

    
# Only Vision Information

correct = 0
total = 0
res_list = []
acc_dict = {}

for idx, example in enumerate(tqdm(dataset)):
    duration_category = example['duration_category']
    if duration_category not in acc_dict:
        acc_dict[duration_category] = [0, 0] # correct, total
    qa_count = len(example['qa_list'])
    acc_dict[duration_category][1] += qa_count
    total += qa_count

    
    pred_list, gt_list = infer_mme(
        example, 
        "Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n", # newPrompt2
        question_prompt="\nOnly give the best option.",  # prompt3
        answer_prompt="Best option:(",
        return_prompt='(',
        system_q=False,
        print_res=True,
        system_llm=True,
        add_subtitle=False,
    )
    
    res_list.append({
        'pred': pred_list,
        'gt': gt_list
    })
    qa_idx = 0
    for pred, gt in zip(pred_list, gt_list):
        if pred == gt:
            acc_dict[duration_category][0] += 1
            correct += 1
        res_json_data[idx]['questions'][qa_idx]['response'] = pred
        qa_idx += 1
    print(f"Part  Acc: {acc_dict[duration_category][0] / acc_dict[duration_category][1] * 100 :.2f}%")
    print(f"Total Acc: {correct / total * 100 :.2f}%")
    print('-' * 50, duration_category, '-' * 50)
    

    
    
save_path = args.model_dir+"/VideoMME_test_"+args.model_pth+"/Wo_result"
acc_path = args.model_dir+"/VideoMME_test_"+args.model_pth+"/Wo_acc"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
os.makedirs(os.path.dirname(save_path), exist_ok=True)
category_list = ["short", "medium", "long"]

with open(f"{save_path}.json", "w") as f:
    json.dump({
        "acc_dict": acc_dict,
        "res_list": res_list
    }, f)

with open(f"{save_path}_full.json", "w") as f:
    json.dump(res_json_data, f)

with open(f"{acc_path}.txt", "w") as f:
    f.write("Acc: " + str(round(100 * correct / total, 1)) + "\n")
    for duration_category in category_list:
        f.write(duration_category + " Acc: " + str(round(100 * acc_dict[duration_category][0] / acc_dict[duration_category][1], 1)) + "\n")
        
        
        

        
        
# With Subtitle
correct = 0
total = 0
res_list = []
acc_dict = {}

for idx, example in enumerate(tqdm(dataset)):
    duration_category = example['duration_category']
    if duration_category not in acc_dict:
        acc_dict[duration_category] = [0, 0] # correct, total
    qa_count = len(example['qa_list'])
    acc_dict[duration_category][1] += qa_count
    total += qa_count

    
    pred_list, gt_list = infer_mme(
        example, 
        "Carefully watch the video, read the related subtitles and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n", # newPrompt2
        question_prompt="\nOnly give the best option.",  # prompt3
        answer_prompt="Best option:(",
        return_prompt='(',
        system_q=False,
        print_res=True,
        system_llm=True,
        add_subtitle=True,
    )
    
    res_list.append({
        'pred': pred_list,
        'gt': gt_list
    })
    qa_idx = 0
    for pred, gt in zip(pred_list, gt_list):
        if pred == gt:
            acc_dict[duration_category][0] += 1
            correct += 1
        res_json_data[idx]['questions'][qa_idx]['response'] = pred
        qa_idx += 1
    print(f"Part  Acc: {acc_dict[duration_category][0] / acc_dict[duration_category][1] * 100 :.2f}%")
    print(f"Total Acc: {correct / total * 100 :.2f}%")
    print('-' * 50, duration_category, '-' * 50)
    
    
    

save_path = args.model_dir+"/VideoMME_test_"+args.model_pth+"/WithSub_result"
acc_path = args.model_dir+"/VideoMME_test_"+args.model_pth+"/WithSub_acc"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
os.makedirs(os.path.dirname(save_path), exist_ok=True)
category_list = ["short", "medium", "long"]

with open(f"{save_path}.json", "w") as f:
    json.dump({
        "acc_dict": acc_dict,
        "res_list": res_list
    }, f)

with open(f"{save_path}_full.json", "w") as f:
    json.dump(res_json_data, f)

with open(f"{acc_path}.txt", "w") as f:
    f.write("Acc: " + str(round(100 * correct / total, 1)) + "\n")
    for duration_category in category_list:
        f.write(duration_category + " Acc: " + str(round(100 * acc_dict[duration_category][0] / acc_dict[duration_category][1], 1)) + "\n")