In [1]:
from PIL import Image
import pandas as pd
import os

def open_images_from_directory(directory):
    # List all files in the directory
    files = os.listdir(directory)
    img = []
    for file in (files):
        # Check if the file is an image and only take the first 4 frames
        if len(img) < 4: 
            if file.endswith('.jpg'):
                try:
                    image_path = os.path.join(directory, file)
                    img.append(Image.open(image_path))
                except Exception as e:
                    print(f"Error opening image {file}: {e}")
        else:
            break
    return img


In [2]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForCausalLM
DEVICE = "cuda" if torch.cuda.is_available() else "CPU"
print(DEVICE)

cuda


In [3]:
def create_input_messages(image_array):
    # Generate image content based on the length of image_array
    image_contents = [{"type": "image"} for _ in image_array]
    
    # Add the text message to describe the images
    
    text_content = {"type": "text", "text": "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place."}
            
    # Combine image contents and text content
    messages = [{"role": "user", "content": image_contents + [text_content]}]
    
    return messages

**SmolVLM-Instruct**

In [None]:
# Initialize processor, model and load PEFT adapter
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
model.load_adapter("HuggingFaceTB/SmolVLM-Instruct-DPO")



In [13]:
# Load the CSV file into a DataFrame
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze")
for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze/" + sample)
    messages = create_input_messages(img)
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    
    # Process inputs
    inputs = processor(text=prompt, images=img, return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=500)[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'SmolVLM-Instruct(B)'] = int(float(generated_texts))
    
    # Delete the variables to free up memory
    del img, messages, prompt, inputs, generated_ids, generated_texts

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file
df.to_csv(dataset_path, index=False)


Accuracy:  0.75


In [14]:
def create_input_messages(image_array):
    # Generate image content based on the length of image_array
    image_contents = [{"type": "image"} for _ in image_array]
    
    # Add the text message to describe the images
    
    text_content = {"type": "text", "text": "Analyze this sequence of frames. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place."}
            
    # Combine image contents and text content
    messages = [{"role": "user", "content": image_contents + [text_content]}]
    
    return messages
    # Load the CSV file into a DataFrame
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/nogaze")
for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/nogaze/" + sample)
    messages = create_input_messages(img)
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    
    # Process inputs
    inputs = processor(text=prompt, images=img, return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=500)[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'SmolVLM-Instruct(B)'] = int(float(generated_texts))
    
    # Delete the variables to free up memory
    del img, messages, prompt, inputs, generated_ids, generated_texts

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file
df.to_csv(dataset_path, index=False)


Accuracy:  0.5


In [15]:
# Tried a few shot prompt with one example but did not even output if it was a picking or placing task
messages = [
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': "Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description."},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'}
        ]
    },
    {
        'role': 'assistant',
        'content': [
            {'type': 'text', 'text': "0"}
        ]
    },
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': "Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description."},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'}
        ]
    }
]

# Load images
img_prompt = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/mcq/all_frames/Pick up jug from table_58929010438104.00")
img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze/Place_P0001_a68492d5_new_2")

# Combine image lists
if not isinstance(img_prompt, list) or not isinstance(img, list):
    raise ValueError("Images must be loaded as lists.")
img_new = img_prompt + img

# Prepare the prompt
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Prepare inputs
inputs = processor(text=prompt, images=img_new, return_tensors="pt").to(DEVICE)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Output results
print(generated_texts)


["User: Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description.<image>Assistant: 0\nUser: Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description.<image>Assistant: A person is holding a blue thermos in front of a wall with multiple monitors."]


**llava-hf/llava-onevision-qwen2-7b-ov-hf**

In [21]:
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

# Load the model in half-precision
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Load the CSV file into a DataFrame
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze")
messages = [{'role': 'user',
  'content': [{'type': 'image'},
   {'type': 'image'},
   {'type': 'image'},
   {'type': 'image'},
   {'type': 'text',
    'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place."}]}]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze/" + sample)
    
    inputs = processor(images=img, text=prompt, padding=True, return_tensors="pt").to(model.device, torch.float16)
    inputs = inputs.to(DEVICE)
    print(sample, 'is in device') 
    #Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=100)[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print("generated text: ", generated_texts)
    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    
    
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'llava(B)'] = int(float(generated_texts))
    df.to_csv(dataset_path, index=False)
    # Delete the variables to free up memory
    del img, inputs, generated_ids, generated_texts, action_class

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Pick up_P0001_a68492d5_new_1 is in device


In [None]:
# Load the CSV file into a DataFrame
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/nogaze")
messages = [{'role': 'user',
  'content': [{'type': 'image'},
   {'type': 'image'},
   {'type': 'image'},
   {'type': 'image'},
   {'type': 'text',
    'text': "Analyze this sequence of frames. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place."}]}]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/nogaze/" + sample)
    
    inputs = processor(images=img, text=prompt, padding=True, return_tensors="pt").to(model.device, torch.float16)
    inputs = inputs.to(DEVICE)
    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=100)[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    print("generated text: ", generated_texts)
    
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'llava(B)'] = int(float(generated_texts))
    df.to_csv(dataset_path, index=False)
    # Delete the variables to free up memory
    del img, inputs, generated_ids, generated_texts, action_class

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file



Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


In [None]:
# Trying few shot
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames")
messages = [
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': "Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description."},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'}
        ]
    },
    {
        'role': 'assistant',
        'content': [
            {'type': 'text', 'text': "0"}
        ]
    },
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': "Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description."},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'},
            {'type': 'image'}
        ]
    }
]


prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
for sample in data:
    # Open images and create input messages
    img_prompt = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/mcq/all_frames/Pick up jug from table_58929010438104.00")
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames" + sample)
    img_new = img_prompt + img
    
    inputs = processor(images=img_new, text=prompt, padding=True, return_tensors="pt").to(model.device, torch.float16)
    inputs = inputs.to(DEVICE)
    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=100)[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    print("generated text: ", generated_texts)
    
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
        
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'llava(few)'] = int(float(generated_texts))
    df.to_csv(dataset_path, index=False)
    # Delete the variables to free up memory
    del img, inputs, generated_ids, generated_texts, action_class

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file



**"microsoft/Phi-3.5-vision-instruct"**

In [5]:
from transformers import AutoModelForCausalLM 
model_id = "microsoft/Phi-3.5-vision-instruct" 

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='flash_attention_2' #'eager'   
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



preprocessor_config.json:   0%|          | 0.00/442 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [6]:
# Load the CSV file into a DataFrame
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze")

for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze/" + sample)
    prompt = processor.tokenizer.apply_chat_template(
      [
            {"role": "user", "content": "<|image_1|>\n<|image_2|>\n<|image_3|>\n<|image_4|>\nAnalyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place."},
        ], 
      tokenize=False, 
      add_generation_prompt=True
    )
    inputs = processor(prompt, img, return_tensors="pt").to("cuda:0") 
    
    generation_args = { 
        "max_new_tokens": 1000, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 
    
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    print("generated text: ", generated_texts)
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'Phi3.5(B)'] = int(float(generated_texts))
    
    # Delete the variables to free up memory
    del img, prompt, inputs, generate_ids, generated_texts

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file
df.to_csv(dataset_path, index=False)


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


generated text:  1




generated text:  1




generated text:  0




generated text:  1




generated text:  1




generated text:  0




generated text:  1




generated text:  1
Accuracy:  0.5


In [8]:
# Load the CSV file into a DataFrame
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/nogaze")

for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/nogaze/" + sample)
    prompt = processor.tokenizer.apply_chat_template(
      [
            {"role": "user", "content": "<|image_1|>\n<|image_2|>\n<|image_3|>\n<|image_4|>\nAnalyze this sequence of frames. Identify whether the user is performing a pick or place task. Output '0' if it is pick or '1' if it is place."},
        ], 
      tokenize=False, 
      add_generation_prompt=True
    )
    inputs = processor(prompt, img, return_tensors="pt").to("cuda:0") 
    
    generation_args = { 
        "max_new_tokens": 1000, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 
    
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    print("generated text: ", generated_texts)
    if action_class in generated_texts:
        correct += 1
    else:
        wrong += 1
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'Phi3.5(B)'] = int(float(generated_texts))
    
    # Delete the variables to free up memory
    del img, prompt, inputs, generate_ids, generated_texts

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file
df.to_csv(dataset_path, index=False)




generated text:  1




generated text:  1




generated text:  1




generated text:  1




generated text:  1




generated text:  0




generated text:  1




generated text:  1
Accuracy:  0.375


In [None]:

data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames")

for sample in data:
    # Open images and create input messages
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames" + sample)
    prompt = processor.tokenizer.apply_chat_template(
      [
            {"role": "user", "content": "<|image_1|>\n<|image_2|>\n<|image_3|>\n<|image_4|>\nAnalyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task and explain."},
        ], 
      tokenize=False, 
      add_generation_prompt=True
    )
    inputs = processor(prompt, img, return_tensors="pt").to("cuda:0") 
    
    generation_args = { 
        "max_new_tokens": 1000, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 
    
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

    print("generated text: ", generated_texts)



In [13]:
# Try few shot
dataset_path = '/home/ttyh/hot3d/hot3d/dataset/mcq/gazevsnogaze.csv'
df = pd.read_csv(dataset_path)

correct = 0
wrong = 0
data = os.listdir("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze")
messages = [
            {"role": "user", "content": "<|image_1|>\n<|image_2|>\n<|image_3|>\n<|image_4|>\nAnalyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task and explain."},
            {"role": "assistant", "content": "0"},
            {"role": "user", "content": "<|image_5|>\n<|image_6|>\n<|image_7|>\n<|image_8|>\nAnalyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task and explain."},
]

#"<|user|>\n<|image_1|>\n<|image_2|>\n<|image_3|>\n<|image_4|>\n{Analyse the sequence of frames where the red spot shows the user's eye gaze. Always output only '0' if the task is pick or '1' if the task is place. Do not include any additional text or description.<|end|>\n<|assistant|>\n"0


for sample in data:
    # Open images and create input messages
    img_prompt = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/mcq/all_frames/Pick up jug from table_58929010438104.00")
    img = open_images_from_directory("/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/gaze/" + sample)
    img_new = img_prompt + img
    prompt = processor.tokenizer.apply_chat_template(messages, 
      tokenize=False, 
      add_generation_prompt=True
    )
    inputs = processor(prompt, img_new, return_tensors="pt").to("cuda:0") 
    
    generation_args = { 
        "max_new_tokens": 1000, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 
    
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    print("generate_ids: ", generate_ids)
    
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    generated_texts = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

    # Check if the generated text matches the action class
    action_class = str(df[df['Folder name'] == sample]['action_class'].iloc[0])
    print("generated text: ", generated_texts)
    if action_class in generated_texts:
        print("correct")
        correct += 1
    else:
        wrong += 1
        print("wrong")
    
    # Update the DataFrame with the generated text
    df.loc[df['Folder name'] == sample, 'Phi3.5(few)'] = int(float(generated_texts))
    
    # Delete the variables to free up memory
    del img, prompt, inputs, generate_ids, generated_texts

print("Accuracy: ", correct / (wrong + correct))

# Save the updated DataFrame back to the CSV file
df.to_csv(dataset_path, index=False)




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
correct




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
correct




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
correct




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
correct




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
wrong




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
wrong




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
wrong




generate_ids:  tensor([[    1, 32010, 29871,  ..., 29900, 32007, 32000]], device='cuda:0')
generated text:  0
wrong
Accuracy:  0.5
