In [3]:
from PIL import Image
import os

def open_images_from_directory(directory, extensions=('.jpg',)):
    """Open every matching image file in ``directory`` and return them in frame order.

    Args:
        directory: Path of the folder to scan (non-recursive).
        extensions: File-name suffix (or tuple of suffixes) to accept. Matching
            is case-insensitive, so the default also picks up ``.JPG`` files.

    Returns:
        A list of opened PIL images, ordered by natural sort of the file name
        (``frame_2`` comes before ``frame_10``). Files that fail to open are
        reported on stdout and skipped.
    """
    import re  # local import: only needed for the natural-sort key

    def _natural_key(name):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always digit runs and can be compared as ints.
        tokens = [int(tok) if idx % 2 else tok.lower()
                  for idx, tok in enumerate(re.split(r'(\d+)', name))]
        # The raw name breaks ties between names that differ only by case.
        return tokens, name

    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    img = []
    # os.listdir returns entries in arbitrary order; the frames are consumed as
    # a sequence, so sort them to make the order deterministic.
    for file in sorted(os.listdir(directory), key=_natural_key):
        # Check if the file is an image
        if not file.lower().endswith(suffixes):
            continue
        try:
            image_path = os.path.join(directory, file)
            img.append(Image.open(image_path))
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error opening image {file}: {e}")
    return img


In [4]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForCausalLM
# torch device strings are lowercase; "CPU" is rejected by torch.device() / .to().
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [5]:
def create_input_messages(image_array, prompt=None):
    """Build a single-turn chat payload with one image slot per frame.

    Args:
        image_array: Iterable of frames; only its length matters here, each
            element contributes one ``{"type": "image"}`` placeholder.
        prompt: Text appended after the image placeholders. When omitted, the
            default pick/place gaze-analysis question is used.

    Returns:
        A one-element list holding the user message, whose ``content`` is the
        image placeholders followed by the text entry.
    """
    if prompt is None:
        prompt = "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."

    # One placeholder per frame, in the same order as the images.
    image_contents = [{"type": "image"} for _ in image_array]

    # The text question always comes after all image placeholders.
    text_content = {"type": "text", "text": prompt}

    return [{"role": "user", "content": image_contents + [text_content]}]

In [4]:
# Folder holding the frames of one clip; every .jpg inside it is loaded below.
directory = '/home/ttyh/hot3d/hot3d/dataset/mcq/all_frames/Pick up_P0001_a68492d5_new_8'

# `img` is notebook-level state: later inference cells read it directly.
img = open_images_from_directory(directory)
print(img)

# One {"type": "image"} placeholder per loaded frame, followed by the text question.
messages = create_input_messages(img)
print(messages)

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BE6C6A2A70>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B63E80>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B63EB0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B63FA0>]
[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'text', 'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."}]}]


In [5]:

# Set your directory path here
directory = '/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_1'

# Rebinds the notebook-level `img`; the recorded run of this cell loaded 5 frames.
img = open_images_from_directory(directory)
print(img)

# Rebinds the notebook-level `messages` to match the frames just loaded.
messages = create_input_messages(img)
print(messages)

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B63FD0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B0ABC0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B0AC50>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B0ADD0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x71BD99B0A950>]
[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'text', 'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."}]}]


**SmolVLM-Instruct**

In [6]:
# Initialize processor, model and load PEFT adapter
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    # flash-attention is requested only when DEVICE is "cuda"; otherwise eager attention is used.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
# Attach the "SmolVLM-Instruct-DPO" adapter on top of the base checkpoint.
model.load_adapter("HuggingFaceTB/SmolVLM-Instruct-DPO")



processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


config.json:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x71be6e46b4c0>>
Traceback (most recent call last):
  File "/home/ttyh/hot3d/hot3d/.pixi/envs/default/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [5]:
# Prepare inputs
# Uses the notebook-level `processor`, `model`, `messages` and `img` left by earlier cells.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=img, return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=500)
# The decoded text echoes the prompt before the answer (see the printed output),
# because the prompt tokens are not sliced off `generated_ids` here.
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

print(generated_texts[0])

User:<image>Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination.
Assistant: The user is performing a pick up task. The item being picked up is the milk carton.


**Idefics**

In [5]:
# Only 2 images: drop the first two frames and keep the rest
# (this leaves 2 frames when the folder holds 4, as in the recorded run).
directory = "/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_8"

img = open_images_from_directory(directory)
img = img[2:]

In [6]:
# Rebuild the chat payload so the number of image placeholders matches the trimmed `img`.
messages = create_input_messages(img)
print(messages)

[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'image'}, {'type': 'text', 'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."}]}]


In [7]:
# Rebinds the notebook-level `processor` and `model` to the Idefics3-8B-Llama3 checkpoint,
# loaded in bfloat16 and moved to DEVICE.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/Idefics3-8B-Llama3", torch_dtype=torch.bfloat16
).to(DEVICE)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Render the chat payload to a prompt string and tokenize it together with the frames.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=img, return_tensors="pt")
#inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
inputs = inputs.to(DEVICE)

# Generate
generated_ids = model.generate(**inputs, max_new_tokens=100)
# The whole sequence is decoded, so the printed text starts with the echoed prompt.
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)

["User:<image>Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination.\nAssistant: The task is a place task. The user is pointing at a red, white, and blue carton that is on a brown cart. The carton is being placed on the cart."]


In [8]:
# Author's note: CUDA out of memory for 3 images even when the inputs were cast to float16.
directory = '/home/ttyh/hot3d/hot3d/dataset/mcq/all_frames/Pick up cup from table_56743856578795.00'
img = open_images_from_directory(directory)
# Keep only the first three frames.
img = img[:3]
messages = create_input_messages(img)
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# Cast floating-point inputs to the precision the model was actually loaded with,
# instead of a hard-coded float16 that may not match the weights.
inputs = processor(images=img, text=prompt, return_tensors="pt").to(model.device, model.dtype)
generated_ids = model.generate(**inputs, max_new_tokens=100)
# The whole sequence is decoded, so the printed text starts with the echoed prompt.
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)

["User:<image>Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination.\nAssistant: The task is a place task. The rubik's cube is being placed on the table."]


In [9]:
# Author's note: CUDA out of memory with 3 images.
directory = "/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_8"

img = open_images_from_directory(directory)
# Drop the first frame and keep the rest (3 frames in the recorded run).
img = img[1:]

messages = create_input_messages(img)
print(messages)

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=img, return_tensors="pt")
#inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
inputs = inputs.to(DEVICE)

# Generate
generated_ids = model.generate(**inputs, max_new_tokens=100)
# The whole sequence is decoded, so the printed text starts with the echoed prompt.
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)

[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'text', 'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."}]}]
["User:<image>Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination.\nAssistant: The task is placing. The user is putting a milk carton on the shelf."]


**llava-hf/llava-onevision-qwen2-7b-ov-hf**

In [6]:
# Load every frame of the clip (no slicing this time) for the LLaVA-OneVision run.
directory = "/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_8"

img = open_images_from_directory(directory)

# One image placeholder per frame plus the shared text question.
messages = create_input_messages(img)
print(messages)

[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'text', 'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."}]}]


In [7]:
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

# Load the model in half-precision
# device_map="auto" lets the loader decide device placement; read `model.device` afterwards
# rather than assuming DEVICE. Rebinds the notebook-level `model` and `processor`.
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# Send the inputs to the device the model actually lives on and cast floating-point
# tensors to float16 to match the half-precision weights. No extra `.to(DEVICE)`
# afterwards: `model.device` is the source of truth for a device_map-loaded model.
inputs = processor(images=img, text=prompt, padding=True, return_tensors="pt").to(model.device, torch.float16)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

print(generated_texts[0])

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


In [None]:
# Author's note: CUDA out of memory for 5 images.
directory = '/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_1'

img = open_images_from_directory(directory)
print(img)

messages = create_input_messages(img)
print(messages)

# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# Send the inputs to the device the model actually lives on and cast floating-point
# tensors to float16 to match the half-precision weights. No extra `.to(DEVICE)`
# afterwards: `model.device` is the source of truth for a device_map-loaded model.
inputs = processor(images=img, text=prompt, padding=True, return_tensors="pt").to(model.device, torch.float16)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

print(generated_texts[0])

**"microsoft/Phi-3.5-vision-instruct"**

In [6]:
from transformers import AutoModelForCausalLM 
# Checkpoint used for both the model and its processor below.
model_id = "microsoft/Phi-3.5-vision-instruct" 

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
# Rebinds the notebook-level `model`; placement is fixed to "cuda" here rather than DEVICE.
model = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='flash_attention_2'    
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [6]:
# Build one numbered image tag per frame ("<|image_1|>\n<|image_2|>\n..."), 1-based.
# `img` is whatever the last executed image-loading cell left in the notebook namespace.
placeholder = ''
for i in range(len(img)):
    placeholder += f"<|image_{i+1}|>\n"

# Here the message content is a plain string (tags + question), unlike the list-of-dicts
# payload that create_input_messages builds for the other models.
messages = [
    {"role": "user", "content": placeholder+"Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."},
]

# The chat template is applied through the tokenizer and returned as text (tokenize=False).
prompt = processor.tokenizer.apply_chat_template(
  messages, 
  tokenize=False, 
  add_generation_prompt=True
)

inputs = processor(prompt, img, return_tensors="pt").to("cuda:0") 

# do_sample=False selects non-sampling decoding.
generation_args = { 
    "max_new_tokens": 1000, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

generate_ids = model.generate(**inputs, 
  eos_token_id=processor.tokenizer.eos_token_id, 
  **generation_args
)

# remove input tokens 
# (keeps only the columns after the prompt length, so the decoded text is just the answer)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, 
  skip_special_tokens=True, 
  clean_up_tokenization_spaces=False)[0] 

print(response)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


The user is performing a pick task. The item being picked up is the red object, which appears to be a remote control.


**Playing with memory_profiler to compare the memory usage and time taken between models**

**Since SmolVLM and Phi-3.5 are both able to handle the same 5-image input, I will compare those two first.**

In [10]:
# Initialize processor, model and load PEFT adapter
# "Model A" for the comparison: SmolVLM-Instruct, kept under *_a names so it can
# coexist with Model B in the same kernel.
processor_a = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model_a = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    # flash-attention is requested only when DEVICE is "cuda"; otherwise eager attention is used.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
model_a.load_adapter("HuggingFaceTB/SmolVLM-Instruct-DPO")

# Set your directory path here
directory = '/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_1'

img = open_images_from_directory(directory)
print(img)

messages = create_input_messages(img)
print(messages)

# Prepare inputs
# Only the inputs are prepared here; generation happens in a later cell.
prompt_a = processor_a.apply_chat_template(messages, add_generation_prompt=True)
inputs_a = processor_a(text=prompt_a, images=img, return_tensors="pt")
inputs_a = inputs_a.to(DEVICE)


Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x73C525729DB0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x73C525729B10>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x73C5257297B0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x73C525729A80>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x73C525728C40>]
[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'image'}, {'type': 'text', 'text': "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."}]}]


In [8]:
from transformers import AutoModelForCausalLM 
# "Model B" for the comparison: Phi-3.5-vision-instruct, kept under *_b names.
model_id = "microsoft/Phi-3.5-vision-instruct" 

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model_b = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='flash_attention_2'    
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor_b = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 

# Build one numbered image tag per frame ("<|image_1|>\n<|image_2|>\n..."), 1-based.
# `img` must already exist in the notebook namespace; this cell does not load it.
placeholder = ''
for i in range(len(img)):
    placeholder += f"<|image_{i+1}|>\n"

# NOTE: this rebinds the notebook-level `messages` to the string-content format,
# replacing the list-of-dicts payload built by create_input_messages.
messages = [
    {"role": "user", "content": placeholder+"Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."},
]

prompt_b = processor_b.tokenizer.apply_chat_template(
  messages, 
  tokenize=False, 
  add_generation_prompt=True
)

inputs_b = processor_b(prompt_b, img, return_tensors="pt").to("cuda:0") 

# Generation settings only; this cell does not call generate() itself.
generation_args = { 
    "max_new_tokens": 500, 
    "temperature": 0.0, 
    "do_sample": False, 
} 


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


The user is performing a pick task. The item being picked up is the red object, which appears to be a remote control.


In [10]:
import time
import torch
import gc
from memory_profiler import memory_usage

# Frames shared by both models in the benchmark below (5 frames in the recorded run).
directory = '/home/ttyh/hot3d/hot3d/dataset/Labelled/Videos/new_frames/Pick up_P0001_a68492d5_new_1'

img = open_images_from_directory(directory)
print(img)


# Function to measure memory for model loading
def load_a(model_cls, processor_cls, model_id, adapter_id="HuggingFaceTB/SmolVLM-Instruct-DPO"):
    """Load Model A on ``DEVICE``, optionally attach an adapter, and load its processor.

    Args:
        model_cls: Class exposing ``from_pretrained`` for the model.
        processor_cls: Class exposing ``from_pretrained`` for the processor.
        model_id: Checkpoint identifier passed to both ``from_pretrained`` calls.
        adapter_id: Adapter to attach via ``model.load_adapter``; pass ``None``
            to skip it. Defaults to the SmolVLM-Instruct-DPO adapter.

    Returns:
        ``(model, processor)``.
    """
    model = model_cls.from_pretrained(
        model_id,
        # Place the weights on DEVICE directly. Hard-coding "cuda" here would
        # contradict the DEVICE-dependent attention choice on a CPU-only host.
        device_map=DEVICE,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    )
    if adapter_id is not None:
        model.load_adapter(adapter_id)
    processor = processor_cls.from_pretrained(model_id, trust_remote_code=True)
    return model, processor

# Function to prepare inputs
def prepare_inputs_a(img, processor_a):
    """Turn frames into processor outputs for Model A, moved to ``DEVICE``.

    Builds the chat payload with ``create_input_messages``, renders it through
    the processor's chat template, then processes the rendered text together
    with the frames into "pt" tensors.
    """
    chat = create_input_messages(img)
    rendered = processor_a.apply_chat_template(chat, add_generation_prompt=True)
    batch = processor_a(text=rendered, images=img, return_tensors="pt")
    return batch.to(DEVICE)

def load_b(model_cls, processor_cls, model_id):
    """Load Model B and its processor from ``model_id``.

    The model is requested on "cuda" with automatic dtype and flash-attention 2;
    the processor is requested with ``num_crops=4``.

    Returns:
        ``(model, processor)``.
    """
    model_kwargs = {
        "device_map": "cuda",
        "trust_remote_code": True,
        "torch_dtype": "auto",
        "_attn_implementation": 'flash_attention_2',
    }
    processor_kwargs = {"trust_remote_code": True, "num_crops": 4}

    # The model is loaded first, then the processor.
    loaded_model = model_cls.from_pretrained(model_id, **model_kwargs)
    loaded_processor = processor_cls.from_pretrained(model_id, **processor_kwargs)
    return loaded_model, loaded_processor

def prepare_inputs_b(img, processor_b):
    """Turn frames into processor outputs for Model B, moved to "cuda:0".

    The user message is a plain string: one 1-based ``<|image_N|>`` tag per
    frame (each followed by a newline), then the pick/place question. The chat
    template is applied through the processor's tokenizer as text.
    """
    image_tags = "".join(f"<|image_{number}|>\n" for number in range(1, len(img) + 1))
    question = "Analyze this sequence of frames where the red spot shows the user's eye gaze. Identify whether the user is performing a pick or place task. If the task is picking, specify the item being picked up. If the task is placing, describe what is being placed and its destination."
    chat = [{"role": "user", "content": image_tags + question}]

    rendered = processor_b.tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    batch = processor_b(rendered, img, return_tensors="pt")
    return batch.to("cuda:0")

# Generation settings reused by the Model B inference benchmark.
generation_args = { 
    "max_new_tokens": 500, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

# NOTE(review): memory_profiler's memory_usage samples the host Python process, so the
# figures printed below are presumably process memory, not CUDA allocations — confirm
# this is the metric you want when comparing GPU models.
# Intervals are timed with time.perf_counter(), the monotonic clock meant for durations.

# Measure memory for Model A setup
start_time = time.perf_counter()
mem_usage_a, (model_a, processor_a) = memory_usage(
    (lambda: load_a(AutoModelForVision2Seq, AutoProcessor, "HuggingFaceTB/SmolVLM-Instruct")),
    retval=True,
    max_usage=True,
)
end_time = time.perf_counter()
setup_time_a = end_time - start_time

# Prepare inputs for Model A
start_time = time.perf_counter()
mem_usage_inputs_a, inputs_a = memory_usage(
    (lambda: prepare_inputs_a(img, processor_a)),
    retval=True,
    max_usage=True,
)
end_time = time.perf_counter()
inputs_time_a = end_time - start_time

# Clear memory before Model B setup
# Collect garbage first so tensors released by the collector are already free when
# the CUDA caching allocator is asked to hand its unused blocks back.
# Model A stays referenced (it is needed for inference later), so its weights remain loaded.
gc.collect()
torch.cuda.empty_cache()

# Measure memory for Model B setup
start_time = time.perf_counter()
mem_usage_b, (model_b, processor_b) = memory_usage(
    (lambda: load_b(AutoModelForCausalLM, AutoProcessor, "microsoft/Phi-3.5-vision-instruct")),
    retval=True,
    max_usage=True,
)
end_time = time.perf_counter()
setup_time_b = end_time - start_time

# Prepare inputs for Model B
start_time = time.perf_counter()
mem_usage_inputs_b, inputs_b = memory_usage(
    (lambda: prepare_inputs_b(img, processor_b)),
    retval=True,
    max_usage=True,
)
end_time = time.perf_counter()
inputs_time_b = end_time - start_time

# Print results
print(f"Model A Setup Time: {setup_time_a:.4f} seconds, Memory Usage: {mem_usage_a:.2f} MiB")
print(f"Model A Input Preparation Time: {inputs_time_a:.4f} seconds, Memory Usage: {mem_usage_inputs_a:.2f} MiB")
print(f"Model B Setup Time: {setup_time_b:.4f} seconds, Memory Usage: {mem_usage_b:.2f} MiB")
print(f"Model B Input Preparation Time: {inputs_time_b:.4f} seconds, Memory Usage: {mem_usage_inputs_b:.2f} MiB")


[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x79F685C91150>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x79F69647B640>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x79F696478070>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x79F6964788B0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1408x1408 at 0x79F69647B6D0>]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some kwargs in processor config are unused and will not have any effect: image_seq_len. 
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model A Setup Time: 2.7268 seconds, Memory Usage: 4890.77 MiB
Model A Input Preparation Time: 0.6574 seconds, Memory Usage: 1391.22 MiB
Model B Setup Time: 2.9334 seconds, Memory Usage: 5083.75 MiB
Model B Input Preparation Time: 0.2245 seconds, Memory Usage: 1087.25 MiB


In [53]:
import time
import torch
import gc
from memory_profiler import memory_usage

def generate_a(processor_a, model_a, inputs_a):
    """Generate with Model A and decode only the newly produced tokens.

    Returns:
        The list produced by ``processor_a.batch_decode`` for the generated ids
        with the prompt columns sliced off.
    """
    sequences = model_a.generate(**inputs_a, max_new_tokens=500)
    # Everything before this column index is the echoed prompt.
    prompt_width = inputs_a["input_ids"].shape[1]
    completion_ids = sequences[:, prompt_width:]
    return processor_a.batch_decode(completion_ids, skip_special_tokens=True)

def generate_b(generation_args, processor_b, model_b, inputs_b):
    """Generate with Model B and return the first decoded completion as a string.

    ``generation_args`` is forwarded to ``model_b.generate`` together with the
    tokenizer's EOS id; the prompt columns are sliced off before decoding.
    """
    sequences = model_b.generate(
        **inputs_b,
        eos_token_id=processor_b.tokenizer.eos_token_id,
        **generation_args,
    )
    # Everything before this column index is the echoed prompt.
    prompt_width = inputs_b["input_ids"].shape[1]
    decoded = processor_b.batch_decode(
        sequences[:, prompt_width:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

# NOTE(review): memory_profiler's memory_usage samples the host Python process, so the
# figures printed below are presumably process memory, not CUDA allocations — confirm
# this is the metric you want when comparing GPU models.
# Intervals are timed with time.perf_counter(), the monotonic clock meant for durations.

# Measure inference time and memory usage for Model A
start_time = time.perf_counter()
mem_usage_a, output_a = memory_usage(
    (lambda: generate_a(processor_a, model_a, inputs_a)),
    retval=True,
    max_usage=True,
)
end_time = time.perf_counter()
inference_time_a = end_time - start_time

# Clear memory before measuring Model B
# Collect garbage first so tensors released by the collector are already free when
# the CUDA caching allocator is asked to hand its unused blocks back.
gc.collect()
torch.cuda.empty_cache()

# Measure inference time and memory usage for Model B
start_time = time.perf_counter()
mem_usage_b, output_b = memory_usage(
    (lambda: generate_b(generation_args, processor_b, model_b, inputs_b)),
    retval=True,
    max_usage=True,
)
end_time = time.perf_counter()
inference_time_b = end_time - start_time

# Print results
print(f"Model A Inference Time: {inference_time_a:.4f} seconds, Memory Usage: {mem_usage_a:.2f} MiB")
print(f"Model B Inference Time: {inference_time_b:.4f} seconds, Memory Usage: {mem_usage_b:.2f} MiB")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model A Inference Time: 1.6406 seconds, Memory Usage: 1860.91 MiB
Model B Inference Time: 1.0146 seconds, Memory Usage: 1860.91 MiB


In [54]:
# Model A's result: the list returned by batch_decode in generate_a.
output_a

[' The user is performing a pick up task. The item being picked up is the milk carton.']

In [55]:
# Model B's result: a single string, since generate_b takes element [0] of the decoded batch.
output_b

'The user is performing a pick task. The item being picked up is the red object, which appears to be a remote control.'