In [1]:
# Imports
import importlib.util
import json
import os
import re
import cv2
from pydantic import BaseModel
from termcolor import colored
import time
import llava
from llava import conversation as clib
from llava.media import Image as LlavaImage #, Video
from llava.model.configuration_llava import JsonSchemaResponseFormat, ResponseFormat
from llava.conversation import auto_set_conversation_mode
from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model
import torch
from transformers import AutoModel, AutoProcessor, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from transformers import GenerationConfig

# from PIL import Image 


  from .autonotebook import tqdm as notebook_tqdm


[2025-11-14 03:46:32,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)




GPU memory utilisation check (baseline, taken before any model is loaded)

In [2]:
# GPU memory check.
# torch.cuda reports sizes in bytes; dividing by 1024**3 yields GiB (binary gigabytes),
# so the values are labelled "GiB" rather than "GB" (which would imply division by 1e9).
print("Allocated:", torch.cuda.memory_allocated() / 1024**3, "GiB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**3, "GiB")
print("Cached:", torch.cuda.memory_reserved() / 1024**3, "GiB")

Allocated: 0.0 GB
Max allocated: 0.0 GB
Cached: 0.0


Basic benchmarking helper: calls the model repeatedly on one prompt and reports the per-iteration and average latency

In [5]:
def benchmark_model(model, prompt, num_iterations=10, *, warmup=0, **kwargs):
    """
    Benchmark ``model.generate_content`` on a fixed prompt.

    For every timed iteration the CUDA memory allocated before and after the call
    is printed, followed by the latency of that iteration. Only the
    ``generate_content`` call itself is inside the timed window; the memory
    queries and prints are kept outside so they do not inflate the measurement.

    Args:
        model:           Object exposing ``generate_content(prompt, **kwargs)``.
        prompt:          Prompt passed unchanged to every ``generate_content`` call.
        num_iterations:  Number of timed calls. Must be at least 1.
        warmup:          Number of untimed calls made before timing starts
                         (keyword-only, default 0 keeps the previous behaviour).
        **kwargs:        Forwarded unchanged to ``generate_content``.

    Stats / function returns:
        last_response:  The last model output.
        avg_latency:    Average latency across the timed iterations, in seconds.
        latencies:      List of latencies per timed iteration, in seconds.

    Raises:
        ValueError: If ``num_iterations`` is less than 1 or ``warmup`` is negative.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when averaging.
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be >= 1, got {num_iterations}")
    if warmup < 0:
        raise ValueError(f"warmup must be >= 0, got {warmup}")

    def _wait_for_gpu():
        # CUDA work is queued asynchronously; drain it so the clock brackets the
        # real work. Skipped when CUDA is absent or has not been initialised.
        if torch.cuda.is_available() and torch.cuda.is_initialized():
            torch.cuda.synchronize()

    # Untimed warm-up calls (none by default).
    for _ in range(warmup):
        model.generate_content(prompt, **kwargs)

    latencies = []
    last_response = None

    for i in range(num_iterations):
        print("Memory before:", torch.cuda.memory_allocated()/1e9, "GB")

        _wait_for_gpu()
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        last_response = model.generate_content(prompt, **kwargs)
        _wait_for_gpu()
        elapsed = time.perf_counter() - start_time

        print("Memory after:", torch.cuda.memory_allocated()/1e9, "GB")
        latencies.append(elapsed)

        # Combined per-iteration message
        print(colored(f"Iteration {i+1}/{num_iterations} took {elapsed:.3f} seconds", "green"))

    avg_latency = sum(latencies) / len(latencies)
    print(colored(f"\nLast Response: {last_response}", "cyan", attrs=["bold"]))
    print(colored(f"Average inference time over {num_iterations} runs: {avg_latency:.3f} seconds", 
                  "magenta", attrs=["bold"]))
    return last_response, avg_latency, latencies


### Experiment 1: Vanilla VILA1.5-3b

#### Model Setup

In [3]:
# Checkpoint to benchmark in this experiment (unquantised baseline).
model_path = "Efficient-Large-Model/VILA1.5-3b"
# model_path = "Efficient-Large-Model/VILA1.5-3b-AWQ"
conv_mode = "vicuna_v1"

# Set conversation modes: install the chosen template as the library default,
# then let llava pick the mode from the model path (it logs the mode it selects).
clib.default_conversation = clib.conv_templates[conv_mode].copy()
auto_set_conversation_mode(model_path)
model_name = get_model_name_from_path(model_path)

# Load model components
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path = model_path,
    model_name = model_name,
    model_base = None
)

# Send model to Cuda and switch to inference (eval) mode
model.to("cuda").eval()

# GPU memory check.
# torch.cuda reports sizes in bytes; dividing by 1024**3 yields GiB (binary gigabytes),
# so the values are labelled "GiB" rather than "GB" (which would imply division by 1e9).
print("Allocated:", torch.cuda.memory_allocated() / 1024**3, "GiB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**3, "GiB")
print("Cached:", torch.cuda.memory_reserved() / 1024**3, "GiB")

[32m2025-11-14 03:49:09.338[0m | [1mINFO    [0m | [36mllava.conversation[0m:[36mauto_set_conversation_mode[0m:[36m190[0m - [1mSetting conversation mode to `vicuna_v1` based on model name/path `Efficient-Large-Model/VILA1.5-3b`.[0m
Fetching 17 files: 100%|██████████| 17/17 [00:00<00:00, 118051.60it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.32s/it]
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~j

Allocated: 5.896849632263184 GB
Max allocated: 5.896849632263184 GB
Cached: 6.091796875


In [6]:
# Benchmark input: one demo image followed by a plain-text instruction.
image_path = "/workspace/VILA/demo_images/demo_img.png"
media = LlavaImage(image_path)
text_prompt = "Please describe the image in detail"
prompt = [media, text_prompt]

# Generation settings. max_length, max_new_tokens, num_beams, repetition_penalty,
# temperature, top_k and top_p are taken from the VILA default settings.
# NOTE(review): use_cache=False presumably disables key/value caching during
# decoding, which would affect the measured latency - confirm this is intended.
gen_config = GenerationConfig(
    do_sample=False,
    max_length=2048,
    max_new_tokens=2048,
    num_beams=1,
    repetition_penalty=1.0,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    use_cache=False,
)

# Left as the final expression so the returned tuple is shown as the cell output.
benchmark_model(model=model, prompt=prompt, num_iterations=10, generation_config=gen_config)

Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 1/10 took 5.267 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 2/10 took 5.134 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 3/10 took 5.218 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 4/10 took 5.204 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 5/10 took 5.244 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 6/10 took 5.313 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 7/10 took 5.061 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 8/10 took 5.216 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 9/10 took 5.155 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 10/10 took 5.171 seconds[0m

('The image is a collage of three distinct images, each with its own unique content. \n\n1. The first image is a vibrant representation of a hockey player in action, captured in mid-motion. The player, donned in a striking blue jersey, is seen skillfully maneuvering the puck on the ice. The background is a blur of colors, suggesting the speed and dynamism of the game.\n\n2. The second image is a detailed diagram of a drug test. The diagram, rendered in shades of green and yellow, provides a clear and concise visual representation of the drug test process. It includes various stages and components, such as the collection of urine samples, the analysis of the samples, and the reporting of the results.\n\n3. The third image is a question and answer format, with a question posed at the top and a corresponding answer at the bottom. The question is about the temperature in Antarctica, specifically asking about the temperature in 1979. The answer, however, is not visible in the image.\n\nEach

### Experiment 2: VILA1.5-3b (bitsandbytes 4-bit quantised)
**PLEASE RESTART THE PYTHON KERNEL BEFORE THIS**, then re-run the imports cell below and the cell that defines `benchmark_model` (a restart clears everything defined earlier).

In [1]:
# Imports
import importlib.util
import json
import os

from termcolor import colored
import time
import llava
from llava import conversation as clib
from llava.media import Image as LlavaImage #, Video
from llava.model.configuration_llava import JsonSchemaResponseFormat, ResponseFormat
from llava.conversation import auto_set_conversation_mode
from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model
import torch
from transformers import AutoModel, AutoProcessor, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from transformers import GenerationConfig


  from .autonotebook import tqdm as notebook_tqdm


[2025-11-14 03:54:01,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [3]:
# Imported at the top of the cell so the dependency is visible before any code runs.
from transformers import BitsAndBytesConfig

# Same checkpoint as the baseline experiment; only the loading configuration differs.
model_path = "Efficient-Large-Model/VILA1.5-3b"
conv_mode = "vicuna_v1"

# Set conversation modes: install the chosen template as the library default,
# then let llava pick the mode from the model path (it logs the mode it selects).
clib.default_conversation = clib.conv_templates[conv_mode].copy()
auto_set_conversation_mode(model_path)
model_name = get_model_name_from_path(model_path)

# bitsandbytes 4-bit settings: NF4 quantisation type, double quantisation enabled,
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# NOTE(review): load_4bit stays False presumably because quantisation is driven by
# quantization_config instead - confirm against load_pretrained_model.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_name=model_name,
    model_base=None,
    load_4bit=False,             # <-- must be False
    quantization_config=bnb_config,
    torch_dtype=torch.float16    # optional, for model config
)

# Send model to Cuda and switch to inference (eval) mode
model.to("cuda").eval()

# GPU memory check.
# torch.cuda reports sizes in bytes; dividing by 1024**3 yields GiB (binary gigabytes),
# so the values are labelled "GiB" rather than "GB" (which would imply division by 1e9).
print("Allocated:", torch.cuda.memory_allocated() / 1024**3, "GiB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**3, "GiB")
print("Cached:", torch.cuda.memory_reserved() / 1024**3, "GiB")

[32m2025-11-14 03:54:59.536[0m | [1mINFO    [0m | [36mllava.conversation[0m:[36mauto_set_conversation_mode[0m:[36m190[0m - [1mSetting conversation mode to `vicuna_v1` based on model name/path `Efficient-Large-Model/VILA1.5-3b`.[0m
Fetching 17 files: 100%|██████████| 17/17 [00:00<00:00, 125533.75it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.80s/it]
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~j

Allocated: 2.5285568237304688 GB
Max allocated: 2.5285568237304688 GB
Cached: 2.625


In [6]:
# Benchmark input: one demo image followed by a plain-text instruction.
image_path = "/workspace/VILA/demo_images/demo_img.png"
media = LlavaImage(image_path)
text_prompt = "Please describe the image in detail"
prompt = [media, text_prompt]

# Generation settings. max_length, max_new_tokens, num_beams, repetition_penalty,
# temperature, top_k and top_p are taken from the VILA default settings.
# NOTE(review): use_cache=False presumably disables key/value caching during
# decoding, which would affect the measured latency - confirm this is intended.
gen_config = GenerationConfig(
    do_sample=False,
    max_length=2048,
    max_new_tokens=2048,
    num_beams=1,
    repetition_penalty=1.0,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    use_cache=False,
)

# Left as the final expression so the returned tuple is shown as the cell output.
benchmark_model(model=model, prompt=prompt, num_iterations=10, generation_config=gen_config)

Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 1/10 took 12.471 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 2/10 took 12.368 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 3/10 took 12.343 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 4/10 took 12.377 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 5/10 took 12.856 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 6/10 took 12.895 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 7/10 took 12.273 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 8/10 took 12.247 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration 9/10 took 12.224 seconds[0m
Memory before: 2.715017216 GB
Memory after: 2.715017216 GB
[32mIteration

("The image is a collage of various images and text, all related to the topic of temperature. The collage is divided into three sections, each with its own set of images and text.\n\nIn the top left section, there's a map of the world with a legend on the left side. The legend indicates different colors for various types of temperature. The map shows the temperature of the Earth's surface in degrees Celsius.\n\nMoving to the top right section, there's a pie chart with a legend on the left side. The legend shows different types of temperature represented by different colors. The pie chart shows the temperature of the Earth's surface in degrees Celsius.\n\nThe bottom section features a bar chart with a legend on the left side. The legend shows different types of temperature represented by different colors. The bar chart shows the temperature of the Earth's surface in degrees Celsius.\n\nThe text in the image provides additional information about temperature, specifically about the temper