In [1]:
# Imports
import importlib.util
import json
import os
import re
import cv2
from pydantic import BaseModel
from termcolor import colored
import time
import llava
from llava import conversation as clib
from llava.media import Image as LlavaImage #, Video
from llava.model.configuration_llava import JsonSchemaResponseFormat, ResponseFormat
from llava.conversation import auto_set_conversation_mode
from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model
import torch
from transformers import AutoModel, AutoProcessor, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from transformers import GenerationConfig
import flash_attn
from awq.quantize.pre_quant import run_awq, apply_awq
from awq.quantize.quantizer import pseudo_quantize_model_weight, real_quantize_model_weight
from awq.utils.lm_eval_adaptor import LMEvalAdaptor

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Flash Attention version: {flash_attn.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


[2025-11-15 03:00:05,742] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)




PyTorch version: 2.3.0+cu121
CUDA version: 12.1
Flash Attention version: 2.5.8


In [2]:
# GPU Utilisation Check
print("Allocated:", torch.cuda.memory_allocated() / 1024**3, "GB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**3, "GB")
print("Cached:", torch.cuda.memory_reserved() / 1024**3)

Allocated: 0.0 GB
Max allocated: 0.0 GB
Cached: 0.0


In [3]:
def benchmark_model(model, prompt, num_iterations=10, **kwargs):
    """
    Simple script to benchmark on provided test image. 
    Stats / function returns:
        last_response:  The last model output.
        avg_latency:    Average latency across iterations.
        latencies:      List of latencies per iteration.
    """
    latencies = []
    last_response = None

    for i in range(num_iterations):
        start_time = time.time()
        print("Memory before:", torch.cuda.memory_allocated()/1e9, "GB")
        last_response = model.generate_content(prompt, **kwargs)
        print("Memory after:", torch.cuda.memory_allocated()/1e9, "GB")

        elapsed = time.time() - start_time
        latencies.append(elapsed)
        
        # Combined per-iteration message
        print(colored(f"Iteration {i+1}/{num_iterations} took {elapsed:.3f} seconds", "green"))

    avg_latency = sum(latencies) / len(latencies)
    print(colored(f"\nLast Response: {last_response}", "cyan", attrs=["bold"]))
    print(colored(f"Average inference time over {num_iterations} runs: {avg_latency:.3f} seconds", 
                  "magenta", attrs=["bold"]))
    return last_response, avg_latency, latencies


### Experiment 1: Vanilla VILA1.5-3b (Vanilla)

In [4]:
model_path = "Efficient-Large-Model/VILA1.5-3b"
# model_path = "Efficient-Large-Model/VILA1.5-3b-AWQ"
conv_mode = "vicuna_v1"

# Set conversation modes
clib.default_conversation = clib.conv_templates[conv_mode].copy()
auto_set_conversation_mode(model_path)
model_name = get_model_name_from_path(model_path)

# Load model components
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path = model_path,
    model_name = model_name,
    model_base = None
)

# Send model to Cuda
model.to("cuda").eval()

# GPU Utilisation Check
print("Allocated:", torch.cuda.memory_allocated() / 1024**3, "GB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**3, "GB")
print("Cached:", torch.cuda.memory_reserved() / 1024**3)

[32m2025-11-15 03:00:51.451[0m | [1mINFO    [0m | [36mllava.conversation[0m:[36mauto_set_conversation_mode[0m:[36m190[0m - [1mSetting conversation mode to `vicuna_v1` based on model name/path `Efficient-Large-Model/VILA1.5-3b`.[0m
Fetching 17 files: 100%|██████████| 17/17 [00:00<00:00, 93943.57it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.33s/it]
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~jo

Allocated: 5.896849632263184 GB
Max allocated: 5.896849632263184 GB
Cached: 6.091796875


In [5]:
# Prepare generation config (max length, max new tokens, num beams, repetition penalty, temperature, top_k, top_p drawn from VILA default settings)
gen_config = GenerationConfig(
    do_sample               =   False,
    max_length              =   2048,
    max_new_tokens          =   2048,
    num_beams               =   1,
    repetition_penalty      =   1.0,
    temperature             =   1.0,
    top_k                   =   50,
    top_p                   =   1.0,
    use_cache               =   False         
)

image_path = "/workspace/VILA/demo_images/demo_img.png"
media = LlavaImage(image_path)
text_prompt = "" \
"Please describe the image in detail" \
""
prompt = [media, text_prompt]

benchmark_model(
    model               =   model, 
    prompt              =   prompt,
    num_iterations      =   10, 
    generation_config   =   gen_config
)

Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 1/10 took 5.661 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 2/10 took 5.331 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 3/10 took 5.189 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 4/10 took 5.239 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 5/10 took 5.233 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 6/10 took 5.291 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 7/10 took 5.181 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 8/10 took 5.144 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 9/10 took 5.198 seconds[0m
Memory before: 6.33169408 GB
Memory after: 6.33169408 GB
[32mIteration 10/10 took 5.164 seconds[0m

('The image is a collage of three distinct images, each with its own unique content. \n\n1. The first image is a vibrant representation of a hockey player in action, captured in mid-motion. The player, donned in a striking blue jersey, is seen skillfully maneuvering the puck on the ice. The background is a blur of colors, suggesting the speed and dynamism of the game.\n\n2. The second image is a detailed diagram of a drug test. The diagram, rendered in shades of green and yellow, provides a clear and concise visual representation of the drug test process. It includes various stages and components, such as the collection of urine samples, the analysis of the samples, and the reporting of the results.\n\n3. The third image is a question and answer format, with a question posed at the top and a corresponding answer at the bottom. The question is about the temperature in Antarctica, specifically asking about the temperature in 1979. The answer, however, is not visible in the image.\n\nEach

#### Experiment 2: Vila1.5-3b (AWQ Quantised, 4bit)

In [7]:
# Move model back to CPU
model = model.to("cpu")

# Perform quantisation
from awq.quantize.quantizer import real_quantize_model_weight

q_config = {
    "zero_point": True,   # AWQ will compute automatically
    "q_group_size": 128,  # required group size for quantization
}

real_quantize_model_weight(
    model=model,
    w_bit=4,
    q_config=q_config
)

# Move model back to cuda
model = model.to("cuda")

# Check resource utilisation
print("Allocated:", torch.cuda.memory_allocated() / 1024**3, "GB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**3, "GB")
print("Cached:", torch.cuda.memory_reserved() / 1024**3)

real weight quantization...: 100%|██████████| 32/32 [01:48<00:00,  3.38s/it]


Allocated: 2.425227165222168 GB
Max allocated: 6.06181001663208 GB
Cached: 2.52734375


In [8]:
# Prepare generation config (max length, max new tokens, num beams, repetition penalty, temperature, top_k, top_p drawn from VILA default settings)
gen_config = GenerationConfig(
    do_sample               =   False,
    max_length              =   2048,
    max_new_tokens          =   2048,
    num_beams               =   1,
    repetition_penalty      =   1.0,
    temperature             =   1.0,
    top_k                   =   50,
    top_p                   =   1.0,
    use_cache               =   False         
)

image_path = "/workspace/VILA/demo_images/demo_img.png"
media = LlavaImage(image_path)
text_prompt = "" \
"Please describe the image in detail" \
""
prompt = [media, text_prompt]

benchmark_model(
    model               =   model, 
    prompt              =   prompt,
    num_iterations      =   10, 
    generation_config   =   gen_config
)

Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 1/10 took 4.477 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 2/10 took 4.480 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 3/10 took 4.423 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 4/10 took 4.535 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 5/10 took 4.358 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 6/10 took 4.343 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 7/10 took 4.299 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 8/10 took 4.459 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 9/10 took 4.490 seconds[0m
Memory before: 2.60406784 GB
Memory after: 2.60406784 GB
[32mIteration 10/10 took 4.497 seconds[0m

("The image is a collage of three distinct images, each with its own unique content. \n\nThe first image is a vibrant photograph of a hockey player in mid-action, donned in a striking blue and yellow uniform. The player is captured in a dynamic pose, suggesting a moment of intense play. The background is a stark white, which contrasts with the player's colorful attire and brings the focus entirely on the athlete.\n\nThe second image is a striking infographic. It features a circular diagram with a variety of colored sections, each representing different aspects of a topic. The diagram is neatly organized, with each section clearly labeled, making it easy to understand the information presented.\n\nThe third image is a photograph of a map. The map is detailed, with various locations marked and labeled, indicating a geographical context. The map is set against a white background, which enhances the visibility of the various locations and their labels.\n\nEach image is distinct, yet they a