### Qwen-VL Inference Pipeline

##### Imports


In [1]:
import gc
import os
import re
import torch
import warnings
import transformers
from PIL import Image

##### Disabling Warnings


In [2]:
warnings.filterwarnings("ignore")

##### Setting Inference Device


In [3]:
# Run on the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Report which device was picked (with the full property record for a GPU).
if device.type != "cuda":
    print("Using CPU")
else:
    print(f"Using GPU {torch.cuda.get_device_properties(device)}")

Using GPU _CudaDeviceProperties(name='AMD Radeon RX 7800 XT', major=11, minor=0, gcnArchName='gfx1100', total_memory=16368MB, multi_processor_count=30, uuid=38336232-6265-3432-3662-306564613532, L2_cache_size=4MB)


##### Emptying the GPU Cache (if necessary)


In [4]:
def empty_cache() -> None:
    """Free memory that is no longer in use.

    Runs Python's garbage collector first and then asks PyTorch to empty its
    CUDA cache.
    """
    gc.collect()
    torch.cuda.empty_cache()


def print_free_memory() -> None:
    """Print the percentage of device memory that is currently free.

    Queries the free and total memory of the notebook-level ``device`` through
    ``torch.cuda.mem_get_info`` and prints their ratio as a percentage rounded
    to two decimals.

    The notebook falls back to the CPU when no GPU is visible; in that case
    there is nothing to query, so a short notice is printed instead of raising.
    """
    if not torch.cuda.is_available():
        print("Percent of free memory: n/a (no CUDA device available)")
        return

    free, total = torch.cuda.mem_get_info(device)
    print(f"Percent of free memory: {round(free / total * 100, 2)}")


# Clear cached memory first, then report how much device memory is free.
empty_cache()
print_free_memory()

Percent of free memory: 98.79


##### Memory Summary


In [5]:
def memory_summary() -> None:
    """Print PyTorch's CUDA memory summary table.

    Calls ``torch.cuda.memory_summary()`` without arguments, so the table
    describes whichever device PyTorch treats as the default one.

    The notebook falls back to the CPU when no GPU is visible; in that case
    a short notice is printed instead of raising.
    """
    if not torch.cuda.is_available():
        print("CUDA memory summary unavailable (no CUDA device available)")
        return

    print(torch.cuda.memory_summary())


# Show the allocator statistics before any model is loaded.
memory_summary()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

##### Preparing GPU (if necessary)


In [6]:
# For AMD GPU - 7800xt
# Only touch the GPU when one is actually visible: the device cell above falls
# back to the CPU, and torch.cuda.get_device_name raises without a GPU.
#
# NOTE(review): the GPU has already been queried earlier in the notebook, so
# this override presumably only takes effect for processes started afterwards
# (or on the next kernel start if exported beforehand) - TODO confirm.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    if "AMD" in device_name or "Radeon" in device_name:
        os.environ["HSA_OVERRIDE_GFX_VERSION"] = "11.0.0"

    print(f"GPU {device_name} is now setup")
else:
    device_name = None
    print("No GPU detected, skipping GPU setup")

GPU AMD Radeon RX 7800 XT is now setup


#####


##### Setting Flexible Paths for Data and Base Model


In [7]:
# Directory Paths
training_dataset_name = "coco8"
repository_name = "Weird-Stuff-In-Traffic"
current_directory = os.getcwd()

# Everything in the working directory that precedes the first occurrence of
# the repository name is treated as the base directory. A plain substring
# search is used instead of a regex so the cell also parses on Python versions
# older than 3.12, which reject reused quotes inside an f-string.
repository_offset = current_directory.find(repository_name)
if repository_offset == -1:
    # Fail with an actionable message instead of an AttributeError on None.
    raise RuntimeError(
        f"Expected the notebook to run from inside '{repository_name}', "
        f"but the working directory is '{current_directory}'"
    )
path_to_base_directory = current_directory[:repository_offset]

# Test image location, first relative to the base directory, then complete.
test_image_path = f"{repository_name}/Data/yolo/{training_dataset_name}/images/train/000000000025.jpg"
complete_test_image_data_path = path_to_base_directory + test_image_path

# Model Paths
base_model_name = "Qwen/Qwen2-VL-7B-Instruct"

##### Preparing Processor

In [8]:
# Load the processor that belongs to the base checkpoint
processor = transformers.Qwen2VLProcessor.from_pretrained(
    pretrained_model_name_or_path=base_model_name
)

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

##### Preparing Base Model

In [11]:
# Load the base checkpoint in half precision and let `device_map="auto"`
# decide where the weights are placed
base_model = transformers.Qwen2VLForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Switch to evaluation mode explicitly (this notebook only runs inference)
base_model.eval()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): VisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): VisionMlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): QuickGELUActivation()
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
    )
    (merger): PatchMerger(
      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Seq

##### Utility Functions

In [12]:
# The user instruction needs to be formatted in a way the model understands
def preprocess(instruction: str, image_path: str, processor: transformers.AutoProcessor) -> transformers.BatchEncoding:
    """Build the model inputs for one image and one text instruction.

    Args:
        instruction: The user request that accompanies the image.
        image_path: Path of the image file to open.
        processor: Processor used both to render the chat template and to
            turn the rendered prompt plus image into tensors.

    Returns:
        The processor output for a batch of one prompt and one image, created
        with ``padding=True`` and ``return_tensors="pt"``.
        NOTE(review): the annotated return type is kept as-is; the processor
        may actually return a different batch container - confirm.
    """
    # Copy the pixel data inside the context manager so that the file handle
    # is closed deterministically while the image stays usable afterwards.
    with Image.open(image_path) as image_file:
        image = image_file.copy()

    # System Instructions
    system_instruction = "You are an assistant that returns the subjects of the image."

    # Formatting the Chat
    chat = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_instruction}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # Pass the plain string: wrapping it in braces would build a
                # set and its repr would end up inside the prompt.
                {"type": "text", "text": instruction},
            ],
        },
    ]

    # Applying the Chat template (rendered as text; tokenization happens below)
    text_prompt = processor.apply_chat_template(
        chat, add_generation_prompt=True, tokenize=False
    )

    # Final Processing to be fed into Model
    model_inputs = processor(
        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
    )
    return model_inputs


# Model Response Generation (raise max_new_tokens if the model's answer gets cut off)
def generate_response(
    model_inputs,
    base_model: transformers.Qwen2VLForConditionalGeneration,
    processor: transformers.AutoProcessor,
    device: torch.device,
    max_new_tokens=512,
):
    """Generate text for the prepared inputs and decode only the new tokens.

    Args:
        model_inputs: Processor output; it is moved to ``device`` and unpacked
            as keyword arguments into ``base_model.generate``.
        base_model: Model whose ``generate`` method produces the token ids.
        processor: Processor whose ``batch_decode`` turns ids back into text.
        device: Device the inputs are moved to before generation.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The result of ``processor.batch_decode`` for the generated sequences,
        with the prompt tokens removed and special tokens skipped.
    """
    model_inputs = model_inputs.to(device)

    with torch.no_grad():
        full_sequences = base_model.generate(**model_inputs, max_new_tokens=max_new_tokens)

        # Each generated sequence starts with its prompt, so drop as many
        # leading tokens as the matching prompt has.
        completions = []
        for prompt_ids, sequence_ids in zip(model_inputs.input_ids, full_sequences):
            completions.append(sequence_ids[len(prompt_ids) :])

        return processor.batch_decode(
            completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )




##### Model Output -> Image Description 


In [13]:
# Turn the test image and the instruction into model inputs
processed_prompt = preprocess(
    "Please create a list of objects in this image.",
    complete_test_image_data_path,
    processor,
)

# Run generation with a budget of 128 new tokens
decoded_output = generate_response(
    model_inputs=processed_prompt,
    base_model=base_model,
    processor=processor,
    device=device,
    max_new_tokens=128,
)

##### Output Display (Change Later for Production)

In [14]:
# Only one prompt was submitted, so show the first (and only) decoded text.
print(decoded_output[0])

['giraffe', 'tree', 'log', 'grass', 'sky', 'leaves', 'branch']
