In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-3B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


['The image depicts a serene beach scene with a person and a dog. The person is sitting on the sandy beach, facing the ocean, and appears to be interacting with the dog. The dog is also sitting on the sand, facing the person, and seems to be giving a paw or a friendly gesture. The person is wearing a plaid shirt and has long hair. The background shows the ocean with gentle waves and a clear sky, suggesting it might be early morning or late afternoon due to the soft lighting. The overall atmosphere of the image is peaceful and joyful.']


In [2]:
class_id_to_name = {
    "0": "air_conditioner",
    "1": "car_horn",
    "2": "children_playing",
    "3": "dog_bark",
    "4": "drilling",
    "5": "engine_idling",
    "6": "gun_shot",
    "7": "jackhammer",
    "8": "siren",
    "9": "street_music"
}


In [3]:
import os
import json

# Define the base directory where the folds are located
base_dir = "../UrbanSound-Spectrogram"

# Define the output file
output_file = "urbansound_dataset.jsonl"

# Open the output file for writing
with open(output_file, "w") as outfile:
    # Iterate through each fold directory
    for fold in range(1, 11):
        fold_dir = os.path.join(base_dir, f"fold{fold}")
        # Iterate through each file in the fold directory
        for filename in os.listdir(fold_dir):
            if filename.endswith(".png"):
                # Extract classID from the filename
                class_id = filename.split("-")[1]
                # Map classID to class name
                label = class_id_to_name.get(class_id, "unknown")
                # Construct the full path to the image
                image_path = os.path.join(fold_dir, filename)
                # Create the data entry
                data_entry = {
                    "image": image_path,
                    "text": "Classify the sound in this spectrogram.",
                    "label": label
                }
                # Write the JSON line to the output file
                outfile.write(json.dumps(data_entry) + "\n")


FileNotFoundError: [Errno 2] No such file or directory: '../UrbanSound-Spectrogram/fold2'