In [1]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
from PIL import Image

import numpy as np
import os
import time
# Hugging Face Hub id of the Janus-Pro checkpoint used throughout the notebook.
model_path = 'deepseek-ai/Janus-Pro-1B'

# Load the config first so the language-model sub-config can be switched to
# eager attention before the model itself is instantiated.
config = AutoConfig.from_pretrained(model_path)
language_config = config.language_config
language_config._attn_implementation = 'eager'
vl_gpt = AutoModelForCausalLM.from_pretrained(
    model_path,
    language_config=language_config,
    trust_remote_code=True,
)

# Run on the GPU in bfloat16 when CUDA is present; otherwise keep the model
# where it was loaded and cast it to float16.
cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
vl_gpt = (
    vl_gpt.to(torch.bfloat16).cuda()
    if cuda_device == 'cuda'
    else vl_gpt.to(torch.float16)
)

# Processor that turns conversations + images into model inputs, and the
# tokenizer it wraps (used later to decode generated ids).
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

  from .autonotebook import tqdm as notebook_tqdm


Python version is above 3.10, patching the collections module.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Some kwargs in processor config are unused and will not have any effect: num_image_tokens, mask_prompt, image_tag, a

In [2]:
def multimodal_understanding(image, question, seed, top_p, temperature):
    """Answer ``question`` about ``image`` with the loaded Janus model.

    Relies on the notebook-level ``vl_gpt``, ``vl_chat_processor``,
    ``tokenizer`` and ``cuda_device`` objects created in the setup cell.

    Args:
        image: Image reference stored in the conversation's ``"images"`` list
            and resolved by ``load_pil_images`` (the notebook passes a file
            path).
        question: Text prompt placed after the image placeholder.
        seed: Integer seed applied to the torch (CPU and CUDA) and NumPy RNGs.
        top_p: Nucleus-sampling threshold; only forwarded when sampling.
        temperature: Sampling temperature; ``0`` selects greedy decoding.

    Returns:
        str: The decoded generation with special tokens removed.
    """
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()

    # Seed every RNG that can influence sampling.
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)

    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": [image],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Follow the same device switch the setup cell used to cast the model, so
    # overriding ``cuda_device`` to 'cpu' on a CUDA machine cannot produce
    # bfloat16 inputs for a float16 model.
    input_dtype = torch.bfloat16 if cuda_device == 'cuda' else torch.float16

    do_sample = temperature != 0
    generation_kwargs = dict(
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=do_sample,
        use_cache=True,
    )
    if do_sample:
        # Sampling knobs are meaningless for greedy decoding, so only forward
        # them when sampling is actually enabled.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    pil_images = load_pil_images(conversation)

    # Pure inference: disable autograd tracking for the embedding/vision pass
    # as well as for generation.
    with torch.inference_mode():
        prepare_inputs = vl_chat_processor(
            conversations=conversation, images=pil_images, force_batchify=True
        ).to(cuda_device, dtype=input_dtype)

        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            **generation_kwargs,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer


In [4]:
from IPython.display import display
from urllib.request import urlopen
from glob import glob
# Directory scanned for images; each caption is written next to its image as "<image name>.txt".
folder_path = '/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir'
import os
import os
import re


# Function to extract numerical order from filenames like "input_1"
def extract_number(filename):
    """Return the integer N of the first ``input_N`` occurrence in ``filename``.

    Names that contain no such pattern yield ``float('inf')`` so that they
    sort after every numbered file.
    """
    found = re.search(r'input_(\d+)', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Collect every image file in the folder (modify extensions as needed).
image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".tiff")
# Compare lower-cased names so files such as "input_3.PNG" are not skipped.
image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions)]

# Sort numerically by the "input_<n>" index. The filename tie-breaker makes
# the order deterministic for files without an index, which would otherwise
# keep the arbitrary order returned by os.listdir.
image_files.sort(key=lambda name: (extract_number(name), name))

# Full path for each image, in the same order as image_files.
image_paths = [os.path.join(folder_path, img) for img in image_files]
print(image_paths[:5])
print(image_files[:5])

# Caption each image and write the caption to a same-named .txt file.
for img, path in zip(image_files, image_paths):
    print(path)
    base_name, _ = os.path.splitext(img)  # Remove file extension
    text_file_path = os.path.join(folder_path, base_name + ".txt")
    answer = multimodal_understanding(
        image=path,
        question='describe the image in 50 words',
        seed=123,
        top_p=0.8,
        temperature=1.0,
    )
    print(text_file_path)
    # Explicit UTF-8 so non-ASCII characters in the generated caption do not
    # depend on the platform's default encoding.
    with open(text_file_path, "w", encoding="utf-8") as f:
        f.write(answer)
print("Image paths list created and text files generated.")


['/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_1.png', '/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_2.png', '/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_3.png', '/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_4.png', '/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_5.png']
['input_1.png', 'input_2.png', 'input_3.png', 'input_4.png', 'input_5.png']
/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_1.png
/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_1.txt
/home/rmuproject/rmuproject/users/sandesh/Depth-to-Image/Fine-tune-DreamBooth/Instance_images_dir/input_2.png
/home/rmuproject/rmuproject/