Install dependencies

In [12]:
!pip install bitsandbytes transformers huggingface_hub
!pip install git+https://github.com/deepseek-ai/Janus.git
!pip install git+https://github.com/Dao-AILab/flash-attention.git
!pip install basicsr facexlib gfpgan
!pip install git+https://github.com/xinntao/Real-ESRGAN.git

Collecting git+https://github.com/deepseek-ai/Janus.git
  Cloning https://github.com/deepseek-ai/Janus.git to /tmp/pip-req-build-tyeydgvq
  Running command git clone --filter=blob:none --quiet https://github.com/deepseek-ai/Janus.git /tmp/pip-req-build-tyeydgvq
  Resolved https://github.com/deepseek-ai/Janus.git to commit 1daa72fa409002d40931bd7b36a9280362469ead
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/Dao-AILab/flash-attention.git
  Cloning https://github.com/Dao-AILab/flash-attention.git to /tmp/pip-req-build-67kow6d6
  Running command git clone --filter=blob:none --quiet https://github.com/Dao-AILab/flash-attention.git /tmp/pip-req-build-67kow6d6
  Resolved https://github.com/Dao-AILab/flash-attention.git to commit 5639b9d26dac63d912d6815cb4369250f6cef764
  Running command git submodule update --init --recursive -q
  Prep

Loading Janus Pro 7B with quantization

In [2]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True, quantization_config=quantization_config, torch_dtype=torch.bfloat16)

Python version is above 3.10, patching the collections module.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use t

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Test model request with images and text

In [3]:
import numpy as np
import torch
from PIL import Image
from janus.utils.io import load_pil_images

cuda_device = 'cuda:0'
vl_gp=vl_gpt.to(cuda_device)


def multimodal_understanding(images, question, seed, top_p, temperature):
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()

    # Set seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)

    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": images,
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Ensure images are properly formatted as PIL images
    pil_images = [Image.fromarray(img) if isinstance(img, np.ndarray) else img for img in images]

    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)

    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False if temperature == 0 else True,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer

# Load the image
image = Image.open("./frame_0.jpg")

# Define inputs
question = "How many cats do you see in the photo? Describe their appearance and behaviour."

seed = 42
top_p = 0.8
temperature = 0.5

# Call the function with the correct image format
multimodal_understanding([image], question, seed, top_p, temperature)

'I see two cats in the photo. One cat is white and is standing near the wooden shelf on the left side of the image. The other cat is mostly white with some black markings and is sitting on the floor near the center of the image. Both cats appear to be calmly observing their surroundings.'

Adding the RealESRGAN model for image upscaling

In [4]:
!find /usr/local/lib/python*/dist-packages -name "degradations.py"

/usr/local/lib/python3.11/dist-packages/basicsr/data/degradations.py


In [5]:
!sed -i 's/from torchvision.transforms.functional_tensor import rgb_to_grayscale/from torchvision.transforms.functional import rgb_to_grayscale/g' /usr/local/lib/python3.11/dist-packages/basicsr/data/degradations.py

In [14]:
import inspect
import realesrgan

# List all attributes inside the realesrgan module
print("Available attributes in realesrgan:")
for name, obj in inspect.getmembers(realesrgan):
    print(name, "->", obj)


Available attributes in realesrgan:
F -> <module 'torch.nn.functional' from '/usr/local/lib/python3.11/dist-packages/torch/nn/functional.py'>
IOConsumer -> <class 'realesrgan.utils.IOConsumer'>
PrefetchReader -> <class 'realesrgan.utils.PrefetchReader'>
ROOT_DIR -> /usr/local/lib/python3.11/dist-packages
RealESRGANer -> <class 'realesrgan.utils.RealESRGANer'>
All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x7c95e410e340>, 'runfile': <function 

In [21]:
import torch
import os
from realesrgan.utils import RealESRGANer
from basicsr.archs.rrdbnet_arch import RRDBNet
import requests

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize RRDBNet model (for x4 upscaling)
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)

# Download the model weights if not present
model_path = "RealESRGAN_x4plus.pth"
url = "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth"

# Check if file exists, else download
if not os.path.exists(model_path):
    print("Downloading model weights...")
    r = requests.get(url, allow_redirects=True)
    with open(model_path, 'wb') as f:
        f.write(r.content)
    print("Download complete!")

# Load model weights
weights = torch.load(model_path, map_location=device)
model.load_state_dict(weights, strict=False)  # Load weights with flexibility

# Initialize RealESRGANer with the preloaded model
sr_model = RealESRGANer(
    scale=4,
    model_path=model_path,
    model=model,  # Pass the preloaded model, NOT model_path
    dni_weight=None,
    device=device
)

print("Real-ESRGAN model loaded successfully!")

  weights = torch.load(model_path, map_location=device)
  loadnet = torch.load(model_path, map_location=torch.device('cpu'))


Real-ESRGAN model loaded successfully!


Image generation example with Janus Pro 7B

In [26]:
def generate(input_ids,
             width,
             height,
             temperature: float = 1,
             parallel_size: int = 5,
             cfg_weight: float = 5,
             image_token_num_per_image: int = 576,
             patch_size: int = 16):
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()

    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
    for i in range(parallel_size * 2):
        tokens[i, :] = input_ids
        if i % 2 != 0:
            tokens[i, 1:-1] = vl_chat_processor.pad_id
    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)

    pkv = None
    for i in range(image_token_num_per_image):
        with torch.no_grad():
            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
                                                use_cache=True,
                                                past_key_values=pkv)
            pkv = outputs.past_key_values
            hidden_states = outputs.last_hidden_state
            logits = vl_gpt.gen_head(hidden_states[:, -1, :])
            logit_cond = logits[0::2, :]
            logit_uncond = logits[1::2, :]
            logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
            probs = torch.softmax(logits / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated_tokens[:, i] = next_token.squeeze(dim=-1)
            next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)

            img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
            inputs_embeds = img_embeds.unsqueeze(dim=1)



    patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
                                                 shape=[parallel_size, 8, width // patch_size, height // patch_size])

    return generated_tokens.to(dtype=torch.int), patches

def unpack(dec, width, height, parallel_size=5):
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)

    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec

    return visual_img

def image_upsample(img: Image.Image) -> Image.Image:
    if img is None:
        raise ValueError("Image not uploaded")

    width, height = img.size

    if width >= 5000 or height >= 5000:
        raise ValueError("The image is too large.")

    global sr_model

    # Convert PIL image to NumPy array
    img_np = np.array(img)

    # Perform super-resolution enhancement
    result, _ = sr_model.enhance(img_np, outscale=4)  # Use NumPy array input

    # Convert NumPy array back to PIL Image
    return Image.fromarray(result)

def generate_image(prompt,
                   seed=None,
                   guidance=5,
                   t2i_temperature=1.0):
    # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()
    # Set the seed for reproducible results
    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed)
    width = 384
    height = 384
    parallel_size = 5

    with torch.no_grad():
        messages = [{'role': '<|User|>', 'content': prompt},
                    {'role': '<|Assistant|>', 'content': ''}]
        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
                                                                   sft_format=vl_chat_processor.sft_format,
                                                                   system_prompt='')
        text = text + vl_chat_processor.image_start_tag

        input_ids = torch.LongTensor(tokenizer.encode(text))
        output, patches = generate(input_ids,
                                   width // 16 * 16,
                                   height // 16 * 16,
                                   cfg_weight=guidance,
                                   parallel_size=parallel_size,
                                   temperature=t2i_temperature)
        images = unpack(patches,
                        width // 16 * 16,
                        height // 16 * 16,
                        parallel_size=parallel_size)

        # return [Image.fromarray(images[i]).resize((768, 768), Image.LANCZOS) for i in range(parallel_size)]
        stime = time.time()
        ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
        print(f'upsample time: {time.time() - stime}')
        return ret_images


In [27]:
import torch
import time
from PIL import Image

print("Sending request to Janus Pro for image generation...")

refined_prompt="""The ideal cat-friendly environment in this room should be a cozy
and inviting space that caters to the cat's natural behaviors and preferences.
Here is a description of a realistic and visually engaging space that incorporates warmth,
natural lighting, cozy furniture, and interactive elements for cats:

1. **Warmth and Comfort:**
- The room is filled with soft, plush furniture like
the white cat bed and the cozy wooden shelf, providing a warm and comfortable
resting spot for the cats.
- The carpeted floor is soft and warm, allowing the cats to walk and play comfortably.

2. **Natural Lighting:**
- The room benefits from ample natural light, which is essential for the cats' well-being.
- The placement of the desk near the window ensures that the cats can enjoy
the sunlight while they rest or play.

3. **Interactive Elements:**
- The room is filled with interactive toys and scratching posts, such as the blue ball,
the multicolored tunnel, and the purple cat tunnel, which stimulate the cats'
natural hunting and playing instincts.
- The cats can climb and explore the cat tree, which provides a fun and engaging environment for them.

4. **Cozy Hiding Spots:**
- The room has several cozy hiding spots, such as the small cardboard boxes and
the soft fabric-covered objects, which provide the cats with a sense of security and privacy.
- The cats can retreat to these hiding spots when they feel threatened or want some alone time.

5. **Food and Water:**
- The cat food bowl is placed in a quiet and secure location, away from high-traffic areas,
to ensure the cats have easy access to their food and water.
- The room has a clean and clutter-free environment, which is essential for the cats'
health and well-being. By following these guidelines, the room can be made more cat-friendly,
catering to the cats' needs and preferences.
The room is a warm, inviting, and engaging space that promotes the cats' well-being and happiness."""

generated_images = generate_image(
    prompt=refined_prompt,
    seed=4937,                # Ensure reproducibility
    guidance=11,             # Strengthen adherence to prompt
    t2i_temperature=0.8     # Reduce randomness for better structure
)

output_path="./ideal_cat_room.png"

# Save and display the first generated image
final_image = generated_images[0]
final_image.save(output_path)
final_image.show()


print(f"The image saved at: {output_path}")

Sending request to Janus Pro for image generation...
upsample time: 8.002917051315308
The image saved at: ./ideal_cat_room.png


Clean up memory

In [28]:
import torch
import gc

gc.collect()
torch.cuda.empty_cache()

del vl_gpt
del tokenizer
gc.collect()

torch.cuda.empty_cache()
torch.cuda.synchronize()
for obj in list(globals().values()):
    if torch.is_tensor(obj):
        del obj
torch.cuda.empty_cache()