In [None]:
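# Dependencies — install these before running the notebook:
#   pip install funasr hyperpyyaml inflect openai-whisper datasets
#   pip install diffusers==0.33.0
#   vllm  (NOTE: no install command or version is given for this one — confirm)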

import torch
from transformers import AutoProcessor, GenerationConfig

from modeling_bailingmm import BailingMMNativeForConditionalGeneration

# Checkpoint to load. The Hugging Face Hub ID is kept for reference, but the
# assignment right below it overrides it: replace "your_model_path" with the
# location of your own checkpoint.
model_name = "inclusionAI/Ming-Lite-Omni-1.5"
model_name = "your_model_path"

# Load the model, then move it onto the GPU as a separate step.
model = BailingMMNativeForConditionalGeneration.from_pretrained(
    model_name,
    trust_remote_code=True,
    load_image_gen=True,
    low_cpu_mem_usage=True,  # minimize CPU memory during loading
    torch_dtype=torch.bfloat16,  # bfloat16 for memory efficiency
    attn_implementation="flash_attention_2",  # alternative: "eager"
)
model = model.to("cuda")


In [None]:
# build processor (loaded from the current working directory)
processor = AutoProcessor.from_pretrained(".", trust_remote_code=True)

# qa: a single-turn, text-only conversation
messages = [
    {
        "role": "HUMAN",
        "content": [{"type": "text", "text": "请详细介绍鹦鹉的生活习性。"}],
    },
]

# 1. Render the conversation into a prompt string via the chat template
text = processor.apply_chat_template(messages, add_generation_prompt=True)

# 2. Pull the image / video / audio payloads out of the messages
image_inputs, video_inputs, audio_inputs = processor.process_vision_info(messages)

# 3. Build the model inputs and place them on the model's device
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    audios=audio_inputs,
    return_tensors="pt",
).to(model.device)

# Only the multimodal feature entries are cast to bfloat16; every other
# entry keeps the dtype the processor produced.
for k in inputs.keys():
    if k in ("pixel_values", "pixel_values_videos", "audio_feats"):
        inputs[k] = inputs[k].to(dtype=torch.bfloat16)

# 4. Configure and run generation
generation_config = GenerationConfig.from_dict({"no_repeat_ngram_size": 10})
generated_ids, _ = model.generate(
    **inputs,
    generation_config=generation_config,
    eos_token_id=processor.gen_terminator,
    max_new_tokens=512,
    use_cache=True,
)

# Slice the first len(prompt) token ids off every output sequence.
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]

# 5. Decode the first (and only) sequence of the batch and show it
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(output_text)
# Output:

# 鹦鹉是一种非常聪明和社交性强的鸟类，它们的生活习性非常丰富和有趣。以下是一些关于鹦鹉生活习性的详细介绍：
# ### 1. **栖息地**
# 鹦鹉主要分布在热带和亚热带地区，包括非洲、亚洲、澳大利亚和南美洲。它们通常生活在森林、草原、沙漠和城市环境中。不同种类的鹦鹉对栖息地的要求有所不同，但大多数鹦鹉喜欢有丰富植被和水源的地方。
# ### 2. **饮食**
# 鹦鹉是杂食性动物，它们的饮食非常多样化。它们的食物包括种子、坚果、水果、蔬菜、花蜜和昆虫。鹦鹉的喙非常强壮，能够轻松地打开坚硬的果壳和坚果。一些鹦鹉还会吃泥土或沙子，以帮助消化和补充矿物质。
# ......

In [None]:
# Image generation mode currently limits the range of input pixels, so both
# pixel bounds below are pinned to the same value (451584 = 672 * 672).
gen_input_pixels = 451584
prompt = "Draw a girl with short hair."

# Load a processor instance and set its upper and lower pixel bounds
# (max first, then min) to that single value.
processor = AutoProcessor.from_pretrained(".", trust_remote_code=True)
processor.max_pixels = processor.min_pixels = gen_input_pixels

def generate(messages, processor, model, **image_gen_param):
    """Run ``model.generate`` in image-generation mode for one conversation.

    Args:
        messages: Chat messages handed to ``processor.apply_chat_template``
            and ``processor.process_vision_info``.
        processor: Object providing ``apply_chat_template``,
            ``process_vision_info`` and a ``__call__`` that builds the
            model inputs.
        model: Object exposing a ``device`` attribute and ``generate``.
        **image_gen_param: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (they are also printed before the call).

    Returns:
        The two values unpacked from ``model.generate``, as a tuple
        ``(image, attentions)``.
    """
    prompt_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    images, videos, audios = processor.process_vision_info(messages)

    batch = processor(
        text=[prompt_text],
        images=images,
        videos=videos,
        audios=audios,
        return_tensors="pt",
    )
    batch = batch.to(model.device)

    # Only these multimodal feature entries are cast to bfloat16; all other
    # entries keep the dtype the processor produced.
    bf16_keys = ("pixel_values", "pixel_values_videos", "audio_feats")
    for name in [key for key in batch.keys() if key in bf16_keys]:
        batch[name] = batch[name].to(dtype=torch.bfloat16)

    print(image_gen_param)
    generated_image, attn = model.generate(
        **batch,
        image_gen=True,
        **image_gen_param,
    )
    return generated_image, attn

# Single-turn, text-only request built from `prompt`.
messages = [{"role": "HUMAN", "content": [{"type": "text", "text": prompt}]}]

# The image_gen_* keywords are collected by generate() and forwarded to
# model.generate().
image, attentions = generate(
    messages,
    processor,
    model,
    image_gen_cfg=6.0,
    image_gen_steps=20,
    image_gen_width=480,
    image_gen_height=544,
)

# image.save("./t2i.jpg")  # uncomment to also write the result to disk
display(image)