In [1]:
from hmoe_buildkit import HMOEBuilder
model_names = [
    "HuggingFaceTB/SmolVLM-500M-Instruct",
    "meta-llama/Llama-3.2-1B",
    "Qwen/Qwen3-0.6B"
]
builder = HMOEBuilder(model_names, 2)

In [2]:
tokenizer, model = builder.build()

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [3]:
print(model)

HMOE(
  (vision_encoder): Idefics3Model(
    (vision_model): Idefics3VisionTransformer(
      (embeddings): Idefics3VisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): Idefics3Encoder(
        (layers): ModuleList(
          (0-11): 12 x Idefics3EncoderLayer(
            (self_attn): Idefics3VisionAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics3VisionMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=768, out_features=30

In [4]:
from transformers.image_utils import load_image
image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Can you describe this image?"}
        ]
    },
]

# Prepare inputs
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(text=prompt, images=[image], return_tensors="pt")

In [5]:
import torch
inputs = inputs.to('cuda')
model = model.to('cuda')
with torch.no_grad():
    outputs = model(**inputs)
outputs

CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.5859,  0.4727, -6.3125,  ..., -2.0156, -0.3633,  2.3281],
         [-6.2188, -2.6562, -1.3359,  ..., -3.4531, -0.2559, -0.0138],
         [-2.3906, -2.6250, -4.2500,  ..., -1.8594, -0.5195, -0.4453],
         ...,
         [-5.2188, -2.7188,  1.2812,  ...,  0.0234,  1.0156,  1.4766],
         [-1.6094, -2.3281, -2.1875,  ..., -0.1592,  1.8906,  0.5977],
         [-4.1875, -5.9688, -3.3438,  ...,  0.3789,  2.0781,  1.2969]]],
       device='cuda:0', dtype=torch.bfloat16), past_key_values=None, hidden_states=None, attentions=None)

In [6]:
with torch.no_grad():
    outputs = model.generate(**inputs)
tokenizer.batch_decode(outputs)

['<|im_start|>User:<fake_token_around_image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_im

In [7]:
tokenizer.save_pretrained('outputs/hmoe')
model.save_pretrained('outputs/hmoe')

In [None]:
import hmoe_buildkit
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('outputs/hmoe')
model = AutoModelForCausalLM.from_pretrained(
    'outputs/hmoe', 
    torch_dtype='bfloat16', 
    trust_remote_code=True
)
model

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HMOE(
  (vision_encoder): Idefics3Model(
    (vision_model): Idefics3VisionTransformer(
      (embeddings): Idefics3VisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): Idefics3Encoder(
        (layers): ModuleList(
          (0-11): 12 x Idefics3EncoderLayer(
            (self_attn): Idefics3VisionAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics3VisionMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=768, out_features=30

: 