In [16]:
from hmoe_buildkit import HMOEBuilder
model_names = [
    "HuggingFaceTB/SmolVLM-500M-Instruct",
    "meta-llama/Llama-3.2-1B",
    "Qwen/Qwen3-0.6B"
]
builder = HMOEBuilder(model_names, 2)

In [17]:
tokenizer, model = builder.build()

In [18]:
print(model)

HMOE(
  (embed_tokens): Embedding(180222, 960)
  (vision_encoder): Idefics3Model(
    (vision_model): Idefics3VisionTransformer(
      (embeddings): Idefics3VisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): Idefics3Encoder(
        (layers): ModuleList(
          (0-11): 12 x Idefics3EncoderLayer(
            (self_attn): Idefics3VisionAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics3VisionMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1)

In [19]:
from transformers.image_utils import load_image
image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Can you describe this image?"}
        ]
    },
]

# Prepare inputs
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(text=prompt, images=[image], return_tensors="pt")

In [20]:
import torch
inputs = inputs.to('cuda')
model = model.to('cuda')
with torch.no_grad():
    outputs = model(**inputs)
outputs

CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.9219e+00,  1.0781e+00,  3.2656e+00,  ...,  2.7734e-01,
           3.5352e-01,  9.6680e-02],
         [ 1.0156e+00,  2.8125e+00,  1.5156e+00,  ...,  8.0859e-01,
           4.2915e-04, -2.8906e-01],
         [ 8.7500e-01,  1.6875e+00, -8.7109e-01,  ...,  6.0547e-01,
          -3.6523e-01, -4.1016e-01],
         ...,
         [ 2.0703e-01,  8.2422e-01,  1.9453e+00,  ..., -8.0078e-01,
          -4.2969e-01,  3.6328e-01],
         [-3.1641e-01,  7.2656e-01,  1.6016e+00,  ..., -4.3359e-01,
           2.0410e-01,  2.4023e-01],
         [ 4.2969e-02,  2.3682e-02,  4.5117e-01,  ..., -1.9434e-01,
           3.9258e-01, -7.5195e-02]]], device='cuda:0', dtype=torch.bfloat16), past_key_values=None, hidden_states=None, attentions=None)

In [21]:
with torch.no_grad():
    outputs = model.generate(**inputs)
tokenizer.batch_decode(outputs)

['<|im_start|>User:<fake_token_around_image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_im