In [None]:
import torch
from transformers import AutoTokenizer, Qwen3MoeConfig, Qwen3MoeForCausalLM
from transformers.utils import logging
from accelerate import init_empty_weights, infer_auto_device_map

# Optional: verbose transformers logging helps when debugging model loading.
logging.set_verbosity_info()

# Hugging Face Hub id of the checkpoint to load.
# Alternative checkpoint: huihui-ai/Huihui-MoE-5B-A1.7B-abliterated
model_path = "Qwen/Qwen3-30B-A3B"

# Step 1: Load the model config and the matching tokenizer.
config = Qwen3MoeConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Step 2: Load the real weights in bfloat16 and shard them across the visible
# GPUs. No separate `init_empty_weights()` pass is needed here:
# `device_map="auto"` works out the layer placement itself while loading, so
# building an extra meta-device copy of the model would only cost start-up
# time and be thrown away.
model = Qwen3MoeForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Make inference mode explicit (disables dropout). As the last expression of
# the cell this also displays the module tree.
model.eval()


loading configuration file config.json from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/config.json
Model config Qwen3MoeConfig {
  "architectures": [
    "Qwen3MoeForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "decoder_sparse_step": 1,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 48,
  "mlp_only_layers": [],
  "model_type": "qwen3_moe",
  "moe_intermediate_size": 768,
  "norm_topk_prob": true,
  "num_attention_heads": 32,
  "num_experts": 128,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 48,
  "num_key_value_heads": 4,
  "output_router_logits": false,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.001,
  "sliding_window": null,
  "

🚨 `use_probabilistic_routing` is part of Qwen3MoeModel.forward's signature, but not documented. Make sure to add it to the docstring of the function in /home/tkode/Desktop/transformers/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py.
🚨 `prob_routing_temp` is part of Qwen3MoeModel.forward's signature, but not documented. Make sure to add it to the docstring of the function in /home/tkode/Desktop/transformers/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py.
🚨 `use_probabilistic_routing` is part of Qwen3MoeForCausalLM.forward's signature, but not documented. Make sure to add it to the docstring of the function in /home/tkode/Desktop/transformers/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py.
🚨 `prob_routing_temp` is part of Qwen3MoeForCausalLM.forward's signature, but not documented. Make sure to add it to the docstring of the function in /home/tkode/Desktop/transformers/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py.


loading file vocab.json from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/vocab.json
loading file merges.txt from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/merges.txt
loading file tokenizer.json from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generate config GenerationConfig {
  "bos_

Init Qwen3MoeForCasualLM


loading configuration file config.json from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/config.json
Model config Qwen3MoeConfig {
  "architectures": [
    "Qwen3MoeForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "decoder_sparse_step": 1,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 48,
  "mlp_only_layers": [],
  "model_type": "qwen3_moe",
  "moe_intermediate_size": 768,
  "norm_topk_prob": true,
  "num_attention_heads": 32,
  "num_experts": 128,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 48,
  "num_key_value_heads": 4,
  "output_router_logits": false,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.001,
  "sliding_window": null,
  "

Init Qwen3MoeForCasualLM


Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Qwen3MoeForCausalLM.

All the weights of Qwen3MoeForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-30B-A3B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3MoeForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/tkode/.cache/huggingface/hub/models--Qwen--Qwen3-30B-A3B/snapshots/ae659febe817e4b3ebd7355f47792725801204c9/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95
}



Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayer(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x Qwen3MoeMLP(
              (gate_proj): Linear(in_features=2048, out_features=768, bias=False)
              (up_proj): Linear(in_features=2048, out_features=768, bias=False)
              (down_proj): Linear

In [3]:
# Show the module -> device placement that resulted from loading with `device_map="auto"`.
print(model.hf_device_map)


{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 2, 'model.layers.26': 2, 'model.layers.27': 2, 'model.layers.28': 2, 'model.layers.29': 2, 'model.layers.30': 2, 'model.layers.31': 2, 'model.layers.32': 2, 'model.layers.33': 2, 'model.layers.34': 2, 'model.layers.35': 2, 'model.layers.36': 2, 'model.layers.37': 2, 'model.layers.38': 3, 'model.layers.39': 3, 'model.layers.40': 3, 'model.layers.41': 3, 'model.layers.42': 3, 'model.layers.43': 3, 'model.layers.44

In [11]:
# The token ids must live on the device that holds the input embedding layer.
# Ask the model for that device instead of looking up a hard-coded device-map
# key: "transformer.wte" is GPT-2 naming and is not a key of this model's
# `hf_device_map` (which uses "model.embed_tokens"), so that lookup always fell
# back to "cuda:0" regardless of where the embeddings were actually placed.
first_device = model.get_input_embeddings().weight.device
device = first_device

prompt = 'Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn\'t really understand, and whose naivety is all the more striking in contrast with the natives.<br /><br />But I\'d have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. <br /><br />This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.' +' Is the prior text positive or negative? Give us a rating out of 10'

# Tokenize and move every tensor (input_ids, attention_mask, ...) next to the
# embedding layer. `inputs` is rebound unconditionally, so re-running this cell
# needs no manual cleanup of a previous value.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Run inference without tracking gradients.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        use_probabilistic_routing=True,
        prob_routing_temp=0.1,  # Avoid too low values, or it can crash
    )

# outputs[0] holds the prompt tokens followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForCasualLM
Forward Qwen3MoeForC