From 658f5d5448658b8bc6ad60458264bd9f6b00a89d Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 00:04:56 -0500 Subject: [PATCH 1/3] Qwen 3.5 architecture adapter --- tests/unit/test_qwen3_next_adapter.py | 842 ++++++++++++++++++ .../factories/architecture_adapter_factory.py | 2 + .../supported_architectures/__init__.py | 4 + .../supported_architectures/qwen3_next.py | 146 +++ .../tools/model_registry/__init__.py | 1 + .../model_registry/data/supported_models.json | 29 +- 6 files changed, 1022 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_qwen3_next_adapter.py create mode 100644 transformer_lens/model_bridge/supported_architectures/qwen3_next.py diff --git a/tests/unit/test_qwen3_next_adapter.py b/tests/unit/test_qwen3_next_adapter.py new file mode 100644 index 000000000..051a8310b --- /dev/null +++ b/tests/unit/test_qwen3_next_adapter.py @@ -0,0 +1,842 @@ +"""Unit tests for the Qwen3Next architecture adapter (Phases A through D). + +Tests cover: +1. Registration: adapter importable, in SUPPORTED_ARCHITECTURES, in HF_SUPPORTED_ARCHITECTURES +2. Config extraction: convert_hf_model_config produces correct config for Qwen3NextForCausalLM +3. _get_partial_rotary_factor helper: reads from rope_parameters dict only (not top-level) +4. Component mapping: correct bridge hierarchy with only universal submodules (no self_attn) +5. Weight conversions: preprocess_weights correctly slices q_proj.weight per-head +6. 
Integration: end-to-end tests with a tiny programmatically-constructed model +""" + +from unittest import mock + +import pytest + +from transformer_lens.factories.architecture_adapter_factory import ( + SUPPORTED_ARCHITECTURES, +) +from transformer_lens.tools.model_registry import HF_SUPPORTED_ARCHITECTURES + +# ============================================================================ +# Test: Registration +# ============================================================================ + + +class TestQwen3NextRegistration: + """Verify the adapter is properly registered in all lookup tables.""" + + def test_adapter_importable(self): + """Qwen3NextArchitectureAdapter must be importable.""" + from transformer_lens.model_bridge.supported_architectures import ( + Qwen3NextArchitectureAdapter, + ) + + assert Qwen3NextArchitectureAdapter is not None + + def test_in_supported_architectures(self): + """Qwen3NextForCausalLM must be in SUPPORTED_ARCHITECTURES.""" + assert "Qwen3NextForCausalLM" in SUPPORTED_ARCHITECTURES + + def test_in_hf_supported_architectures(self): + """Qwen3NextForCausalLM must be in HF_SUPPORTED_ARCHITECTURES.""" + assert "Qwen3NextForCausalLM" in HF_SUPPORTED_ARCHITECTURES + + def test_adapter_class_correct(self): + """The adapter class must be Qwen3NextArchitectureAdapter.""" + from transformer_lens.model_bridge.supported_architectures import ( + Qwen3NextArchitectureAdapter, + ) + + assert SUPPORTED_ARCHITECTURES["Qwen3NextForCausalLM"] is Qwen3NextArchitectureAdapter + + +# ============================================================================ +# Helpers: mock HF config +# ============================================================================ + + +def _make_hf_config( + *, + hidden_size: int = 2048, + num_attention_heads: int = 8, + num_key_value_heads: int = 2, + head_dim: int = 256, + intermediate_size: int = 6144, + num_hidden_layers: int = 24, + vocab_size: int = 248320, + rms_norm_eps: float = 1e-6, + hidden_act: str = "silu", + 
tie_word_embeddings: bool = False, + rope_parameters: dict | None = None, +) -> mock.Mock: + """Create a minimal mock HuggingFace config for Qwen3NextForCausalLM. + + Uses spec=[] so only explicitly assigned attributes exist. This prevents + mock.Mock() from auto-creating attributes (like rope_theta) that would + interfere with beartype-validated helpers like _get_rope_theta(). + """ + if rope_parameters is None: + rope_parameters = { + "rope_theta": 10000.0, + "partial_rotary_factor": 0.25, + "rope_type": "default", + } + cfg = mock.Mock(spec=[]) + cfg.architectures = ["Qwen3NextForCausalLM"] + cfg.hidden_size = hidden_size + cfg.num_attention_heads = num_attention_heads + cfg.num_key_value_heads = num_key_value_heads + cfg.head_dim = head_dim + cfg.intermediate_size = intermediate_size + cfg.num_hidden_layers = num_hidden_layers + cfg.vocab_size = vocab_size + cfg.rms_norm_eps = rms_norm_eps + cfg.hidden_act = hidden_act + cfg.tie_word_embeddings = tie_word_embeddings + cfg.rope_parameters = rope_parameters + return cfg + + +# ============================================================================ +# Test: Config extraction +# ============================================================================ + + +class TestQwen3NextConfigExtraction: + """Verify convert_hf_model_config extracts all fields correctly.""" + + def _extract_config(self, hf_config: mock.Mock) -> dict: + """Run convert_hf_model_config with a mocked AutoConfig and model name lookup.""" + from transformer_lens.loading_from_pretrained import convert_hf_model_config + + model_name = "Qwen/Qwen3-Next-80B-A3B" + with mock.patch( + "transformer_lens.loading_from_pretrained.AutoConfig.from_pretrained", + return_value=hf_config, + ), mock.patch( + "transformer_lens.loading_from_pretrained.get_official_model_name", + return_value=model_name, + ): + return convert_hf_model_config(model_name) + + def test_basic_dimensions(self): + """d_model, n_heads, n_layers, d_mlp, d_vocab extracted correctly.""" 
+ hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["d_model"] == 2048 + assert cfg["n_heads"] == 8 + assert cfg["n_layers"] == 24 + assert cfg["d_mlp"] == 6144 + assert cfg["d_vocab"] == 248320 + + def test_head_dim(self): + """d_head reads from hf_config.head_dim directly.""" + hf_config = _make_hf_config(head_dim=256) + cfg = self._extract_config(hf_config) + + assert cfg["d_head"] == 256 + + def test_n_key_value_heads_gqa(self): + """n_key_value_heads is set when num_key_value_heads != num_attention_heads (GQA).""" + hf_config = _make_hf_config(num_attention_heads=8, num_key_value_heads=2) + cfg = self._extract_config(hf_config) + + assert cfg["n_key_value_heads"] == 2 + + def test_n_key_value_heads_mha(self): + """n_key_value_heads is None when num_key_value_heads == num_attention_heads (MHA).""" + hf_config = _make_hf_config(num_attention_heads=8, num_key_value_heads=8) + cfg = self._extract_config(hf_config) + + assert cfg["n_key_value_heads"] is None + + def test_n_ctx_is_2048(self): + """n_ctx is hardcoded to 2048 (safe cap for 262144 max).""" + hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["n_ctx"] == 2048 + + def test_eps(self): + """eps reads from rms_norm_eps.""" + hf_config = _make_hf_config(rms_norm_eps=1e-6) + cfg = self._extract_config(hf_config) + + assert cfg["eps"] == 1e-6 + + def test_rotary_base_from_rope_parameters(self): + """rotary_base reads rope_theta from rope_parameters dict.""" + hf_config = _make_hf_config( + rope_parameters={ + "rope_theta": 10000000.0, + "partial_rotary_factor": 0.25, + "rope_type": "default", + } + ) + cfg = self._extract_config(hf_config) + + assert cfg["rotary_base"] == 10000000 + + def test_rotary_dim_partial_factor_0_25(self): + """rotary_dim = int(head_dim * partial_rotary_factor). + + With partial_rotary_factor=0.25 and head_dim=256, expect rotary_dim=64. 
+ """ + hf_config = _make_hf_config( + head_dim=256, + rope_parameters={ + "rope_theta": 10000.0, + "partial_rotary_factor": 0.25, + "rope_type": "default", + }, + ) + cfg = self._extract_config(hf_config) + + assert cfg["rotary_dim"] == 64 + + def test_rotary_adjacent_pairs_false(self): + """rotary_adjacent_pairs must be False.""" + hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["rotary_adjacent_pairs"] is False + + def test_flags(self): + """final_rms, gated_mlp, use_qk_norm, use_attn_scale all True; default_prepend_bos False.""" + hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["final_rms"] is True + assert cfg["gated_mlp"] is True + assert cfg["use_qk_norm"] is True + assert cfg["use_attn_scale"] is True + assert cfg["default_prepend_bos"] is False + + def test_tie_word_embeddings(self): + """tie_word_embeddings reads from hf_config.""" + hf_config = _make_hf_config(tie_word_embeddings=False) + cfg = self._extract_config(hf_config) + + assert cfg["tie_word_embeddings"] is False + + def test_trust_remote_code(self): + """trust_remote_code must be True.""" + hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["trust_remote_code"] is True + + def test_normalization_type_rms(self): + """normalization_type is 'RMS'.""" + hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["normalization_type"] == "RMS" + + def test_positional_embedding_type_rotary(self): + """positional_embedding_type is 'rotary'.""" + hf_config = _make_hf_config() + cfg = self._extract_config(hf_config) + + assert cfg["positional_embedding_type"] == "rotary" + + +# ============================================================================ +# Test: _get_partial_rotary_factor helper +# ============================================================================ + + +class TestGetPartialRotaryFactor: + """Verify _get_partial_rotary_factor reads from 
rope_parameters dict only.""" + + def test_reads_from_rope_parameters(self): + """partial_rotary_factor is read from rope_parameters dict.""" + from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor + + cfg = mock.Mock() + cfg.rope_parameters = {"partial_rotary_factor": 0.25} + # Top-level attribute should NOT be consulted + cfg.partial_rotary_factor = 0.99 # wrong value — must not be used + + result = _get_partial_rotary_factor(cfg) + assert result == 0.25 + + def test_fallback_when_rope_parameters_missing(self): + """Returns 1.0 when rope_parameters is absent.""" + from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor + + cfg = mock.Mock(spec=[]) # no attributes at all + + result = _get_partial_rotary_factor(cfg) + assert result == 1.0 + + def test_fallback_when_partial_rotary_factor_not_in_dict(self): + """Returns 1.0 when rope_parameters exists but lacks partial_rotary_factor. + + This is the key correctness test: a config that has partial_rotary_factor + as a top-level attribute but NOT in rope_parameters must return 1.0 (the + fallback), not 0.5. This verifies we only read from the dict. 
+ """ + from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor + + cfg = mock.Mock() + cfg.rope_parameters = {} # no partial_rotary_factor key + cfg.partial_rotary_factor = 0.5 # top-level only — must NOT be used + + result = _get_partial_rotary_factor(cfg) + assert result == 1.0 + + def test_custom_default(self): + """Custom default is returned when rope_parameters is absent.""" + from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor + + cfg = mock.Mock(spec=[]) + + result = _get_partial_rotary_factor(cfg, default=0.5) + assert result == 0.5 + + def test_non_dict_rope_parameters_uses_default(self): + """Returns default when rope_parameters is not a dict.""" + from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor + + cfg = mock.Mock() + cfg.rope_parameters = "not_a_dict" + + result = _get_partial_rotary_factor(cfg) + assert result == 1.0 + + +# ============================================================================ +# Helpers: TransformerBridgeConfig for adapter instantiation +# ============================================================================ + + +def _make_bridge_cfg(**overrides): + """Create a minimal TransformerBridgeConfig for Qwen3Next adapter tests.""" + from transformer_lens.config.TransformerBridgeConfig import TransformerBridgeConfig + + defaults = dict( + d_model=2048, + d_head=256, + n_heads=8, + n_layers=24, + n_ctx=2048, + d_vocab=248320, + n_key_value_heads=2, + architecture="Qwen3NextForCausalLM", + ) + defaults.update(overrides) + return TransformerBridgeConfig(**defaults) + + +# ============================================================================ +# Test: Component Mapping (Phase B) +# ============================================================================ + + +class TestQwen3NextComponentMapping: + """Verify the component_mapping structure for Qwen3Next. 
+ + The key invariant: self_attn is NOT mapped as a block submodule because + linear-attention layers lack self_attn, and get_remote_component raises + AttributeError for missing attributes (verified in architecture_adapter.py). + Only universally present submodules (norms, MLP) are mapped. + """ + + @pytest.fixture + def adapter(self): + from transformer_lens.model_bridge.supported_architectures.qwen3_next import ( + Qwen3NextArchitectureAdapter, + ) + + cfg = _make_bridge_cfg() + return Qwen3NextArchitectureAdapter(cfg) + + # ---- Top-level keys ---- + + def test_component_mapping_keys(self, adapter): + """component_mapping must have exactly the expected top-level keys.""" + assert set(adapter.component_mapping.keys()) == { + "embed", + "rotary_emb", + "blocks", + "ln_final", + "unembed", + } + + # ---- HF path names ---- + + def test_embed_path(self, adapter): + """embed maps to model.embed_tokens.""" + assert adapter.component_mapping["embed"].name == "model.embed_tokens" + + def test_rotary_emb_path(self, adapter): + """rotary_emb maps to model.rotary_emb.""" + assert adapter.component_mapping["rotary_emb"].name == "model.rotary_emb" + + def test_blocks_path(self, adapter): + """blocks maps to model.layers.""" + assert adapter.component_mapping["blocks"].name == "model.layers" + + def test_ln_final_path(self, adapter): + """ln_final maps to model.norm.""" + assert adapter.component_mapping["ln_final"].name == "model.norm" + + def test_unembed_path(self, adapter): + """unembed maps to lm_head.""" + assert adapter.component_mapping["unembed"].name == "lm_head" + + # ---- Block submodules ---- + + def test_block_submodules_keys(self, adapter): + """blocks submodules must contain ln1, ln2, mlp but NOT attn. + + This is a critical correctness test: self_attn is absent on + linear-attention layers, so mapping attn as a block submodule + would crash on those layers. 
+ """ + submodules = adapter.component_mapping["blocks"].submodules + assert set(submodules.keys()) == {"ln1", "ln2", "mlp"} + + def test_no_attn_in_block_submodules(self, adapter): + """attn must NOT appear as a block submodule (hybrid architecture safety check).""" + submodules = adapter.component_mapping["blocks"].submodules + assert "attn" not in submodules + + def test_ln1_path(self, adapter): + """ln1 maps to input_layernorm.""" + submodules = adapter.component_mapping["blocks"].submodules + assert submodules["ln1"].name == "input_layernorm" + + def test_ln2_path(self, adapter): + """ln2 maps to post_attention_layernorm.""" + submodules = adapter.component_mapping["blocks"].submodules + assert submodules["ln2"].name == "post_attention_layernorm" + + def test_mlp_path(self, adapter): + """mlp maps to mlp.""" + submodules = adapter.component_mapping["blocks"].submodules + assert submodules["mlp"].name == "mlp" + + # ---- MLP submodules ---- + + def test_mlp_submodules_keys(self, adapter): + """mlp submodules must be gate, in, out.""" + mlp = adapter.component_mapping["blocks"].submodules["mlp"] + assert set(mlp.submodules.keys()) == {"gate", "in", "out"} + + def test_mlp_gate_path(self, adapter): + """mlp.gate maps to gate_proj.""" + mlp = adapter.component_mapping["blocks"].submodules["mlp"] + assert mlp.submodules["gate"].name == "gate_proj" + + def test_mlp_in_path(self, adapter): + """mlp.in maps to up_proj.""" + mlp = adapter.component_mapping["blocks"].submodules["mlp"] + assert mlp.submodules["in"].name == "up_proj" + + def test_mlp_out_path(self, adapter): + """mlp.out maps to down_proj.""" + mlp = adapter.component_mapping["blocks"].submodules["mlp"] + assert mlp.submodules["out"].name == "down_proj" + + # ---- Bridge types ---- + + def test_mlp_bridge_type(self, adapter): + """mlp uses GatedMLPBridge.""" + from transformer_lens.model_bridge.generalized_components import GatedMLPBridge + + mlp = adapter.component_mapping["blocks"].submodules["mlp"] + 
assert isinstance(mlp, GatedMLPBridge) + + def test_ln1_bridge_type(self, adapter): + """ln1 uses RMSNormalizationBridge.""" + from transformer_lens.model_bridge.generalized_components import ( + RMSNormalizationBridge, + ) + + ln1 = adapter.component_mapping["blocks"].submodules["ln1"] + assert isinstance(ln1, RMSNormalizationBridge) + + def test_ln2_bridge_type(self, adapter): + """ln2 uses RMSNormalizationBridge.""" + from transformer_lens.model_bridge.generalized_components import ( + RMSNormalizationBridge, + ) + + ln2 = adapter.component_mapping["blocks"].submodules["ln2"] + assert isinstance(ln2, RMSNormalizationBridge) + + def test_blocks_bridge_type(self, adapter): + """blocks uses BlockBridge.""" + from transformer_lens.model_bridge.generalized_components import BlockBridge + + assert isinstance(adapter.component_mapping["blocks"], BlockBridge) + + def test_rotary_emb_bridge_type(self, adapter): + """rotary_emb uses RotaryEmbeddingBridge.""" + from transformer_lens.model_bridge.generalized_components import ( + RotaryEmbeddingBridge, + ) + + assert isinstance(adapter.component_mapping["rotary_emb"], RotaryEmbeddingBridge) + + # ---- weight_processing_conversions ---- + + def test_weight_processing_conversions_empty(self, adapter): + """weight_processing_conversions is empty (no attention submodules mapped).""" + assert adapter.weight_processing_conversions == {} + + +# ============================================================================ +# Test: Weight Conversions (Phase C) +# ============================================================================ + + +class TestQwen3NextWeightConversions: + """Verify preprocess_weights correctly slices q_proj.weight per-head. + + Background: In Qwen3Next, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size) + where rows are organized as interleaved per-head pairs: + head_0_query (d_head rows), head_0_gate (d_head rows), + head_1_query (d_head rows), head_1_gate (d_head rows), ... 
+ + A naive first-half slice would be wrong. The correct approach reshapes by + head and takes only the first d_head rows per head (the query half). + """ + + N_HEADS = 4 + D_HEAD = 8 + HIDDEN_SIZE = 32 + + @pytest.fixture + def adapter(self): + from transformer_lens.model_bridge.supported_architectures.qwen3_next import ( + Qwen3NextArchitectureAdapter, + ) + + cfg = _make_bridge_cfg( + n_heads=self.N_HEADS, + d_head=self.D_HEAD, + d_model=self.HIDDEN_SIZE, + n_key_value_heads=self.N_HEADS, # MHA for simplicity + ) + return Qwen3NextArchitectureAdapter(cfg) + + def _make_q_proj_weight(self): + """Create a q_proj.weight tensor with distinct per-head-row values. + + Shape: (n_heads * d_head * 2, hidden_size) + Each row is filled with a unique integer so we can verify which rows + were selected after slicing. + """ + import torch + + total_rows = self.N_HEADS * self.D_HEAD * 2 + w = torch.zeros(total_rows, self.HIDDEN_SIZE) + for row_idx in range(total_rows): + w[row_idx] = float(row_idx) + return w + + def test_q_proj_output_shape(self, adapter): + """preprocess_weights reduces q_proj rows from n_heads*d_head*2 to n_heads*d_head.""" + import torch + + w = self._make_q_proj_weight() + state_dict = {"model.layers.3.self_attn.q_proj.weight": w} + + result = adapter.preprocess_weights(state_dict) + out = result["model.layers.3.self_attn.q_proj.weight"] + + assert out.shape == (self.N_HEADS * self.D_HEAD, self.HIDDEN_SIZE) + + def test_q_proj_selects_query_rows_not_naive_first_half(self, adapter): + """For each head i, output rows [i*d_head : (i+1)*d_head] == input rows + [i*d_head*2 : i*d_head*2 + d_head]. + + This verifies the per-head reshape: a naive slice of the first half would + incorrectly include gate rows from later heads. 
+ """ + import torch + + w = self._make_q_proj_weight() + state_dict = {"model.layers.0.self_attn.q_proj.weight": w} + + result = adapter.preprocess_weights(state_dict) + out = result["model.layers.0.self_attn.q_proj.weight"] + + for head_idx in range(self.N_HEADS): + out_rows = out[head_idx * self.D_HEAD : (head_idx + 1) * self.D_HEAD] + # Per-head interleaved layout: query rows for head i start at i*(d_head*2) + expected_start = head_idx * self.D_HEAD * 2 + expected_rows = w[expected_start : expected_start + self.D_HEAD] + assert torch.equal(out_rows, expected_rows), ( + f"Head {head_idx}: output rows do not match expected query rows. " + f"Got row values starting at {out_rows[0, 0].item()}, " + f"expected starting at {expected_rows[0, 0].item()}" + ) + + def test_naive_slice_would_be_wrong(self, adapter): + """Demonstrate that a naive first-half slice gives different (wrong) results. + + This documents the correctness invariant: the interleaved layout means + naive slicing includes gate rows from intermediate heads. 
+ """ + import torch + + w = self._make_q_proj_weight() + state_dict = {"model.layers.0.self_attn.q_proj.weight": w} + + result = adapter.preprocess_weights(state_dict) + correct_out = result["model.layers.0.self_attn.q_proj.weight"] + + # Naive first half: just take the top n_heads*d_head rows + naive_out = w[: self.N_HEADS * self.D_HEAD] + + # They should differ (unless n_heads==1, where both produce the same result) + if self.N_HEADS > 1: + assert not torch.equal(correct_out, naive_out), ( + "Naive first-half slice gave the same result as per-head slice — " + "test setup may be wrong" + ) + + def test_non_q_proj_weights_unchanged(self, adapter): + """k_proj, v_proj, and down_proj weights are NOT modified by preprocess_weights.""" + import torch + + k_proj = torch.randn(self.N_HEADS * self.D_HEAD, self.HIDDEN_SIZE) + down_proj = torch.randn(self.HIDDEN_SIZE, self.N_HEADS * self.D_HEAD) + state_dict = { + "model.layers.0.self_attn.k_proj.weight": k_proj.clone(), + "model.layers.0.mlp.down_proj.weight": down_proj.clone(), + } + + result = adapter.preprocess_weights(state_dict) + + assert torch.equal(result["model.layers.0.self_attn.k_proj.weight"], k_proj) + assert torch.equal(result["model.layers.0.mlp.down_proj.weight"], down_proj) + + def test_multiple_layers_all_processed(self, adapter): + """q_proj.weight tensors across multiple layers are all sliced correctly.""" + import torch + + w0 = self._make_q_proj_weight() + w3 = self._make_q_proj_weight() * 2 # distinct values to catch cross-layer bugs + + state_dict = { + "model.layers.0.self_attn.q_proj.weight": w0, + "model.layers.3.self_attn.q_proj.weight": w3, + } + + result = adapter.preprocess_weights(state_dict) + + expected_shape = (self.N_HEADS * self.D_HEAD, self.HIDDEN_SIZE) + assert result["model.layers.0.self_attn.q_proj.weight"].shape == expected_shape + assert result["model.layers.3.self_attn.q_proj.weight"].shape == expected_shape + + def test_empty_state_dict_returns_empty(self, adapter): + 
"""preprocess_weights with an empty state dict returns an empty dict.""" + result = adapter.preprocess_weights({}) + assert result == {} + + def test_state_dict_without_q_proj_unchanged(self, adapter): + """A state dict with no q_proj keys is returned unmodified.""" + import torch + + state_dict = { + "model.embed_tokens.weight": torch.randn(100, self.HIDDEN_SIZE), + } + original_keys = set(state_dict.keys()) + + result = adapter.preprocess_weights(state_dict) + + assert set(result.keys()) == original_keys + + def test_weight_processing_conversions_is_empty_dict(self, adapter): + """weight_processing_conversions is {} — q_proj slicing is done in preprocess_weights.""" + assert adapter.weight_processing_conversions == {} + + +# ============================================================================ +# Test: Integration (Phase D) +# ============================================================================ + +try: + from transformers import Qwen3NextConfig, Qwen3NextForCausalLM + + _QWEN3NEXT_AVAILABLE = True +except ImportError: + _QWEN3NEXT_AVAILABLE = False + + +def _make_tiny_hf_model(): + """Create a tiny Qwen3Next model for integration testing. + + Uses num_experts=0 to force dense (non-MoE) MLP across all layers. + The adapter only maps universally-present submodules (norms + MLP), so + this works regardless of the layer type (linear_attention or full_attention). 
+ + Config details: + - 8 layers: layers 3 and 7 are full-attention (full_attention_interval=4) + - All other layers are linear_attention + - dense MLP on all layers (num_experts=0) + """ + cfg = Qwen3NextConfig( + hidden_size=128, + num_hidden_layers=8, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=32, + intermediate_size=256, + vocab_size=512, + rms_norm_eps=1e-6, + hidden_act="silu", + full_attention_interval=4, + linear_conv_kernel_dim=4, + linear_key_head_dim=32, + linear_value_head_dim=32, + linear_num_key_heads=4, + linear_num_value_heads=4, + num_experts=0, + rope_parameters={ + "rope_theta": 10000.0, + "partial_rotary_factor": 0.25, + "rope_type": "default", + }, + ) + model = Qwen3NextForCausalLM(cfg) + model.eval() + return model + + +def _make_tiny_bridge(): + """Create a Qwen3Next bridge from a tiny HF model.""" + from unittest.mock import MagicMock + + from transformer_lens.config.TransformerBridgeConfig import TransformerBridgeConfig + from transformer_lens.model_bridge import TransformerBridge + from transformer_lens.model_bridge.supported_architectures.qwen3_next import ( + Qwen3NextArchitectureAdapter, + ) + + hf_model = _make_tiny_hf_model() + + bridge_cfg = TransformerBridgeConfig( + d_model=128, + d_head=32, + n_heads=4, + n_layers=8, + n_ctx=2048, + d_vocab=512, + n_key_value_heads=2, + architecture="Qwen3NextForCausalLM", + ) + adapter = Qwen3NextArchitectureAdapter(bridge_cfg) + return TransformerBridge(hf_model, adapter, tokenizer=MagicMock()), hf_model + + +@pytest.mark.skipif( + not _QWEN3NEXT_AVAILABLE, + reason="Qwen3NextForCausalLM not available in installed transformers", +) +class TestQwen3NextIntegration: + """End-to-end integration tests using a tiny programmatic Qwen3Next model. + + Tests use num_experts=0 (dense MLP) to avoid the MoE layer structure, which + requires flash-linear-attention and causal-conv1d libraries not needed here. 
+ """ + + @pytest.fixture(scope="class") + def bridge_and_model(self): + """Create a tiny bridge + HF model pair, shared across the class.""" + return _make_tiny_bridge() + + @pytest.fixture(scope="class") + def bridge(self, bridge_and_model): + br, _ = bridge_and_model + return br + + @pytest.fixture(scope="class") + def hf_model(self, bridge_and_model): + _, hf = bridge_and_model + return hf + + def test_bridge_creation(self, bridge): + """TransformerBridge construction from a tiny Qwen3Next model must succeed.""" + from transformer_lens.model_bridge import TransformerBridge + + assert isinstance(bridge, TransformerBridge) + + def test_hook_names_present(self, bridge): + """Key hook names must be present in the bridge hook_dict. + + Verified hook names: + - blocks.0.hook_resid_pre: present on linear-attention layer (layer 0) + - blocks.3.hook_resid_pre: present on first full-attention layer (layer 3) + - blocks.0.ln1.*: norm is present on all layers (universal submodule) + - blocks.0.mlp.*: MLP is present on all layers (universal submodule) + + Also verifies that blocks.0.attn.* is NOT present — self_attn is only on + full-attention layers, so it is NOT mapped as a block submodule. 
+ """ + hook_keys = set(bridge.hook_dict.keys()) + + # Block-level residual hooks exist on all layers + assert "blocks.0.hook_resid_pre" in hook_keys, "linear-attn layer must have hook_resid_pre" + assert "blocks.3.hook_resid_pre" in hook_keys, "full-attn layer must have hook_resid_pre" + + # Norm hooks present on all layers + assert any( + "blocks.0.ln1" in k for k in hook_keys + ), "blocks.0.ln1 submodule hooks must be present" + + # MLP hooks present on all layers + assert any( + "blocks.0.mlp" in k for k in hook_keys + ), "blocks.0.mlp submodule hooks must be present" + + # No attn bridge — self_attn is absent on linear-attention layers + assert not any( + "blocks.0.attn" in k for k in hook_keys + ), "blocks.0.attn hooks must NOT be present (hybrid architecture)" + + def test_forward_pass_consistency(self, bridge, hf_model): + """Bridge output logits must match HF model output logits to within atol=1e-4.""" + import torch + + tokens = torch.randint(0, 512, (1, 4)) + with torch.no_grad(): + hf_logits = hf_model(tokens).logits + bridge_logits = bridge(tokens) + + assert ( + hf_logits.shape == bridge_logits.shape + ), f"Shape mismatch: HF={hf_logits.shape}, bridge={bridge_logits.shape}" + assert torch.allclose( + hf_logits, bridge_logits, atol=1e-4 + ), f"Logit mismatch: max diff = {(hf_logits - bridge_logits).abs().max().item():.6f}" + + def test_hook_activation_shapes(self, bridge): + """A hook added on blocks.0.mlp.hook_out must capture a (batch, seq, d_model) tensor.""" + import torch + + captured: list[torch.Tensor] = [] + + def capture_hook(tensor: torch.Tensor, hook: object) -> torch.Tensor: + captured.append(tensor.detach().clone()) + return tensor + + tokens = torch.randint(0, 512, (1, 4)) + with torch.no_grad(): + bridge.run_with_hooks(tokens, fwd_hooks=[("blocks.0.mlp.hook_out", capture_hook)]) + + assert len(captured) == 1, "Hook must fire exactly once per forward pass" + output = captured[0] + batch, seq, d_model = 1, 4, 128 + assert output.shape == ( 
+ batch, + seq, + d_model, + ), f"Expected MLP output shape ({batch}, {seq}, {d_model}), got {output.shape}" diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py index 458d1b073..b5981c84f 100644 --- a/transformer_lens/factories/architecture_adapter_factory.py +++ b/transformer_lens/factories/architecture_adapter_factory.py @@ -43,6 +43,7 @@ PhiArchitectureAdapter, Qwen2ArchitectureAdapter, Qwen3ArchitectureAdapter, + Qwen3NextArchitectureAdapter, QwenArchitectureAdapter, StableLmArchitectureAdapter, T5ArchitectureAdapter, @@ -88,6 +89,7 @@ "QwenForCausalLM": QwenArchitectureAdapter, "Qwen2ForCausalLM": Qwen2ArchitectureAdapter, "Qwen3ForCausalLM": Qwen3ArchitectureAdapter, + "Qwen3NextForCausalLM": Qwen3NextArchitectureAdapter, "StableLmForCausalLM": StableLmArchitectureAdapter, "T5ForConditionalGeneration": T5ArchitectureAdapter, "NanoGPTForCausalLM": NanogptArchitectureAdapter, diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index 2c32f6b38..e8cc60969 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -120,6 +120,9 @@ from transformer_lens.model_bridge.supported_architectures.qwen3 import ( Qwen3ArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.qwen3_next import ( + Qwen3NextArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.stablelm import ( StableLmArchitectureAdapter, ) @@ -167,6 +170,7 @@ "QwenArchitectureAdapter", "Qwen2ArchitectureAdapter", "Qwen3ArchitectureAdapter", + "Qwen3NextArchitectureAdapter", "StableLmArchitectureAdapter", "T5ArchitectureAdapter", ] diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py new file 
"""Qwen3Next architecture adapter.

Qwen3NextForCausalLM is a hybrid linear-attention + full-attention architecture.
Layers alternate between GatedDeltaNet (linear attention) and standard full
attention blocks, with a shared MLP on every layer.

Since self_attn is absent on linear-attention layers, we only map submodules
that exist on ALL layers (norms, MLP). The HF native forward handles
linear/full attention dispatch internally.

Hook coverage:
- Block-level: hook_resid_pre, hook_resid_post on every layer
- Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm)
- MLP: gate, in, out hooks
- Attention internals are NOT individually hooked (self_attn absent on
  linear-attention layers; mapping it would crash on those layers)

Optional parameters:
- n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads)
"""

from typing import Any

import torch

from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
from transformer_lens.model_bridge.generalized_components import (
    BlockBridge,
    EmbeddingBridge,
    GatedMLPBridge,
    LinearBridge,
    RMSNormalizationBridge,
    RotaryEmbeddingBridge,
    UnembeddingBridge,
)


class Qwen3NextArchitectureAdapter(ArchitectureAdapter):
    """Architecture adapter for Qwen3Next models.

    Qwen3NextForCausalLM is a hybrid linear-attention + full-attention
    architecture sharing the same design as Qwen3.5:
    - Uses RMSNorm for all normalizations
    - Uses rotary position embeddings (RoPE) with partial rotation
    - Every 4th layer is a full-attention layer (self_attn); the rest are
      GatedDeltaNet linear-attention layers (linear_attn)
    - Uses gated MLP (gate_proj + up_proj -> down_proj) on ALL layers
    - No biases on any linear layers
    - Full-attention layers have Q/K normalization (q_norm, k_norm)
    - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved
      query+gate layout); the preprocess_weights method slices the query half

    Since self_attn is absent on linear-attention layers, only universally
    present submodules (norms, MLP) are mapped as block submodules. The HF
    native forward handles per-layer dispatch internally.

    Optional parameters:
    - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA)
    """

    def __init__(self, cfg: Any) -> None:
        """Initialize the Qwen3Next architecture adapter.

        Sets the architecture-level config flags on ``self.cfg`` and builds the
        component mapping from bridge names to HF module paths.

        Args:
            cfg: Bridge configuration object; forwarded to the base adapter.
        """
        super().__init__(cfg)

        # Core config attributes
        self.cfg.normalization_type = "RMS"
        self.cfg.positional_embedding_type = "rotary"
        self.cfg.final_rms = True
        self.cfg.gated_mlp = True
        self.cfg.attn_only = False
        self.cfg.uses_rms_norm = True
        self.cfg.default_prepend_bos = False

        # Force eager attention: SDPA doesn't support output_attentions, which
        # hook_attn_scores / hook_pattern rely on.
        # NOTE(review): this adapter does not map attention as a bridge
        # submodule, so confirm those hooks are actually reachable here.
        self.cfg.attn_implementation = "eager"

        # GQA: only set n_key_value_heads when using grouped-query attention
        if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None:
            self.cfg.n_key_value_heads = cfg.n_key_value_heads

        # No per-weight conversions are registered for this architecture.
        self.weight_processing_conversions: dict = {}
        self.component_mapping: dict = {
            "embed": EmbeddingBridge(name="model.embed_tokens"),
            "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
            "blocks": BlockBridge(
                name="model.layers",
                # Only submodules present on every layer type are listed;
                # self_attn / linear_attn are deliberately left unmapped.
                submodules={
                    "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
                    "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
                    # NOTE(review): this assumes a dense gated MLP exposing
                    # gate_proj / up_proj / down_proj on every layer. A layer
                    # whose `mlp` is a Qwen3NextSparseMoeBlock has no
                    # `gate_proj` and will fail to load with this mapping --
                    # confirm against the checkpoints you intend to support.
                    "mlp": GatedMLPBridge(
                        name="mlp",
                        config=self.cfg,
                        submodules={
                            "gate": LinearBridge(name="gate_proj"),
                            "in": LinearBridge(name="up_proj"),
                            "out": LinearBridge(name="down_proj"),
                        },
                    ),
                },
            ),
            "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
            "unembed": UnembeddingBridge(name="lm_head"),
        }

    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
        """No-op for hybrid models.

        Hybrid models don't map attention as a block submodule (self_attn is
        absent on linear-attention layers), so there are no rotary embedding
        references to set up.

        Note: to find which layers are full_attention at runtime, use:
            layer_types = getattr(hf_model.config, "layer_types", [])
            first_full_attn_idx = next(
                i for i, t in enumerate(layer_types) if t == "full_attention"
            )
        Do NOT use hf_model.config.full_attention_interval -- it is not stored
        on the config object (consumed during __init__ to build layer_types).
        """

    def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        """Slice the query half out of every ``*.self_attn.q_proj.weight`` entry.

        In Qwen3Next, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size).
        Rows are organized as per-head interleaved:
          head_0_query (d_head rows), head_0_gate (d_head rows),
          head_1_query (d_head rows), head_1_gate (d_head rows), ...

        A naive first-half slice would be wrong. We must reshape by head, then
        take the first d_head rows of each head (the query half).

        Entries that already have ``n_heads * d_head`` rows are treated as
        query-only and left untouched, so calling this method twice is safe.

        Note: since self_attn is NOT currently mapped as a bridge submodule,
        these weights will not be loaded by the bridge. This method is included
        for correctness and forward-compatibility.

        Args:
            state_dict: Mapping of parameter names to tensors. Matching entries
                are replaced in place.

        Returns:
            The same ``state_dict`` object, with matching entries sliced.

        Raises:
            RuntimeError: Raised by ``torch`` when a matching weight cannot be
                reshaped to (n_heads, 2 * d_head, hidden_size).
        """
        n_heads = self.cfg.n_heads
        d_head = self.cfg.d_head
        query_rows = n_heads * d_head
        # Snapshot the matching keys before reassigning values in the dict.
        keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")]
        for key in keys_to_update:
            w = state_dict[key]  # expected shape: (n_heads * d_head * 2, hidden_size)
            if w.shape[0] == query_rows:
                # Already query-only. Reshaping with an inferred last dim would
                # silently reinterpret the data instead of failing.
                continue
            hidden_size = w.shape[-1]
            # reshape (not view) so non-contiguous tensors are handled; the
            # explicit hidden_size makes any shape mismatch raise loudly.
            per_head = w.reshape(n_heads, d_head * 2, hidden_size)
            # Take only the first d_head rows of each head (query half)
            state_dict[key] = per_head[:, :d_head, :].reshape(query_rows, hidden_size)
        return state_dict
+ "status": 2, + "verified_date": "2026-04-09", + "metadata": { + "downloads": 664116, + "likes": 0, + "last_modified": null, + "tags": [ + "transformers", + "safetensors", + "qwen3_next", + "text-generation" + ], + "parameter_count": 79674391296 + }, + "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, { "architecture_id": "Qwen3ForCausalLM", "model_id": "Qwen/Qwen3-0.6B", From 5060a0ce78aea34112812ec2d898775ffd5e7e29 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 01:00:36 -0500 Subject: [PATCH 2/3] Qwen 3.5 architecture adapter complete --- tests/unit/test_qwen3_next_adapter.py | 55 ++++--- .../supported_architectures/qwen3_next.py | 55 ++++--- .../model_registry/data/supported_models.json | 136 +++++++++++++++++- .../data/verification_history.json | 102 ++++++++++++- .../tools/model_registry/registry_io.py | 16 +++ 5 files changed, 311 insertions(+), 53 deletions(-) diff --git a/tests/unit/test_qwen3_next_adapter.py b/tests/unit/test_qwen3_next_adapter.py index 051a8310b..509e96378 100644 --- a/tests/unit/test_qwen3_next_adapter.py +++ b/tests/unit/test_qwen3_next_adapter.py @@ -422,34 +422,26 @@ def test_mlp_path(self, adapter): # ---- MLP submodules ---- - def test_mlp_submodules_keys(self, adapter): - """mlp submodules must be gate, in, out.""" - mlp = adapter.component_mapping["blocks"].submodules["mlp"] - assert set(mlp.submodules.keys()) == {"gate", "in", "out"} - - def test_mlp_gate_path(self, adapter): - """mlp.gate maps to gate_proj.""" - mlp = adapter.component_mapping["blocks"].submodules["mlp"] - assert mlp.submodules["gate"].name == "gate_proj" - - def test_mlp_in_path(self, adapter): - """mlp.in maps to up_proj.""" - mlp = adapter.component_mapping["blocks"].submodules["mlp"] - assert mlp.submodules["in"].name == "up_proj" - - def test_mlp_out_path(self, adapter): - """mlp.out maps to 
down_proj.""" + def test_mlp_has_no_submodules(self, adapter): + """mlp is a MoEBridge with no enumerated submodules. + + Real Qwen3Next checkpoints use Qwen3NextSparseMoeBlock whose router + (`gate`) is a Qwen3NextTopKRouter rather than nn.Linear, and whose + experts are batched as 3D tensors inside Qwen3NextExperts. MoEBridge + wraps the whole block and delegates to HF's native forward, so no + internal submodules are mapped here. + """ mlp = adapter.component_mapping["blocks"].submodules["mlp"] - assert mlp.submodules["out"].name == "down_proj" + assert mlp.submodules == {} # ---- Bridge types ---- def test_mlp_bridge_type(self, adapter): - """mlp uses GatedMLPBridge.""" - from transformer_lens.model_bridge.generalized_components import GatedMLPBridge + """mlp uses MoEBridge (sparse MoE on every real checkpoint).""" + from transformer_lens.model_bridge.generalized_components import MoEBridge mlp = adapter.component_mapping["blocks"].submodules["mlp"] - assert isinstance(mlp, GatedMLPBridge) + assert isinstance(mlp, MoEBridge) def test_ln1_bridge_type(self, adapter): """ln1 uses RMSNormalizationBridge.""" @@ -674,14 +666,15 @@ def test_weight_processing_conversions_is_empty_dict(self, adapter): def _make_tiny_hf_model(): """Create a tiny Qwen3Next model for integration testing. - Uses num_experts=0 to force dense (non-MoE) MLP across all layers. - The adapter only maps universally-present submodules (norms + MLP), so - this works regardless of the layer type (linear_attention or full_attention). + Uses num_experts=4 (sparse MoE) to exercise the real production code path. + Every real Qwen3Next checkpoint has mlp_only_layers=[] and + decoder_sparse_step=1, so every decoder layer uses Qwen3NextSparseMoeBlock. + Test fixtures must mirror this or the adapter's MoE wiring goes untested. 
Config details: - 8 layers: layers 3 and 7 are full-attention (full_attention_interval=4) - All other layers are linear_attention - - dense MLP on all layers (num_experts=0) + - sparse MoE MLP on all layers (num_experts=4, num_experts_per_tok=2) """ cfg = Qwen3NextConfig( hidden_size=128, @@ -699,7 +692,12 @@ def _make_tiny_hf_model(): linear_value_head_dim=32, linear_num_key_heads=4, linear_num_value_heads=4, - num_experts=0, + num_experts=4, + num_experts_per_tok=2, + moe_intermediate_size=64, + shared_expert_intermediate_size=64, + decoder_sparse_step=1, + mlp_only_layers=[], rope_parameters={ "rope_theta": 10000.0, "partial_rotary_factor": 0.25, @@ -744,8 +742,9 @@ def _make_tiny_bridge(): class TestQwen3NextIntegration: """End-to-end integration tests using a tiny programmatic Qwen3Next model. - Tests use num_experts=0 (dense MLP) to avoid the MoE layer structure, which - requires flash-linear-attention and causal-conv1d libraries not needed here. + Tests use num_experts=4 (sparse MoE) to exercise the real production code + path. The linear attention layers run via the torch fallback path when + flash-linear-attention / causal-conv1d are not installed. """ @pytest.fixture(scope="class") diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py index 97ecb2bc8..53e18dce1 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py @@ -1,19 +1,25 @@ """Qwen3Next architecture adapter. -Qwen3NextForCausalLM is a hybrid linear-attention + full-attention architecture. -Layers alternate between GatedDeltaNet (linear attention) and standard full -attention blocks, with a shared MLP on every layer. +Qwen3NextForCausalLM is a hybrid linear-attention + full-attention architecture +with a sparse Mixture-of-Experts MLP on every layer. 
Layers alternate between +GatedDeltaNet (linear attention) and standard full attention blocks, while the +MLP is always a Qwen3NextSparseMoeBlock (gate router + batched experts + +shared expert). Since self_attn is absent on linear-attention layers, we only map submodules that exist on ALL layers (norms, MLP). The HF native forward handles -linear/full attention dispatch internally. +linear/full attention dispatch internally, and MoEBridge delegates the entire +MoE forward (including router, experts, and shared expert) to the native +implementation. Hook coverage: - Block-level: hook_resid_pre, hook_resid_post on every layer - Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm) -- MLP: gate, in, out hooks +- MLP: hook_in, hook_out on the MoE block (MoEBridge) - Attention internals are NOT individually hooked (self_attn absent on linear-attention layers; mapping it would crash on those layers) +- Expert-level internals are NOT individually hooked (batched expert params + live inside Qwen3NextExperts; MoEBridge delegates to HF forward) Optional parameters: - n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads) @@ -27,8 +33,7 @@ from transformer_lens.model_bridge.generalized_components import ( BlockBridge, EmbeddingBridge, - GatedMLPBridge, - LinearBridge, + MoEBridge, RMSNormalizationBridge, RotaryEmbeddingBridge, UnembeddingBridge, @@ -39,12 +44,16 @@ class Qwen3NextArchitectureAdapter(ArchitectureAdapter): """Architecture adapter for Qwen3Next models. 
Qwen3NextForCausalLM is a hybrid linear-attention + full-attention - architecture sharing the same design as Qwen3.5: + architecture with sparse MoE MLPs, sharing the same design as Qwen3.5: - Uses RMSNorm for all normalizations - Uses rotary position embeddings (RoPE) with partial rotation - Every 4th layer is a full-attention layer (self_attn); the rest are GatedDeltaNet linear-attention layers (linear_attn) - - Uses gated MLP (gate_proj + up_proj -> down_proj) on ALL layers + - Uses Qwen3NextSparseMoeBlock on ALL layers (decoder_sparse_step=1 and + mlp_only_layers=[] on every real checkpoint). The MoE block contains a + top-K router, batched Qwen3NextExperts (experts.gate_up_proj / + experts.down_proj as 3D tensors), plus a shared_expert (gated MLP) and + shared_expert_gate. Each expert is internally a gated MLP. - No biases on any linear layers - Full-attention layers have Q/K normalization (q_norm, k_norm) - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved @@ -52,7 +61,9 @@ class Qwen3NextArchitectureAdapter(ArchitectureAdapter): Since self_attn is absent on linear-attention layers, only universally present submodules (norms, MLP) are mapped as block submodules. The HF - native forward handles per-layer dispatch internally. + native forward handles per-layer attention dispatch internally, and + MoEBridge delegates the MoE forward pass (including router + experts + + shared expert) to the native Qwen3NextSparseMoeBlock implementation. Optional parameters: - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA) @@ -71,6 +82,15 @@ def __init__(self, cfg: Any) -> None: self.cfg.uses_rms_norm = True self.cfg.default_prepend_bos = False + # Disable fold_ln: ln1 is followed by self_attn on full-attention + # layers and by linear_attn (GatedDeltaNet) on linear-attention layers, + # but neither is mapped as a bridge submodule (see class docstring for + # why). 
With no bridge-mapped target to fold into, the standard fold_ln + # pass leaves LN weights in an inconsistent state and the processed + # bridge output diverges from the unprocessed / HF output. Skipping + # fold_ln keeps processed-mode forward passes numerically equivalent. + self.supports_fold_ln = False + # Use eager attention to support output_attentions for hook_attn_scores # and hook_pattern. SDPA doesn't support output_attentions. self.cfg.attn_implementation = "eager" @@ -88,15 +108,12 @@ def __init__(self, cfg: Any) -> None: submodules={ "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), - "mlp": GatedMLPBridge( - name="mlp", - config=self.cfg, - submodules={ - "gate": LinearBridge(name="gate_proj"), - "in": LinearBridge(name="up_proj"), - "out": LinearBridge(name="down_proj"), - }, - ), + # Qwen3NextSparseMoeBlock has a custom Qwen3NextTopKRouter + # (not an nn.Linear) as `gate`, plus batched experts and a + # shared expert. MoEBridge wraps the whole MoE module and + # delegates to HF's native forward, so we don't enumerate + # the internal structure here. 
+ "mlp": MoEBridge(name="mlp", config=self.cfg), }, ), "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index bd9aff0fa..f350694fb 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -1,20 +1,146 @@ { - "generated_at": "2026-04-09", + "generated_at": "2026-04-10", "scan_info": { "total_scanned": 10000, "task_filter": "text-generation", "min_downloads": 500, - "scan_duration_seconds": 3.2 + "scan_duration_seconds": 0.0 }, "total_architectures": 37, - "total_models": 5554, - "total_verified": 690, + "total_models": 5563, + "total_verified": 693, "models": [ + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct", + "status": 2, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "unsloth/Qwen3-Coder-Next", + "status": 2, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "Qwen/Qwen3-Next-80B-A3B-Thinking", + "status": 2, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": 
"tiny-random/qwen3-next-moe", + "status": 1, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 75.7, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", + "status": 1, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 55.9, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "yujiepan/qwen3-next-moe-tiny-random", + "status": 1, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 75.7, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "huihui-ai/Huihui-Qwen3-Coder-Next-abliterated", + "status": 2, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "Qwen/Qwen3-Coder-Next-Base", + "status": 2, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "bknyaz/Qwen3-Coder-Next-REAM", + "status": 2, + "verified_date": "2026-04-10", + "metadata": null, + "note": "Estimated 5201.5 GB exceeds 96.0 GB limit", + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, { "architecture_id": "Qwen3NextForCausalLM", "model_id": "Qwen/Qwen3-Coder-Next", "status": 2, - "verified_date": "2026-04-09", + "verified_date": "2026-04-10", "metadata": { "downloads": 664116, "likes": 0, diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 3eae3fae0..59a2ec635 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-04-09T13:22:45.115556", + "last_updated": "2026-04-10T00:51:34.188066", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -11320,6 +11320,106 @@ "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null + }, + { + "model_id": "tiny-random/qwen3-next-moe", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3NextSparseMoeBlock' object has no attribute 'gate_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3NextSparseMoeBlock' object has no attribute 'gate_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "yujiepan/qwen3-next-moe-tiny-random", + "architecture_id": 
"Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3NextSparseMoeBlock' object has no attribute 'gate_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "tiny-random/qwen3-next-moe", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=83.3% but required tests failed: logits_equivalence, loss_equivalence \u2014 Text quality score: 75.7/100 (avg perplexity: 524.4) \u2014 generated text may be incoherent", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=83.3% but required tests failed: logits_equivalence, loss_equivalence \u2014 Text quality score: 55.9/100 (avg perplexity: 2921.5) \u2014 generated text may be incoherent", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "yujiepan/qwen3-next-moe-tiny-random", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=83.3% but required tests failed: logits_equivalence, loss_equivalence \u2014 Text quality score: 75.7/100 (avg perplexity: 524.4) \u2014 generated text may be incoherent", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "yujiepan/qwen3-next-moe-tiny-random", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + 
"notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "tiny-random/qwen3-next-moe", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "yujiepan/qwen3-next-moe-tiny-random", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null } ] } diff --git a/transformer_lens/tools/model_registry/registry_io.py b/transformer_lens/tools/model_registry/registry_io.py index dddb1360c..95f092e62 100644 --- a/transformer_lens/tools/model_registry/registry_io.py +++ b/transformer_lens/tools/model_registry/registry_io.py @@ -56,6 +56,14 @@ "_fp8", "-FP8", "_FP8", + "-nvfp4", + "_nvfp4", + "-NVFP4", + "_NVFP4", + "-mxfp4", + "_mxfp4", + "-MXFP4", + "_MXFP4", "-int4", "_int4", "-int8", @@ -64,6 +72,14 @@ "-w8a8", "-W4A16", "-W8A8", + ".w4a16", + ".W4A16", + "-3bit", + "_3bit", + "-2bit", + "_2bit", + "-oQ", + "_oQ", "-quantized.", "_Quantized", "-Quantized", From 9a4e588882b17f37fc953112cdccedbc4c79085e Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 12:28:16 -0500 Subject: [PATCH 3/3] Cleaning up tests --- tests/unit/test_qwen3_next_adapter.py | 272 +------------------------- 1 file changed, 8 insertions(+), 264 deletions(-) diff --git a/tests/unit/test_qwen3_next_adapter.py 
b/tests/unit/test_qwen3_next_adapter.py index 509e96378..1a2842e7b 100644 --- a/tests/unit/test_qwen3_next_adapter.py +++ b/tests/unit/test_qwen3_next_adapter.py @@ -2,15 +2,16 @@ Tests cover: 1. Registration: adapter importable, in SUPPORTED_ARCHITECTURES, in HF_SUPPORTED_ARCHITECTURES -2. Config extraction: convert_hf_model_config produces correct config for Qwen3NextForCausalLM -3. _get_partial_rotary_factor helper: reads from rope_parameters dict only (not top-level) -4. Component mapping: correct bridge hierarchy with only universal submodules (no self_attn) -5. Weight conversions: preprocess_weights correctly slices q_proj.weight per-head -6. Integration: end-to-end tests with a tiny programmatically-constructed model +2. Component mapping: correct bridge hierarchy with only universal submodules (no self_attn) +3. Weight conversions: preprocess_weights correctly slices q_proj.weight per-head +4. Integration: end-to-end tests with a tiny programmatically-constructed model + +Note: Qwen3Next is supported only via TransformerBridge, not HookedTransformer. +No tests exercise convert_hf_model_config here — the TransformerBridge path +reads the HF config directly via the adapter and does not go through +transformer_lens.loading_from_pretrained. 
""" -from unittest import mock - import pytest from transformer_lens.factories.architecture_adapter_factory import ( @@ -51,263 +52,6 @@ def test_adapter_class_correct(self): assert SUPPORTED_ARCHITECTURES["Qwen3NextForCausalLM"] is Qwen3NextArchitectureAdapter -# ============================================================================ -# Helpers: mock HF config -# ============================================================================ - - -def _make_hf_config( - *, - hidden_size: int = 2048, - num_attention_heads: int = 8, - num_key_value_heads: int = 2, - head_dim: int = 256, - intermediate_size: int = 6144, - num_hidden_layers: int = 24, - vocab_size: int = 248320, - rms_norm_eps: float = 1e-6, - hidden_act: str = "silu", - tie_word_embeddings: bool = False, - rope_parameters: dict | None = None, -) -> mock.Mock: - """Create a minimal mock HuggingFace config for Qwen3NextForCausalLM. - - Uses spec=[] so only explicitly assigned attributes exist. This prevents - mock.Mock() from auto-creating attributes (like rope_theta) that would - interfere with beartype-validated helpers like _get_rope_theta(). 
- """ - if rope_parameters is None: - rope_parameters = { - "rope_theta": 10000.0, - "partial_rotary_factor": 0.25, - "rope_type": "default", - } - cfg = mock.Mock(spec=[]) - cfg.architectures = ["Qwen3NextForCausalLM"] - cfg.hidden_size = hidden_size - cfg.num_attention_heads = num_attention_heads - cfg.num_key_value_heads = num_key_value_heads - cfg.head_dim = head_dim - cfg.intermediate_size = intermediate_size - cfg.num_hidden_layers = num_hidden_layers - cfg.vocab_size = vocab_size - cfg.rms_norm_eps = rms_norm_eps - cfg.hidden_act = hidden_act - cfg.tie_word_embeddings = tie_word_embeddings - cfg.rope_parameters = rope_parameters - return cfg - - -# ============================================================================ -# Test: Config extraction -# ============================================================================ - - -class TestQwen3NextConfigExtraction: - """Verify convert_hf_model_config extracts all fields correctly.""" - - def _extract_config(self, hf_config: mock.Mock) -> dict: - """Run convert_hf_model_config with a mocked AutoConfig and model name lookup.""" - from transformer_lens.loading_from_pretrained import convert_hf_model_config - - model_name = "Qwen/Qwen3-Next-80B-A3B" - with mock.patch( - "transformer_lens.loading_from_pretrained.AutoConfig.from_pretrained", - return_value=hf_config, - ), mock.patch( - "transformer_lens.loading_from_pretrained.get_official_model_name", - return_value=model_name, - ): - return convert_hf_model_config(model_name) - - def test_basic_dimensions(self): - """d_model, n_heads, n_layers, d_mlp, d_vocab extracted correctly.""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["d_model"] == 2048 - assert cfg["n_heads"] == 8 - assert cfg["n_layers"] == 24 - assert cfg["d_mlp"] == 6144 - assert cfg["d_vocab"] == 248320 - - def test_head_dim(self): - """d_head reads from hf_config.head_dim directly.""" - hf_config = _make_hf_config(head_dim=256) - cfg = 
self._extract_config(hf_config) - - assert cfg["d_head"] == 256 - - def test_n_key_value_heads_gqa(self): - """n_key_value_heads is set when num_key_value_heads != num_attention_heads (GQA).""" - hf_config = _make_hf_config(num_attention_heads=8, num_key_value_heads=2) - cfg = self._extract_config(hf_config) - - assert cfg["n_key_value_heads"] == 2 - - def test_n_key_value_heads_mha(self): - """n_key_value_heads is None when num_key_value_heads == num_attention_heads (MHA).""" - hf_config = _make_hf_config(num_attention_heads=8, num_key_value_heads=8) - cfg = self._extract_config(hf_config) - - assert cfg["n_key_value_heads"] is None - - def test_n_ctx_is_2048(self): - """n_ctx is hardcoded to 2048 (safe cap for 262144 max).""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["n_ctx"] == 2048 - - def test_eps(self): - """eps reads from rms_norm_eps.""" - hf_config = _make_hf_config(rms_norm_eps=1e-6) - cfg = self._extract_config(hf_config) - - assert cfg["eps"] == 1e-6 - - def test_rotary_base_from_rope_parameters(self): - """rotary_base reads rope_theta from rope_parameters dict.""" - hf_config = _make_hf_config( - rope_parameters={ - "rope_theta": 10000000.0, - "partial_rotary_factor": 0.25, - "rope_type": "default", - } - ) - cfg = self._extract_config(hf_config) - - assert cfg["rotary_base"] == 10000000 - - def test_rotary_dim_partial_factor_0_25(self): - """rotary_dim = int(head_dim * partial_rotary_factor). - - With partial_rotary_factor=0.25 and head_dim=256, expect rotary_dim=64. 
- """ - hf_config = _make_hf_config( - head_dim=256, - rope_parameters={ - "rope_theta": 10000.0, - "partial_rotary_factor": 0.25, - "rope_type": "default", - }, - ) - cfg = self._extract_config(hf_config) - - assert cfg["rotary_dim"] == 64 - - def test_rotary_adjacent_pairs_false(self): - """rotary_adjacent_pairs must be False.""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["rotary_adjacent_pairs"] is False - - def test_flags(self): - """final_rms, gated_mlp, use_qk_norm, use_attn_scale all True; default_prepend_bos False.""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["final_rms"] is True - assert cfg["gated_mlp"] is True - assert cfg["use_qk_norm"] is True - assert cfg["use_attn_scale"] is True - assert cfg["default_prepend_bos"] is False - - def test_tie_word_embeddings(self): - """tie_word_embeddings reads from hf_config.""" - hf_config = _make_hf_config(tie_word_embeddings=False) - cfg = self._extract_config(hf_config) - - assert cfg["tie_word_embeddings"] is False - - def test_trust_remote_code(self): - """trust_remote_code must be True.""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["trust_remote_code"] is True - - def test_normalization_type_rms(self): - """normalization_type is 'RMS'.""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["normalization_type"] == "RMS" - - def test_positional_embedding_type_rotary(self): - """positional_embedding_type is 'rotary'.""" - hf_config = _make_hf_config() - cfg = self._extract_config(hf_config) - - assert cfg["positional_embedding_type"] == "rotary" - - -# ============================================================================ -# Test: _get_partial_rotary_factor helper -# ============================================================================ - - -class TestGetPartialRotaryFactor: - """Verify _get_partial_rotary_factor reads from 
rope_parameters dict only.""" - - def test_reads_from_rope_parameters(self): - """partial_rotary_factor is read from rope_parameters dict.""" - from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor - - cfg = mock.Mock() - cfg.rope_parameters = {"partial_rotary_factor": 0.25} - # Top-level attribute should NOT be consulted - cfg.partial_rotary_factor = 0.99 # wrong value — must not be used - - result = _get_partial_rotary_factor(cfg) - assert result == 0.25 - - def test_fallback_when_rope_parameters_missing(self): - """Returns 1.0 when rope_parameters is absent.""" - from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor - - cfg = mock.Mock(spec=[]) # no attributes at all - - result = _get_partial_rotary_factor(cfg) - assert result == 1.0 - - def test_fallback_when_partial_rotary_factor_not_in_dict(self): - """Returns 1.0 when rope_parameters exists but lacks partial_rotary_factor. - - This is the key correctness test: a config that has partial_rotary_factor - as a top-level attribute but NOT in rope_parameters must return 1.0 (the - fallback), not 0.5. This verifies we only read from the dict. 
- """ - from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor - - cfg = mock.Mock() - cfg.rope_parameters = {} # no partial_rotary_factor key - cfg.partial_rotary_factor = 0.5 # top-level only — must NOT be used - - result = _get_partial_rotary_factor(cfg) - assert result == 1.0 - - def test_custom_default(self): - """Custom default is returned when rope_parameters is absent.""" - from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor - - cfg = mock.Mock(spec=[]) - - result = _get_partial_rotary_factor(cfg, default=0.5) - assert result == 0.5 - - def test_non_dict_rope_parameters_uses_default(self): - """Returns default when rope_parameters is not a dict.""" - from transformer_lens.loading_from_pretrained import _get_partial_rotary_factor - - cfg = mock.Mock() - cfg.rope_parameters = "not_a_dict" - - result = _get_partial_rotary_factor(cfg) - assert result == 1.0 - - # ============================================================================ # Helpers: TransformerBridgeConfig for adapter instantiation # ============================================================================