From 4cda56d69af6f226f091480584eccaa4e851e638 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 17 Nov 2025 20:43:48 +0000 Subject: [PATCH 1/4] fix mistral mlp conversion --- fast_llm/models/gpt/conversion/mistral.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fast_llm/models/gpt/conversion/mistral.py b/fast_llm/models/gpt/conversion/mistral.py index b5db3fa06..28941bc8a 100644 --- a/fast_llm/models/gpt/conversion/mistral.py +++ b/fast_llm/models/gpt/conversion/mistral.py @@ -2,6 +2,7 @@ from fast_llm.engine.checkpoint.config import CheckpointFormat from fast_llm.layers.attention.config import AttentionConfig +from fast_llm.layers.decoder.mlp.config import MLPConfig from fast_llm.models.gpt.conversion.config import MistralCheckpointFormat from fast_llm.models.gpt.conversion.llama import ( LlamaAttentionConverter, @@ -10,6 +11,7 @@ LlamaDecoderConverter, LlamaHeadConverter, LlamaHuggingfaceCheckpointHandler, + LlamaMLPConverter, ) from fast_llm.utils import safe_merge_dicts @@ -38,8 +40,26 @@ def _check_config(cls, config: AttentionConfig) -> None: assert not config.add_linear_biases +class MistralMLPConverter(LlamaMLPConverter): + @classmethod + def import_config(cls, config: dict) -> dict: + config["mlp_bias"] = False + return super().import_config(config) + + @classmethod + def export_config(cls, config: MLPConfig) -> dict: + out = super().export_config(config) + del out["mlp_bias"] + return out + + @classmethod + def _check_config(cls, config: MLPConfig) -> None: + assert not config.add_linear_biases + + class MistralBlockConverter(LlamaBlockConverter): mixer_converter_class: typing.ClassVar[type[MistralAttentionConverter]] = MistralAttentionConverter + mlp_converter_class: typing.ClassVar[type[MistralMLPConverter]] = MistralMLPConverter class MistralDecoderConverter(LlamaDecoderConverter): From 41692e9fa2c8890231730abed464cad6e171e3bf Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 17 Nov 2025 20:49:05 +0000 Subject: 
[PATCH 2/4] remove duplicate from apriel conversion --- fast_llm/models/gpt/conversion/apriel.py | 25 +----------------------- 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/fast_llm/models/gpt/conversion/apriel.py b/fast_llm/models/gpt/conversion/apriel.py index 7550df044..ffd2522ce 100644 --- a/fast_llm/models/gpt/conversion/apriel.py +++ b/fast_llm/models/gpt/conversion/apriel.py @@ -8,18 +8,12 @@ from fast_llm.layers.attention.config import AttentionConfig from fast_llm.layers.block.config import BlockSequenceConfig, FixedBlockSequenceConfig, PatternBlockSequenceConfig from fast_llm.layers.decoder.config import DecoderBlockConfig -from fast_llm.layers.decoder.mlp.config import MLPConfig from fast_llm.layers.ssm.config import DiscreteMamba2Config, Mamba2Config from fast_llm.models.gpt.config import GPTModelConfig from fast_llm.models.gpt.conversion.config import AprielHybridSSMCheckpointFormat -from fast_llm.models.gpt.conversion.llama import ( - LlamaMLPConverter, - get_parameter_converter, - get_weight_and_bias_converters, -) +from fast_llm.models.gpt.conversion.llama import get_parameter_converter, get_weight_and_bias_converters from fast_llm.models.gpt.conversion.mistral import ( MistralBaseModelConverter, - MistralBlockConverter, MistralDecoderConverter, MistralHeadConverter, MistralHuggingfaceCheckpointHandler, @@ -229,23 +223,6 @@ def get_converters( ] -class AprielMLPConverter(LlamaMLPConverter): - @classmethod - def import_config(cls, config: dict) -> dict: - config["mlp_bias"] = False - return super().import_config(config) - - @classmethod - def export_config(cls, config: MLPConfig) -> dict: - out = super().export_config(config) - del out["mlp_bias"] - return out - - -class AprielBlockConverterBase(MistralBlockConverter): - mlp_converter_class: typing.ClassVar[type[AprielMLPConverter]] = AprielMLPConverter - - class AprielDiscreteMamba2BlockConverter(AprielBlockConverterBase): mixer_converter_class: 
typing.ClassVar[type[AprielDiscreteMamba2Converter]] = AprielDiscreteMamba2Converter hf_mixer_name: typing.ClassVar[str] = "mixer" From 99c42c06d8af13df9bfeaece1e9425de4f9932fb Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 17 Nov 2025 20:56:35 +0000 Subject: [PATCH 3/4] fix --- fast_llm/models/gpt/conversion/apriel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fast_llm/models/gpt/conversion/apriel.py b/fast_llm/models/gpt/conversion/apriel.py index ffd2522ce..e16eac4de 100644 --- a/fast_llm/models/gpt/conversion/apriel.py +++ b/fast_llm/models/gpt/conversion/apriel.py @@ -14,6 +14,7 @@ from fast_llm.models.gpt.conversion.llama import get_parameter_converter, get_weight_and_bias_converters from fast_llm.models.gpt.conversion.mistral import ( MistralBaseModelConverter, + MistralBlockConverter, MistralDecoderConverter, MistralHeadConverter, MistralHuggingfaceCheckpointHandler, @@ -223,12 +224,12 @@ def get_converters( ] -class AprielDiscreteMamba2BlockConverter(AprielBlockConverterBase): +class AprielDiscreteMamba2BlockConverter(MistralBlockConverter): mixer_converter_class: typing.ClassVar[type[AprielDiscreteMamba2Converter]] = AprielDiscreteMamba2Converter hf_mixer_name: typing.ClassVar[str] = "mixer" -class AprielMamba2BlockConverter(AprielBlockConverterBase): +class AprielMamba2BlockConverter(MistralBlockConverter): mixer_converter_class: typing.ClassVar[type[AprielMamba2Converter]] = AprielMamba2Converter hf_mixer_name: typing.ClassVar[str] = "mixer" @@ -240,7 +241,7 @@ class AprielBlockConverter: DiscreteMamba2Config: "m2d", } _converter_classes = { - AttentionConfig: AprielBlockConverterBase, + AttentionConfig: MistralBlockConverter, Mamba2Config: AprielMamba2BlockConverter, DiscreteMamba2Config: AprielDiscreteMamba2BlockConverter, } From d3df7a567e5a8a02400d79bc4aee735ff10a0942 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 17 Nov 2025 21:12:37 +0000 Subject: [PATCH 4/4] move assert --- 
fast_llm/models/gpt/conversion/mistral.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fast_llm/models/gpt/conversion/mistral.py b/fast_llm/models/gpt/conversion/mistral.py index 28941bc8a..a9a0909ec 100644 --- a/fast_llm/models/gpt/conversion/mistral.py +++ b/fast_llm/models/gpt/conversion/mistral.py @@ -48,14 +48,11 @@ def import_config(cls, config: dict) -> dict: @classmethod def export_config(cls, config: MLPConfig) -> dict: + assert not config.add_linear_biases out = super().export_config(config) del out["mlp_bias"] return out - @classmethod - def _check_config(cls, config: MLPConfig) -> None: - assert not config.add_linear_biases - class MistralBlockConverter(LlamaBlockConverter): mixer_converter_class: typing.ClassVar[type[MistralAttentionConverter]] = MistralAttentionConverter