Merged
114 commits
82eed2b
TP mamba
jlamypoirier Jul 21, 2025
4e310c7
TP mamba
jlamypoirier Jul 22, 2025
3cc4118
fix
jlamypoirier Jul 22, 2025
9f7f75c
fix
jlamypoirier Jul 22, 2025
4054e04
fixes
jlamypoirier Jul 23, 2025
0014cc6
fix
jlamypoirier Jul 23, 2025
47ad548
fixes
jlamypoirier Jul 23, 2025
6a074fa
fixes
jlamypoirier Jul 23, 2025
d66651f
Update external
jlamypoirier Jul 23, 2025
50083ba
SSM debugging
jlamypoirier Jul 24, 2025
5006328
Merge branch 'main' into tp_mamba
jlamypoirier Jul 24, 2025
13176bd
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
7b32699
stuff
jlamypoirier Jul 24, 2025
73f591f
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
1feccc8
stuff
jlamypoirier Jul 24, 2025
e528b50
misc
jlamypoirier Jul 24, 2025
b49c42f
misc
jlamypoirier Jul 24, 2025
bb4dcd9
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
c1b7f44
misc
jlamypoirier Jul 24, 2025
31f5d41
misc
jlamypoirier Jul 24, 2025
051bb07
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
0a9ff25
misc
jlamypoirier Jul 24, 2025
e7d9636
Parallel discrete mamba 2
jlamypoirier Jul 24, 2025
c14b764
Mamba 2, misc
jlamypoirier Jul 25, 2025
b605bd2
doc
jlamypoirier Jul 25, 2025
5eea938
fix
jlamypoirier Jul 28, 2025
0a3e2a7
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 28, 2025
2e6d082
fixes
jlamypoirier Jul 28, 2025
b6c8613
misc
jlamypoirier Jul 28, 2025
f0c04cf
Merge remote-tracking branch 'origin/main' into debug_mamba
jlamypoirier Jul 28, 2025
acdfab1
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 28, 2025
e536af9
Concatenated dim
jlamypoirier Jul 28, 2025
017f5cc
fixes
jlamypoirier Jul 28, 2025
93e4c94
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Jul 28, 2025
c41efc2
doc
jlamypoirier Jul 28, 2025
0b8bd5d
cleanup
jlamypoirier Jul 28, 2025
02f8af5
Block interface
jlamypoirier Jul 29, 2025
6bf06d6
fix
jlamypoirier Jul 29, 2025
2ddc3a7
fix
jlamypoirier Jul 29, 2025
c0f1597
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Jul 29, 2025
b2f4476
Merge branch 'tp_mamba' into block_interface
jlamypoirier Jul 29, 2025
ce70b16
fixes
jlamypoirier Jul 29, 2025
a9f733d
fix
jlamypoirier Jul 29, 2025
cef7c15
fix
jlamypoirier Jul 30, 2025
a5eb076
stuff
jlamypoirier Jul 31, 2025
ab484ac
Revert "stuff"
jlamypoirier Jul 31, 2025
b68d360
stuff
jlamypoirier Jul 31, 2025
82c9dbd
misc
jlamypoirier Jul 31, 2025
9fbb9ff
misc
jlamypoirier Jul 31, 2025
44df195
misc
jlamypoirier Jul 31, 2025
3bb03cb
misc
jlamypoirier Jul 31, 2025
98bae95
misc
jlamypoirier Jul 31, 2025
fd731ef
fixes
jlamypoirier Aug 1, 2025
f483321
fixes
jlamypoirier Aug 1, 2025
5a0eabc
Merge remote-tracking branch 'origin/main' into debug_mamba
jlamypoirier Aug 8, 2025
dd288df
Merge branch 'debug_mamba' into concatenated_dim
jlamypoirier Aug 8, 2025
defd6e0
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Aug 8, 2025
8abf258
fixes
jlamypoirier Aug 8, 2025
c16c00f
Merge branch 'tp_mamba' into block_interface
jlamypoirier Aug 8, 2025
07c9211
stuff
jlamypoirier Aug 8, 2025
be99372
Merge branch 'main' into debug_mamba
jlamypoirier Aug 12, 2025
a505f3a
Merge branch 'debug_mamba' into concatenated_dim
jlamypoirier Aug 12, 2025
0cc859a
Merge remote-tracking branch 'origin/main' into concatenated_dim
jlamypoirier Aug 12, 2025
bd4ff0d
doc
jlamypoirier Aug 12, 2025
fd3307d
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Aug 12, 2025
0e2e124
stuff
jlamypoirier Aug 12, 2025
0a5e458
Remove tensor space, fixes
jlamypoirier Aug 14, 2025
797bd73
stuff
jlamypoirier Aug 14, 2025
c0a3782
stuff
jlamypoirier Aug 15, 2025
e60ded4
stuff
jlamypoirier Aug 15, 2025
1483bcc
stuff
jlamypoirier Aug 15, 2025
4deb501
misc
jlamypoirier Aug 15, 2025
fc809e0
Misc, tests pass
jlamypoirier Aug 15, 2025
cdb6710
misc
jlamypoirier Aug 20, 2025
9ce72e0
Move files
jlamypoirier Aug 20, 2025
065b34f
misc
jlamypoirier Aug 20, 2025
4510b7b
misc
jlamypoirier Aug 20, 2025
9a2a7a2
Pr comments
jlamypoirier Aug 21, 2025
8c382a9
Cleanup
jlamypoirier Aug 21, 2025
019e43d
Cleanup
jlamypoirier Aug 21, 2025
3e0f3e5
Cleanup
jlamypoirier Aug 21, 2025
90a3c98
Merge branch 'tp_mamba' into block_interface
jlamypoirier Aug 21, 2025
39960ce
Cleanup
jlamypoirier Aug 21, 2025
1abdd19
fixes
jlamypoirier Aug 21, 2025
7c24292
fixes
jlamypoirier Aug 21, 2025
af2964b
fixes
jlamypoirier Aug 21, 2025
0e62f7d
Merge branch 'tp_mamba' into block_interface
jlamypoirier Aug 21, 2025
654aeeb
Fix merge
jlamypoirier Aug 21, 2025
3f4a8ba
fix
jlamypoirier Aug 27, 2025
9741ba0
stuff
jlamypoirier Aug 27, 2025
be69677
fixes
jlamypoirier Aug 27, 2025
82a70aa
Simplify bias options
jlamypoirier Aug 27, 2025
680980a
stuff
jlamypoirier Aug 29, 2025
3ef7860
Dynamic mlp and block layer creation
jlamypoirier Aug 29, 2025
ecad96b
stuff
jlamypoirier Sep 3, 2025
3fd092c
fix
jlamypoirier Sep 3, 2025
1a3497c
stuff
jlamypoirier Sep 3, 2025
b6e7fce
stuff
jlamypoirier Sep 4, 2025
4dfe2a4
stuff
jlamypoirier Sep 9, 2025
4185741
misc
jlamypoirier Sep 9, 2025
7763296
stuff
jlamypoirier Sep 17, 2025
8249f8a
fix
jlamypoirier Sep 17, 2025
188587e
Merge branch 'main' into concatenated_dim
jlamypoirier Sep 17, 2025
e111509
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Sep 17, 2025
95e0231
Merge branch 'tp_mamba' into block_interface
jlamypoirier Sep 17, 2025
e076c7a
Merge remote-tracking branch 'origin/main' into block_interface
jlamypoirier Sep 18, 2025
2315ac4
Merge branch 'block_interface' into block_interface_weight
jlamypoirier Sep 18, 2025
79356f7
Merge remote-tracking branch 'origin/main' into block_interface_weight
jlamypoirier Sep 18, 2025
e4198a6
Merge branch 'block_interface_weight' into block_interface_mixer_mlp_…
jlamypoirier Sep 18, 2025
7abf263
Merge branch 'block_interface_mixer_mlp_config' into block_interface_…
jlamypoirier Sep 18, 2025
bfc9f84
Merge branch 'block_interface_fine_grained' into block_interface_tflops
jlamypoirier Sep 18, 2025
e68f96c
Merge branch 'block_interface_tflops' into block_interface_convert
jlamypoirier Sep 18, 2025
870afd3
v0.3.0
jlamypoirier Sep 18, 2025
e977336
Merge remote-tracking branch 'origin/main' into v0.3.0
jlamypoirier Sep 18, 2025
examples/mistral.yaml (2 changes: 1 addition & 1 deletion)
@@ -62,7 +62,7 @@ model:
multi_stage:
zero_stage: 2
distributed:
training_dtype: bf16
compute_dtype: bf16
seed: 984059
run:
experiment_dir: mistral_example
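
The only change in this example config is the rename of `training_dtype` to `compute_dtype` under `distributed`. Below is a minimal migration sketch for dicts loaded from older YAML files; the helper name is hypothetical and not part of Fast-LLM.

```python
# Hypothetical helper (not part of Fast-LLM): rename the old `training_dtype`
# key to `compute_dtype` in a `distributed` config section loaded from YAML.
def migrate_distributed_section(distributed: dict) -> dict:
    if "training_dtype" in distributed and "compute_dtype" not in distributed:
        distributed["compute_dtype"] = distributed.pop("training_dtype")
    return distributed


assert migrate_distributed_section({"training_dtype": "bf16", "seed": 984059}) == {
    "seed": 984059,
    "compute_dtype": "bf16",
}
```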
fast_llm/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.0"
fast_llm/config.py (50 changes: 12 additions & 38 deletions)
@@ -759,58 +759,32 @@ def from_dict(
return cls._from_dict(default, strict)

@classmethod
def from_flat_dict(
cls,
default: dict[str, typing.Any],
strict: bool = True,
) -> typing.Self:
# TODO v0.3: Remove flat format
return cls._from_dict(default, strict, True)

@classmethod
def _from_dict(
cls,
default: dict[str, typing.Any],
strict: bool = True,
flat: bool = False,
) -> typing.Self:
# TODO v0.3: Remove flat format
def _from_dict(cls, default: dict[str, typing.Any], strict: bool = True) -> typing.Self:
out_arg_dict = {"_from_dict_check": True}

# TODO v0.3: Remove backward compatibility fix
if "__class__" in default:
del default["__class__"]

try:
actual_cls = cls.get_subclass(default.get("type"))
except KeyError:
# Try to postpone error to validation.
actual_cls = cls

if actual_cls is not None and actual_cls is not cls:
return actual_cls._from_dict(default, strict=strict, flat=flat)
return actual_cls._from_dict(default, strict=strict)

# Do not validate yet in case the root class sets cross-dependencies in validation.
with NoAutoValidate():
for name, field in cls.fields():
if not field.init or field._field_type != dataclasses._FIELD: # noqa
continue
if flat:
if isinstance(field.type, type) and issubclass(field.type, Config):
out_arg_dict[name] = field.type._from_dict(default, False, True)
elif name in default:
out_arg_dict[name] = default.pop(name)
else:
# Check for nested configs to instantiate.
try:
value = cls._from_dict_nested(default.pop(name, MISSING), field.type, strict)
if value is not MISSING:
out_arg_dict[name] = value
except FieldTypeError as e:
raise FieldTypeError(
f"Invalid field type `{get_type_name(field.type)}` in class {cls._get_class_name()}: "
+ ", ".join(e.args)
)
# Check for nested configs to instantiate.
try:
value = cls._from_dict_nested(default.pop(name, MISSING), field.type, strict)
if value is not MISSING:
out_arg_dict[name] = value
except FieldTypeError as e:
raise FieldTypeError(
f"Invalid field type `{get_type_name(field.type)}` in class {cls._get_class_name()}: "
+ ", ".join(e.args)
)
out = cls(**out_arg_dict) # noqa
if strict and default:
out._unknown_fields = default.copy()
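
With `from_flat_dict` and the `flat` argument removed, configs are built only from nested dicts: each sub-config is read from its own key rather than pulled out of a single flat dict. A standalone sketch of that nested-only recursion, using plain dataclasses instead of Fast-LLM's actual `Config` machinery:

```python
# Standalone sketch of nested-only config loading (illustrative, not Fast-LLM's
# actual implementation): each sub-config is read from its own nested dict
# instead of being pulled out of a single flat dict.
import dataclasses


@dataclasses.dataclass
class DistributedConfig:
    compute_dtype: str = "float32"
    seed: int = 0


@dataclasses.dataclass
class ModelConfig:
    zero_stage: int = 1
    distributed: DistributedConfig = dataclasses.field(default_factory=DistributedConfig)


def from_nested_dict(cls, data: dict):
    kwargs = {}
    for field in dataclasses.fields(cls):
        if field.name not in data:
            continue  # fall back to the field default
        value = data[field.name]
        if dataclasses.is_dataclass(field.type):
            value = from_nested_dict(field.type, value)  # recurse into nested configs
        kwargs[field.name] = value
    return cls(**kwargs)


config = from_nested_dict(ModelConfig, {"zero_stage": 2, "distributed": {"compute_dtype": "bf16"}})
assert config.zero_stage == 2
assert config.distributed.compute_dtype == "bf16"
```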
fast_llm/data/data/gpt/config.py (40 changes: 2 additions & 38 deletions)
@@ -1,23 +1,16 @@
import logging
import typing

from fast_llm.config import Field, FieldHint, FieldUpdate, check_field, config_class
from fast_llm.data.config import MultiprocessingContext, TokenizerConfig
from fast_llm.data.data.config import DataConfig
from fast_llm.data.dataset.gpt.config import (
GPTLegacyConfig,
GPTLegacyDatasetConfig,
GPTSampledDatasetConfig,
GPTSamplingConfig,
)
from fast_llm.engine.distributed.config import PhaseType
from fast_llm.data.dataset.gpt.config import GPTSampledDatasetConfig, GPTSamplingConfig
from fast_llm.utils import Assert

logger = logging.getLogger(__name__)


@config_class()
class GPTDataConfig(DataConfig, GPTLegacyConfig):
class GPTDataConfig(DataConfig):
"""
Configuration for the dataset(s), split and sampling.
Currently hard-coded to a GPT dataset.
@@ -48,32 +41,3 @@ class GPTDataConfig(DataConfig, GPTLegacyConfig):
desc="Multiprocessing context. Do not touch.",
hint=FieldHint.expert,
)

def _validate(self) -> None:
if not self.datasets:
logger.warning(
"Using the legacy dataset definition format." " Specify it through `data.datasets` instead."
)
self.datasets = {
phase.value.lower(): GPTLegacyDatasetConfig.from_dict(self, strict=False)
for phase in (PhaseType.training, PhaseType.validation, PhaseType.test)
}
super()._validate()

@classmethod
def _from_dict(
cls,
default: dict[str, typing.Any],
strict: bool = True,
flat: bool = False,
) -> typing.Self:
# TODO v0.x: Remove backward compatibility.
if "datasets" in default:
for phase in PhaseType:
if phase.value in default["datasets"]:
rename = phase.value.lower()
logger.warning(f"Renaming dataset {phase.value} to {rename}")
assert rename not in default["datasets"]
default["datasets"][rename] = default["datasets"].pop(phase.value)

return super()._from_dict(default, strict, flat)
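
With `GPTLegacyConfig` and the phase-name renaming shim gone, datasets must be declared explicitly under `data.datasets`, keyed by lowercase names such as `training`, `validation` and `test`. A hedged sketch of such a mapping; the paths are placeholders and the inner dataset structure is illustrative:

```python
# Hedged sketch (paths and exact inner structure are illustrative): explicit
# per-phase dataset definitions replacing the removed legacy `split`/`format`/`path` fields.
data_config = {
    "datasets": {
        "training": {"type": "memmap", "path": "/data/train_prefix"},
        "validation": {"type": "memmap", "path": "/data/valid_prefix"},
        "test": {"type": "memmap", "path": "/data/test_prefix"},
    },
}
```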
fast_llm/data/dataset/config.py (19 changes: 2 additions & 17 deletions)
@@ -204,11 +204,6 @@ class BlendedDatasetConfig(SampledDatasetConfig):
desc="The blending weight of each dataset.",
hint=FieldHint.core,
)
legacy: bool = Field(
default=False,
desc="Use the legacy formulas for sub-dataset seeds and sample sizes.",
hint=FieldHint.deprecated,
)

def _validate(self) -> None:
self.weights = normalize_probabilities(self.weights)
@@ -231,20 +226,10 @@ def build_and_sample(
sampling,
parameters=dataclasses.replace(
sampling.parameters,
num_samples=(
math.ceil(
weight
* (
sampling.parameters.num_samples
+ 5 * (sampling.parameters.num_samples * (1 - weight)) ** 0.5
)
)
if self.legacy
else math.ceil(weight * sampling.parameters.num_samples) + 1
),
num_samples=math.ceil(weight * sampling.parameters.num_samples) + 1,
),
# TODO: Seed may not be unique for nested blended datasets.
config=sampling.config.to_copy({"seed": sampling.config.seed + i * (0 if self.legacy else 697)}),
config=sampling.config.to_copy({"seed": sampling.config.seed + i * 697}),
),
)
for i, (dataset, weight) in enumerate(zip(self.datasets, self.weights, strict=True))
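
With the `legacy` flag removed, every blended sub-dataset gets `ceil(weight * num_samples) + 1` samples and a seed offset of `697 * i`. A small worked example of that arithmetic; the seed and weights are made up for illustration:

```python
# Worked example of the remaining (non-legacy) blending formulas from this diff.
import math

num_samples = 1000          # total samples requested from the blended dataset
weights = [0.7, 0.2, 0.1]   # assumed already normalized, as `normalize_probabilities` does
base_seed = 1234            # illustrative sampling seed

for i, weight in enumerate(weights):
    sub_samples = math.ceil(weight * num_samples) + 1  # the +1 gives a small surplus over the exact share
    sub_seed = base_seed + i * 697                      # distinct seed per sub-dataset
    print(f"dataset {i}: {sub_samples} samples, seed {sub_seed}")
# dataset 0: 701 samples, seed 1234
# dataset 1: 201 samples, seed 1931
# dataset 2: 101 samples, seed 2628
```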
fast_llm/data/dataset/gpt/config.py (172 changes: 2 additions & 170 deletions)
@@ -1,10 +1,8 @@
import dataclasses
import enum
import json
import pathlib
import time
import typing
import warnings

import yaml

@@ -22,8 +20,7 @@
SamplingData,
SamplingParameters,
)
from fast_llm.engine.distributed.config import PhaseType
from fast_llm.utils import Assert, normalize_probabilities, padded_cumsum
from fast_llm.utils import Assert

if typing.TYPE_CHECKING:
from fast_llm.data.dataset.gpt.indexed import GPTConcatenatedDataset, GPTDatasetSlice, GPTIndexedDataset
@@ -41,7 +38,6 @@ class ShufflingType(str, enum.Enum):
skip_first_epoch = "skip_first_epoch"
# Disable shuffling entirely.
disabled = "disabled"
legacy = "legacy"


@config_class()
@@ -222,53 +218,14 @@ def _convert_paths(self, config):
return config


# Add user-friendly names for the configs.
@config_class(dynamic_type={GPTSampledDatasetConfig: "concatenated_memmap"})
class GPTConcatenatedMemmapConfig(GPTIndexedDatasetConfig):
# TODO v0.3: Remove.
_abstract: typing.ClassVar[bool] = False
path: pathlib.Path = Field(
default=None,
desc="The path to a dataset directory.",
hint=FieldHint.core,
)

def _validate(self) -> None:
warnings.warn("`concatenated_memmap` dataset is deprecated. Use `file` instead.", DeprecationWarning)
super()._validate()

def build(self) -> "GPTConcatenatedDataset":

assert self.path.is_dir()
index_path = self.path / "index.txt"

if index_path.is_file():
prefixes = [self.path / line.strip() for line in index_path.open("r").readlines()]
else:
warnings.warn(
f"The dataset path {self.path} points to a directory."
" The dataset will be indexed automatically, which may be unsafe."
" We recommend using an index file instead."
)
prefixes = [
path.with_suffix("")
for path in self.path.iterdir()
if path.suffix == ".idx" and path.is_file() and path.with_suffix(".bin").is_file()
]
dataset_config = GPTConcatenatedDatasetConfig.from_dict(
{"datasets": [{"type": "memmap", "path": prefix} for prefix in prefixes]}
)
return dataset_config.build()


@config_class()
class FimConfig(Config):
"""
Configuration for FIM.
"""

rate: float = Field(
# TODO: Use meaningful default now that fim is a wrapper? (bad for legacy config)
# TODO: Use meaningful default now that fim is a wrapper?
default=0.0,
desc="FIM rate for each sample.",
hint=FieldHint.core,
@@ -352,131 +309,6 @@ def build_and_sample(
return GPTFimDataset(self, self.dataset.build_and_sample(sampling), sampling)


class LegacyDatasetSource(str, enum.Enum):
"""
An enum for the different ways to load datasets.
"""

list = "list"
file = "file"
random = "random"


def _validate_split(value: list[int]) -> list[int]:
Assert.leq(len(value), 3)
return value + [0] * (len(value) - 3)


def _validate_path(value: str | list[str]) -> list[str]:
return [value] if isinstance(value, str) else value


@config_class()
class GPTLegacyConfig(Config):
split: list[float] = Field(
default_factory=lambda: [969, 30, 1],
desc="Split ratio for train, valid and test datasets.",
hint=FieldHint.deprecated,
valid=_validate_split,
)
format: LegacyDatasetSource = Field(
default=LegacyDatasetSource.list,
desc="Format for the dataset definition.",
hint=FieldHint.deprecated,
)
path: list[str] = Field(
default_factory=list,
desc="Path or list of paths and weights.",
hint=FieldHint.deprecated,
valid=_validate_path,
)
fim: FimConfig = Field(
desc="Configuration for Fill In the Middle (FIM).",
hint=FieldHint.feature,
)


@config_class(dynamic_type={GPTSampledDatasetConfig: "legacy"})
class GPTLegacyDatasetConfig(GPTSampledDatasetConfig, GPTLegacyConfig):
_abstract: typing.ClassVar[bool] = False

def build_and_sample(self, sampling: GPTSamplingData) -> SampledDataset:

if self.format == LegacyDatasetSource.random:
Assert.eq(len(self.path), 0)
dataset_config = GPTRandomDatasetConfig()
else:
if self.format == LegacyDatasetSource.file:
Assert.eq(len(self.path), 1)
data_path = pathlib.Path(self.path[0])
dataset_defs = json.load(data_path.open("r"))
data_base_path = data_path.parent
dataset_prefixes = [
(data_base_path / dataset_def["prefix"]).resolve() for dataset_def in dataset_defs["datasets"]
]
dataset_weights = normalize_probabilities(
[dataset_def["weight"] for dataset_def in dataset_defs["datasets"]]
)
elif self.format == LegacyDatasetSource.list:
Assert.geq(len(self.path), 1)
if len(self.path) == 1:
dataset_prefixes, dataset_weights = [self.path[0].strip()], [1.0]
else:
Assert.custom(lambda x: x % 2 == 0, len(self.path))
dataset_prefixes = [pathlib.Path(x.strip()).resolve() for x in self.path[1::2]]
assert len(dataset_prefixes) == len(set(dataset_prefixes))
dataset_weights = normalize_probabilities([float(x) for x in self.path[::2]])
else:
raise NotImplementedError(self.format)

phase_splits = padded_cumsum(normalize_probabilities(self.split))

phase_index = {
PhaseType.training.value.lower(): 0,
PhaseType.validation.value.lower(): 1,
PhaseType.test.value.lower(): 2,
}[sampling.dataset_name]

dataset_configs = [
{
"type": "slice",
# TODO: this duplicates memmap datasets for each phase.
"dataset": {"type": "memmap", "path": prefix},
"begin": float(phase_splits[phase_index]),
"end": float(phase_splits[phase_index + 1]),
}
for prefix in dataset_prefixes
]
dataset_config = (
{
"type": "blended",
"name": "blended",
"datasets": dataset_configs,
"weights": dataset_weights,
"legacy": True,
}
if len(dataset_configs) > 1
else dataset_configs[0]
)
if self.fim.rate > 0:
dataset_config = {
"type": "fim",
"dataset": dataset_config,
**self.fim.to_dict(),
}
# Legacy sampling config
dataset_config = {
"type": "sampled",
"dataset": dataset_config,
"sampling": {
"seed": sampling.distributed.config.seed,
"shuffle": "legacy",
},
}

return GPTSampledDatasetConfig.from_dict(dataset_config).build_and_sample(sampling)


@config_class(dynamic_type={GPTSampledDatasetConfig: "test_slow"})
class GPTTestSlowDatasetConfig(GPTSampledDatasetConfig):
"""
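
The deleted `GPTLegacyDatasetConfig.build_and_sample` generated a nested slice/blended/fim config from the legacy `split`, `path` and `format` fields. The same shape can now be written out explicitly; a hedged sketch mirroring the structure the removed code used to emit, with placeholder paths, split boundaries and weights:

```python
# Hedged sketch of an explicit dataset config mirroring what the removed legacy
# code generated: per-file phase slices, blended with weights, optionally FIM-wrapped.
dataset_config = {
    "type": "blended",
    "datasets": [
        {
            "type": "slice",
            "dataset": {"type": "memmap", "path": "/data/dataset_a"},
            "begin": 0.0,    # training split boundary (placeholder)
            "end": 0.969,
        },
        {
            "type": "slice",
            "dataset": {"type": "memmap", "path": "/data/dataset_b"},
            "begin": 0.0,
            "end": 0.969,
        },
    ],
    "weights": [0.8, 0.2],   # placeholder blending weights
}
```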