ServiceNow · bigximik · Jun 19, 2025 · May 12, 2025 · May 12, 2025 · May 12, 2025
diff --git a/docs/quick-start.md b/docs/quick-start.md
@@ -492,10 +492,13 @@ Save the following as `fast-llm-tutorial/train-config.yaml`:
       train_iters: 100  # (1)!
       logs:
         interval: 10
-      evaluations:
+      evaluators:
         validation:
-          iterations: 25
           interval: 100
+          evaluator:
+            type: loss
+            iterations: 25
+            dataset_name: validation
       export:  # (2)!
         format: llama
         interval: 100
@@ -550,10 +553,13 @@ Save the following as `fast-llm-tutorial/train-config.yaml`:
       train_iters: 100_000  # (1)!
       logs:
         interval: 10
-      evaluations:
+      evaluators:
         validation:
-          iterations: 25
-          interval: 1000
+          interval: 100
+          evaluator:
+            type: loss
+            iterations: 25
+            dataset_name: validation
       checkpoint:
         interval: 1000
         keep: 5

diff --git a/docs/recipes/continue-training.md b/docs/recipes/continue-training.md
@@ -5,11 +5,13 @@ title: Continual Pretraining of Llama 3.1 8B or Qwen 2.5 7B
 
 In this guide, we provide step-by-step instructions to do continued pretraining on The Stack with Llama 3.1 8B  or Qwen 2.5 7B models.
 
-# Preliminary steps
+## Preliminary steps
+
 - [Quick Start](../quick-start.md)
 - [Data preparation](data-preparation.md)
 
-# Download the Pretrained Model
+## Download the Pretrained Model
+
 Let's download the model first:
 === "Llama 3.1 8B"
     ```bash
@@ -22,21 +24,27 @@ Let's download the model first:
     git clone https://huggingface.co/Qwen/Qwen2.5-7B ./fast-llm-tutorial/pretrained-model
     ```
 
-# Training
+## Training
+
 This is not much different from a pretraining config. We will:
+
 - specify the the model checkpoint to load and its format. Fast-LLM will automatically infer the corresponding model architecture.
 - adapt some of the training parameters for our needs.
 - and that's it!
 === "Llama 3.1 8B"
+
     ```yaml
     training:
       train_iters: 100_000
       logs:
         interval: 10
-      evaluations:
+      evaluators:
         validation:
-          iterations: 25
-          interval: 1000
+          interval: 100
+          evaluator:
+            type: loss
+            iterations: 25
+            dataset_name: validation
       checkpoint:
         interval: 1000
         keep: 5
@@ -55,8 +63,8 @@ This is not much different from a pretraining config. We will:
           path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml  # (2)!
         validation:
           type: file
-          path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (2)!  
-    optimizer:  
+          path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (2)!
+    optimizer:
       weight_decay: 0.1
       beta_1: 0.9
       beta_2: 0.95
@@ -78,20 +86,24 @@ This is not much different from a pretraining config. We will:
       multi_stage:
         zero_stage: 2
       distributed:
-        training_dtype: bf16  
+        training_dtype: bf16
     run:
       experiment_dir: fast-llm-tutorial/Llama-3.1-8B-cpt
     ```
+
 === "Qwen 2.5 7B"
     ```yaml
     training:
       train_iters: 100_000
       logs:
         interval: 10
-      validation:
-        Validation:
-          iterations: 25
-          interval: 1000
+      evaluators:
+        validation:
+          interval: 100
+          evaluator:
+            type: loss
+            iterations: 25
+            dataset_name: validation
       checkpoint:
         interval: 1000
         keep: 5
@@ -110,8 +122,8 @@ This is not much different from a pretraining config. We will:
           path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml  # (6)!
         validation:
           type: file
-          path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (6)! 
-    optimizer:  
+          path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (6)!
+    optimizer:
       weight_decay: 0.1
       beta_1: 0.9
       beta_2: 0.95
@@ -133,7 +145,7 @@ This is not much different from a pretraining config. We will:
       multi_stage:
         zero_stage: 2
       distributed:
-        training_dtype: bf16  
+        training_dtype: bf16
     run:
       experiment_dir: fast-llm-tutorial/qwen-2.5-7B-cpt
     ```
@@ -144,7 +156,8 @@ This is not much different from a pretraining config. We will:
 4.  Config of the pretrained model. We load the model downloaded from the repository earlier.
 5.  This tells Fast-LLM to load the weights of the pretrained model. If we wanted to use the model's configuration, but train from scratch, we could use the same config but set this to `no`.
 
-# Checkpoint usage
+## Checkpoint usage
+
 Checkpoints will be saved regularly, and every 20k steps a checkpoint will be exported in the HF format.
 You can use it in `transformers` as you would use the pretrained  model, except this one should be stronger on programming languages!
 === "Llama 3.1 8B"
@@ -160,4 +173,4 @@ You can use it in `transformers` as you would use the pretrained  model, except
 
     tokenizer = AutoTokenizer.from_pretrained("fast-llm-tutorial/pretrained-model")
     pipe = pipeline("text-generation", model="fast-llm-tutorial/qwen-2.5-7B-cpt/export/qwen2/20000/", tokenizer=tokenizer)
-    ```
+    ```
diff --git a/docs/recipes/data-configuration.md b/docs/recipes/data-configuration.md
@@ -25,13 +25,13 @@ We already saw an example dataset configuration in the [quick-start guide](../qu
 
 In this section we are interested in generalizing step 3. For more details on steps 1 and 2, please refer to the quick-start guide or [this example](data-configuration.md).
 
-The section `data.datasets` holds descriptions of datasets used in training, validation, and testing.  
+The section `data.datasets` holds descriptions of datasets used in training, validation, and testing.
 
-The Training and Testing phases must have predetermined dataset names: `training` and `testing`, respectively. Each of these phases can have only one dataset.  
+The Training and Testing phases must have predetermined dataset names: `training` and `testing`, respectively. Each of these phases can have only one dataset.
 
-For validation datasets, the rules are different. There can be as many validation datasets as needed, and their names are arbitrary. In the example above, the dataset name `validation` is chosen for simplicity. The datasets names used for validation and their application details are specified in the training config `evaluations` sections.  
+For datasets used by loss evaluators during the validation phase, the rules are different. There can be as many such datasets as needed, and their names are arbitrary. In the example above, the dataset name `validation` is chosen for simplicity. The dataset names used for validation and their application details are specified in the `evaluators` section of the training config.
 
-Adding multiple validation datasets increases flexibility in tracking the accuracy of your trained model. One possible scenario is using a separate validation dataset for each blended training dataset, allowing you to track training progress on each subset separately and observe how the model performs in real time on different subsets of your training data.  
+Adding multiple datasets for loss evaluators in the validation phase increases flexibility in tracking the accuracy of your trained model. One possible scenario is using a separate validation dataset for each blended training dataset, allowing you to track training progress on each subset separately and observe how the model performs in real time on different subsets of your training data.
 
 Below are examples of how to configure various aspects of training and validation datasets.
 
@@ -128,22 +128,27 @@ data:
 !!! note "Default seed"
     In the absence of explicit seed, Fast-LLM uses a default seed (`data.sampling`'s default) instead, and uses seed shifts to ensure different seeds for each phase and for the various blended datasets.
 
+## Example 5: Specifying Multiple Datasets for Loss Evaluators During the Validation Phase
 
-## Example 5: Specifying Multiple Validation Datasets  
+In this example, we show how to specify multiple datasets for loss evaluators and configure how often they are applied, along with their usage attributes in the `training.evaluators` section.
 
-In this example, we show how to specify multiple validation datasets and configure how often they are applied, along with their usage attributes in the `training.evaluations` section.  
-
-Please note that the same dataset names must be used in the `training.evaluations` section. If a validation dataset is specified in the `datasets` section but not in `training.evaluations`, it will not be used for validation.  
+Please note that the same dataset names must be used in the `training.evaluators` section. If a dataset is specified in the `datasets` section but not in `training.evaluators`, it will not be used for loss evaluation.
 
 ```yaml
 training:
-  evaluations:
+  evaluators:
     the_stack:
-      iterations: 25
       interval: 50
+      evaluator:
+        type: loss
+        iterations: 25
+        dataset_name: the_stack
     fineweb:
-      iterations: 25
       interval: 100
+      evaluator:
+        type: loss
+        iterations: 15
+        dataset_name: fineweb
 data:
   datasets:
     the_stack:
@@ -152,7 +157,7 @@ data:
     fineweb:
       type: file
       path: path/to/validation_fineweb_dataset.yaml
-      
+
 ```
 
 ## Example 6: Advanced scenario
@@ -207,7 +212,7 @@ data:
 
 !!! note "Configure from file"
     If a dataset configuration is especially complex and makes the dataset configuration excessively big, or is reused across many experiments, you may want to save it to a yaml file and refer to it un the config using a `file` dataset. This can be used to reduce the present example to
-    
+
     ```yaml
     data:
       datasets:

diff --git a/docs/recipes/instruction-finetuning.md b/docs/recipes/instruction-finetuning.md
@@ -114,10 +114,13 @@ training:
   train_iters: 5_000
   logs:
     interval: 1
-  evaluations:
+  evaluators:
     validation:
-      iterations: 25
-      interval: 1000
+      interval: 100
+      evaluator:
+        type: loss
+        iterations: 25
+        dataset_name: validation
   checkpoint:
     interval: 1000
     keep: 5

diff --git a/docs/recipes/train.md b/docs/recipes/train.md
@@ -4,13 +4,13 @@ title: Training Llama 3.1 8B
 
 Follow this guide to train a Llama-3.1 or Qwen 2.5 7B like model from scratch!
 
+## Preliminary steps
 
-# Preliminary steps
 - [Quick Start](../quick-start.md)
 - [Data preparation](data-preparation.md)
 
+## Training configuration
 
-# Training configuration
 In this guide, we show you how to configure a model architecture and train a model from scratch.
 Let's start from the following training configuration:
 === "Llama 3.1 8B"
@@ -19,10 +19,13 @@ Let's start from the following training configuration:
       train_iters: 100_000
       logs:
         interval: 10
-      evaluations:
+      evaluators:
         validation:
-          iterations: 25
-          interval: 1000
+          interval: 100
+          evaluator:
+            type: loss
+            iterations: 25
+            dataset_name: validation
       checkpoint:
         interval: 1000
         keep: 5
@@ -68,10 +70,13 @@ Let's start from the following training configuration:
       train_iters: 100_000
       logs:
         interval: 10
-      evaluations:
+      evaluators:
         validation:
-          iterations: 25
-          interval: 1000
+          interval: 100
+          evaluator:
+            type: loss
+            iterations: 25
+            dataset_name: validation
       checkpoint:
         interval: 1000
         keep: 5
@@ -133,16 +138,16 @@ By specifying a pretrained model from the HuggingFace hub, Fast-LLM automaticall
 === "Llama 3.1 8B"
     ```yaml
     pretrained:
-      format: llama  
+      format: llama
       path: fast-llm-tutorial/pretrained_model
-      model_weights: no 
+      model_weights: no
     ```
 === "Qwen 2.5 7B"
     ```yaml
     pretrained:
-      format: qwen2  
+      format: qwen2
       path: fast-llm-tutorial/pretrained_model
-      model_weights: no 
+      model_weights: no
     ```
 
 Alternatively, we define the model architecture ourselves as follows:
@@ -196,4 +201,3 @@ Alternatively, we define the model architecture ourselves as follows:
 1.  Hidden-size/num-layers will be used to provide good defaults for weight initialization std.
 
 Configuring the model this way is a bit more verbose than using the pretrained configuration, but gives an idea of how to configure a the model with Fast-LLM.
-
diff --git a/examples/mistral.yaml b/examples/mistral.yaml
@@ -3,9 +3,11 @@ training:
   num_workers: 8
   logs:
     interval: 10
-  evaluations:
+  evaluators:
     validation:
-      iterations: null
+      evaluator:
+        type: loss
+        iterations: null
   test_iters: 0
 batch:
   sequence_length: 4096

diff --git a/fast_llm/data/config.py b/fast_llm/data/config.py
@@ -34,3 +34,8 @@ class TokenizerConfig(Config):
         desc="Path to the tokenizer file.",
         hint=FieldHint.core,
     )
+    bos_token: str | None = Field(
+        default=None,
+        desc="BOS token to use if the tokenizer doesn't define one; must be an existing token.",
+        hint=FieldHint.core,
+    )
diff --git a/fast_llm/data/data/gpt/config.py b/fast_llm/data/data/gpt/config.py
@@ -85,5 +85,4 @@ def _from_dict(
                     assert rename not in default["datasets"]
                     default["datasets"][rename] = default["datasets"].pop(phase.value)
 
-        cls._handle_renamed_field(default, "validation", ("evaluations", "validation"))
         return super()._from_dict(default, strict, flat)
diff --git a/fast_llm/data/data/gpt/data.py b/fast_llm/data/data/gpt/data.py
@@ -117,8 +117,10 @@ def setup(
         self._datasets = {}
         for dataset_name, sampling_parameters in self._sampling_parameters.items():
             if self._tokenizer is not None:
-                # TODO: Too constraining?
-                Assert.eq(self._tokenizer.vocab_size, sampling_parameters.vocab_size)
+                # NOTE: Some models like Qwen2-1.5B-Instruct
+                # have vocab_size bigger in model config than in tokenizer
+                # TODO: Still, is it too constraining?
+                Assert.geq(sampling_parameters.vocab_size, self._tokenizer.vocab_size)
             if sampling_parameters.num_samples > 0:
                 sampling = GPTSamplingData(
                     config=self._config.sampling,

diff --git a/fast_llm/data/tokenizer.py b/fast_llm/data/tokenizer.py
@@ -1,6 +1,6 @@
 import numpy as np
 import torch
-from transformers import PreTrainedTokenizerFast
+from transformers import AutoTokenizer
 
 from fast_llm.data.config import TokenizerConfig
 from fast_llm.engine.config_utils.run import log_main_rank
@@ -13,9 +13,15 @@ class Tokenizer:
 
     def __init__(self, config: TokenizerConfig):
         log_main_rank(f"> loading tokenizer from {config.path} ...")
-        self.tokenizer: PreTrainedTokenizerFast = PreTrainedTokenizerFast.from_pretrained(
-            pretrained_model_name_or_path=config.path, errors="replace", max_len=None
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path=config.path,
+            errors="replace",
+            max_len=None,
+            trust_remote_code=True,
+            use_fast=True,
         )
+        if config.bos_token is not None:
+            self.tokenizer.bos_token = config.bos_token
         if self.tokenizer.eos_token_id is None:
             raise ValueError("Tokenizer does not have an EOS token.")
         if self.tokenizer.bos_token_id is None:
@@ -52,7 +58,7 @@ def tokenize_with_spans(
         token_spans = []
         char_pos = 0
         beginning_of_text = True
-        
+
         for start, end in char_spans:
             if char_pos < start:
                 curr_text = text[char_pos:start]