OptimalScale · research4pan · May 16, 2024 · May 16, 2024
diff --git a/docs/source/examples/customize_conversation_template.md b/docs/source/examples/customize_conversation_template.md
@@ -50,28 +50,72 @@ Recall the requirements for a conversation dataset:
 System message, user message, and assistant message are strings thus we can use `StringFormatter` for them.
 
 ### 3. Build the template
-```python
-from dataclasses import dataclass
+All preset templates are located at `src/lmflow/utils/conversation_template`.
+
+Within the template file, define your own template like:
 
-from lmflow.utils.conversation_formatter import Formatter, TemplateComponent, StringFormatter
-from lmflow.utils.conversation_template import ConversationTemplate
+```python
+from .base import StringFormatter, TemplateComponent, ConversationTemplate
 
 
-@dataclass
-class ChatMLConversationTemplate(ConversationTemplate):
-    user_formatter: Formatter = StringFormatter(
+YOUR_TEMPLATE = ConversationTemplate(
+    template_name='your_template_name',
+    user_formatter=StringFormatter(
         template=[
-            TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n')
+            TemplateComponent(type='string', content='User:\n{{content}}\n\n')
         ]
-    )
-    assistant_formatter: Formatter = StringFormatter(
+    ),
+    assistant_formatter=StringFormatter(
         template=[
-            TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n')
+            TemplateComponent(type='string', content='Assistant:\n{{content}}\n\n'),
+            TemplateComponent(type='token', content='eos_token') # this will add the eos token at the end of every assistant message
+            # please refer to the docstring of the `TemplateComponent` class to 
+            # see the difference between different types of components.
         ]
-    )
-    system_formatter: Formatter = StringFormatter(
+    ),
+    system_formatter=StringFormatter(
         template=[
-            TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n')
+            TemplateComponent(type='string', content='System:\n{{content}}\n\n')
         ]
     )
+    # For models that have ONLY ONE bos token at the beginning of
+    # a conversation session (not one per conversation pair), you can
+    # specify a special starter, which is added once at the very
+    # beginning of the conversation session.
+    # eg:
+    #   llama-2: <s> and </s> at every pair of conversation 
+    #   v.s.
+    #   llama-3: <|begin_of_text|> only at the beginning of a session
+    special_starter=TemplateComponent(type='token', content='bos_token'),
+    # Similar to the special starter...
+    special_stopper=TemplateComponent(type='token', content='eos_token')
+
+)
+```
+
+Feel free to create your own template by inheriting from the `ConversationTemplate` class. The Llama-2 vs. Llama-3 templates are good examples to refer to.
+
+### 4. Register your template
+After defining your own template, you need to register it in the `src/lmflow/utils/conversation_template/__init__.py` file. 
+
+```python
+# ...
+from .your_template_file import YOUR_TEMPLATE
+
+
+PRESET_TEMPLATES = {
+    #...
+    'your_template_name': YOUR_TEMPLATE,
+}
+```
+
+### 5. Use your template
+You are all set! Specify the template name in, for example, your finetune script:
+
+```bash
+./scripts/run_finetune.sh \
+    --model_name_or_path path_to_your_model \
+    --dataset_path your_conversation_dataset \
+    --conversation_template your_template_name \
+    --output_model_path output_models/your_model
 ```
diff --git a/src/lmflow/utils/conversation_template/base.py b/src/lmflow/utils/conversation_template/base.py
@@ -15,6 +15,38 @@
 
 @dataclass
 class TemplateComponent:
+    """The minimal unit of a template, which can be a token, a string, or a list of tools.
+
+    Parameters
+    ----------
+    type : Literal['token', 'token_id', 'string', 'tools']
+        - Type of the component.  
+
+        - When the component is a token or a string, the content should be a `str`.
+        The difference between the two is that a token will be converted to token ids
+        by the tokenizer.convert_tokens_to_ids() method, while a string will be directly
+        encoded by the tokenizer.encode() method. In particular, since the bos token and eos
+        token are frequently used across different templates, we provide the convenience
+        of using `'bos_token'` and `'eos_token'` to represent the actual bos and eos tokens when
+        the `type` of the `TemplateComponent` is `token`. For example:  
+
+        ```python
+        TemplateComponent(type='token', content='bos_token')
+        ```
+
+        After encoding, the content will be replaced by the actual token id of the bos token.
+        Please do remember that if you set the `type` to `string`, the tokenizer will try to 
+        encode the string 'bos_token' instead of providing the actual bos token.
+
+        - When the component is token_id, the content should be `int` or `List[int]`, and 
+        will be directly appended to the encoded token ids.
+
+        - Tools are not supported yet.
+
+    content : Union[str, int, List[str], List[int]]
+        Content of the component.
+
+    """
     type: Literal['token', 'token_id', 'string', 'tools']
     content: Union[str, int, List[str], List[int]]
     mask: Optional[bool] = True # for token specific masking, work in progress