Merge pull request #781 from OptimalScale/yizhenjia-custom-template-doc

add chatml conversation template
OptimalScale · Apr 22, 2024 · 8cd15da
2 parents 19087e8 + 6b5cf35
commit 8cd15da
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 18 deletions.
diff --git a/docs/source/examples/DATASETS.md b/docs/source/examples/DATASETS.md
@@ -181,6 +181,7 @@ Conversations should be formatted before feeding into the model. As of now, we'v
 
 | Template Name | Filled Example | Detailed Template |
 | ------------- | -------------- | ----------------- |
+| `chatml` | `<\|im_start\|>system`<br>`You are a chatbot developed by LMFlow team.<\|im_end\|>`<br>`<\|im_start\|>user`<br>`Who are you?<\|im_end\|>`<br>`<\|im_start\|>assistant`<br>`I am a chatbot developed by LMFlow team.<\|im_end\|>`<br>`<\|im_start\|>user`<br>`How old are you?<\|im_end\|>`<br>`<\|im_start\|>assistant`<br>`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|im_end\|>`<br> | [Link](./supported_conversation_template.md#chatml) |
 | `llama3` | `<\|begin_of_text\|><\|start_header_id\|>system<\|end_header_id\|>`<br><br>`You are a chatbot developed by LMFlow team.<\|eot_id\|><\|start_header_id\|>user<\|end_header_id\|>`<br><br>`Who are you?<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>`<br><br>`I am a chatbot developed by LMFlow team.<\|eot_id\|><\|start_header_id\|>user<\|end_header_id\|>`<br><br>`How old are you?<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>`<br><br>`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|eot_id\|>` | [Link](./supported_conversation_template.md#llama-3) |
 | `llama2` | `<s>[INST] <<SYS>>`<br>`You are a chatbot developed by LMFlow team.`<br>`<</SYS>>`<br><br>`Who are you? [/INST] I am a chatbot developed by LMFlow team.</s><s>[INST] How old are you? [/INST] I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.</s>` | [Link](./supported_conversation_template.md#llama-2) |
 | `qwen2` | `<\|im_start\|>system`<br>`You are a chatbot developed by LMFlow team.<\|im_end\|>`<br>`<\|im_start\|>user`<br>`Who are you?<\|im_end\|>`<br>`<\|im_start\|>assistant`<br>`I am a chatbot developed by LMFlow team.<\|im_end\|>`<br>`<\|im_start\|>user`<br>`How old are you?<\|im_end\|>`<br>`<\|im_start\|>assistant`<br>`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|im_end\|>`<br> | [Link](./supported_conversation_template.md#qwen-2) |

diff --git a/docs/source/examples/supported_conversation_template.md b/docs/source/examples/supported_conversation_template.md
@@ -24,6 +24,39 @@ This template is not preset in LMFlow currently. We are working on it and will
 ```
 
 
+## ChatML
+**With a system message**
+```
+<|im_start|>system\n{{system_message}}<|im_end|>\n<|im_start|>user\n{{user_message_0}}<|im_end|>\n
+```
+
+**Without a system message**
+```
+<|im_start|>user\n{{user_message_0}}<|im_end|>\n
+```
+
+**A complete conversation**
+```
+<|im_start|>system\n{{system_message}}<|im_end|>\n<|im_start|>user\n{{user_message_0}}<|im_end|>\n<|im_start|>assistant\n{{assistant_reply_0}}<|im_end|>\n
+```
+
+**Multiple rounds**
+```
+<|im_start|>system\n{{system_message}}<|im_end|>\n<|im_start|>user\n{{user_message_0}}<|im_end|>\n<|im_start|>assistant\n{{assistant_reply_0}}<|im_end|>\n<|im_start|>user\n{{user_message_1}}<|im_end|>\n<|im_start|>assistant\n{{assistant_reply_1}}<|im_end|>\n
+```
+
+**jinja template**  
+[[Reference](https://huggingface.co/mlabonne/OrpoLlama-3-8B/blob/3534d0562dee3a541d015ef908a71b0aa9085488/tokenizer_config.json#L2073)]
+```
+{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
+```
+
+**Filled Example**
+```
+<|im_start|>system\nYou are a chatbot developed by LMFlow team.<|im_end|>\n<|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\nI am a chatbot developed by LMFlow team.<|im_end|>\n<|im_start|>user\nHow old are you?<|im_end|>\n<|im_start|>assistant\nI don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<|im_end|>\n
+```
+
+
 ## InternLM2
 ```{admonition} **Work in Progress**
 :class: info
@@ -165,7 +198,7 @@ The conversation template for Mixtral 8x7B is slightly different from the templa
 **jinja template**  
 [[Reference](https://huggingface.co/Qwen/Qwen1.5-72B/blob/93bac0d1ae83d50c43b1793e2d74a00dc43a4c36/tokenizer_config.json#L31)]
 ```
-"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
+{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
 ```
 
 **Filled Example**

diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
@@ -62,6 +62,7 @@
 from lmflow.utils.conversation_template import (
     ConversationTemplate, 
     EmptyConversationTemplate, 
+    ChatMLConversationTemplate,
     Llama2ConversationTemplate,
     Llama3ConversationTemplate,
     Qwen2ConversationTemplate,
@@ -471,6 +472,8 @@ def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs):
                         conversation_template = Llama2ConversationTemplate()
                     elif data_args.conversation_template == 'llama3':
                         conversation_template = Llama3ConversationTemplate()
+                    elif data_args.conversation_template == 'chatml':
+                        conversation_template = ChatMLConversationTemplate()
                     elif data_args.conversation_template == 'qwen2':
                         conversation_template = Qwen2ConversationTemplate()
                     elif data_args.conversation_template == 'empty':

diff --git a/src/lmflow/utils/conversation_template.py b/src/lmflow/utils/conversation_template.py
@@ -219,7 +219,26 @@ def _ensure_id_list(self, obj: Union[int, List[int]]) -> List[int]:
         else:
             raise ValueError(f"Object type {type(obj)} is not supported yet.")
 
-
+
+@dataclass
+class ChatMLConversationTemplate(ConversationTemplate):
+    user_formatter: Formatter = StringFormatter(
+        template=[
+            TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n')
+        ]
+    )
+    assistant_formatter: Formatter = StringFormatter(
+        template=[
+            TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n')
+        ]
+    )
+    system_formatter: Formatter = StringFormatter(
+        template=[
+            TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n')
+        ]
+    )
+
+
 @dataclass
 class EmptyConversationTemplate(ConversationTemplate):
     user_formatter: Formatter = StringFormatter(
@@ -327,20 +346,5 @@ def _encode(
 
 
 @dataclass
-class Qwen2ConversationTemplate(ConversationTemplate):
-    user_formatter: Formatter = StringFormatter(
-        template=[
-            TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n')
-        ]
-    )
-    assistant_formatter: Formatter = StringFormatter(
-        template=[
-            TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n')
-        ]
-    )
-    system_formatter: Formatter = StringFormatter(
-        template=[
-            TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n')
-        ]
-    )
+class Qwen2ConversationTemplate(ChatMLConversationTemplate):
     separator: TemplateComponent = TemplateComponent(type='string', content='\n')