Merge pull request huggingface#30 from SaulLu/modeling_markup_lm_include_tags_dic_into_tokenizer_config

Modeling markup lm include tags dic into tokenizer config
NielsRogge committed Jan 11, 2022
2 parents 447ac10 + 3a8d067 commit 9624b9f
Showing 3 changed files with 11 additions and 31 deletions.
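
In short: tags_dict is no longer shipped as a separate tags_dict.json vocabulary file. The MarkupLM tokenizers now receive tags_dict as a plain dict in __init__ and persist it through tokenizer_config.json like any other init kwarg. Below is a minimal sketch of the new serialization convention; the sample mapping and the tokenizer_config.json file name come from this diff, while the paths and round-trip check are illustrative:

```python
import json
import os
import tempfile

# Sample tag-to-id mapping, as used in the updated test below.
tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}

save_directory = tempfile.mkdtemp()
config_path = os.path.join(save_directory, "tokenizer_config.json")

# After this PR, tags_dict lives inside tokenizer_config.json rather than
# in a standalone tags_dict.json; from_pretrained reads this file and passes
# its keys to the tokenizer's __init__ as keyword arguments.
with open(config_path, "w", encoding="utf-8") as fp:
    fp.write(json.dumps({"tags_dict": tags_dict}))

# Round-trip check: the mapping comes back as a plain dict.
with open(config_path, encoding="utf-8") as fp:
    assert json.load(fp)["tags_dict"] == tags_dict
```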
18 changes: 3 additions & 15 deletions src/transformers/models/markuplm/tokenization_markuplm.py
@@ -48,7 +48,6 @@
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
-    "tags_dict": "tags_dict.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -60,10 +59,6 @@
         "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
         "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
     },
-    "tags_dict": {
-        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/tags_dict.json",
-        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/tags_dict.json",
-    },
 }


@@ -261,8 +256,8 @@ def __init__(
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
-        with open(tags_dict, encoding="utf-8") as tags_dict_handle:
-            self.tags_dict = json.load(tags_dict_handle)
+
+        self.tags_dict = tags_dict
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
@@ -425,9 +420,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         merge_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )
-        tags_dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tags_dict"]
-        )
 
         # save vocab_file
         with open(vocab_file, "w", encoding="utf-8") as f:
@@ -447,11 +439,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
                 writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
-        # save tags_dict_file
-        with open(tags_dict_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.tags_dict, ensure_ascii=False))
-
-        return vocab_file, merge_file, tags_dict_file
+        return vocab_file, merge_file
 
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
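
With tags_dict removed from VOCAB_FILES_NAMES, the slow tokenizer's save_vocabulary writes and returns only the two BPE files. A hedged usage sketch (it assumes this branch exposes MarkupLMTokenizer at the top level and that a compatible checkpoint exists; neither is shown in the diff):

```python
# Hedged sketch, not a verified snippet from the repository.
from transformers import MarkupLMTokenizer  # assumed import path on this branch

tokenizer = MarkupLMTokenizer.from_pretrained("microsoft/markuplm-base")
# save_vocabulary now returns (vocab_file, merge_file); there is no
# tags_dict_file in the tuple anymore.
vocab_file, merge_file = tokenizer.save_vocabulary("/tmp/markuplm")
# The mapping is still available on the instance as a plain dict:
print(type(tokenizer.tags_dict))  # <class 'dict'>
```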
15 changes: 3 additions & 12 deletions src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -53,7 +53,6 @@
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
     "tokenizer_file": "tokenizer.json",
-    "tags_dict": "tags_dict.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -65,10 +64,6 @@
         "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
         "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
     },
-    "tags_dict": {
-        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/tags_dict.json",
-        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/tags_dict.json",
-    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -179,8 +174,8 @@ def __init__(
             self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
 
         self.add_prefix_space = add_prefix_space
-        with open(tags_dict, encoding="utf-8") as tags_dict_handle:
-            self.tags_dict = json.load(tags_dict_handle)
+
+        self.tags_dict = tags_dict
 
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
@@ -727,8 +722,4 @@ def create_token_type_ids_from_sequences(
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
 
-        tags_dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tags_dict"]
-        )
-
-        return tuple(files) + (tags_dict_file,)
+        return tuple(files)
9 changes: 5 additions & 4 deletions tests/test_tokenization_markuplm.py
@@ -71,18 +71,19 @@ def setUp(self):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
+        self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
         self.special_tokens_map = {"unk_token": "<unk>"}
 
         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        self.tags_dict = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tags_dict"])
+        self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
 
         with open(self.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
         with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
-        with open(self.tags_dict, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(tags_dict) + "\n")
+        with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps({"tags_dict": self.tags_dict}))
 
     # def get_clean_sequence(self, tokenizer):
     #     html_string = "<html> hello world </html>"
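
The rewritten setUp works because of standard tokenizer loading behavior: from_pretrained reads tokenizer_config.json from the directory and forwards its keys as keyword arguments to __init__, which after this change stores tags_dict as-is. A self-contained sketch of that flow, using stand-ins (TinyTokenizer and from_pretrained_like are illustrative, not transformers APIs):

```python
import json
import os
import tempfile

class TinyTokenizer:
    # Stand-in for the real tokenizer: after this PR, __init__ simply keeps
    # the dict it is given instead of opening a tags_dict.json file.
    def __init__(self, tags_dict=None, **kwargs):
        self.tags_dict = tags_dict

def from_pretrained_like(directory):
    # Mimics the relevant part of from_pretrained: read tokenizer_config.json
    # and forward its keys as init kwargs.
    with open(os.path.join(directory, "tokenizer_config.json"), encoding="utf-8") as fp:
        init_kwargs = json.load(fp)
    return TinyTokenizer(**init_kwargs)

tmpdirname = tempfile.mkdtemp()
with open(os.path.join(tmpdirname, "tokenizer_config.json"), "w", encoding="utf-8") as fp:
    fp.write(json.dumps({"tags_dict": {"a": 0, "abbr": 1, "acronym": 2, "address": 3}}))

tok = from_pretrained_like(tmpdirname)
assert tok.tags_dict["abbr"] == 1
```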
