Merge pull request huggingface#30 from SaulLu/modeling_markup_lm_include_tags_dic_into_tokenizer_config

Modeling markup lm include tags dic into tokenizer config
NielsRogge committed Jan 11, 2022
2 parents 447ac10 + 3a8d067 commit 9624b9f
Showing 3 changed files with 11 additions and 31 deletions.
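
In short: tags_dict is no longer shipped as a separate tags_dict.json vocabulary file. The MarkupLM tokenizers now receive tags_dict as a plain dict in __init__ and persist it through tokenizer_config.json like any other init kwarg. Below is a minimal sketch of the new serialization convention; the sample mapping and the tokenizer_config.json file name come from this diff, while the paths and round-trip check are illustrative:

```python
import json
import os
import tempfile

# Sample tag-to-id mapping, as used in the updated test below.
tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}

save_directory = tempfile.mkdtemp()
config_path = os.path.join(save_directory, "tokenizer_config.json")

# After this PR, tags_dict lives inside tokenizer_config.json rather than
# in a standalone tags_dict.json; from_pretrained reads this file and passes
# its keys to the tokenizer's __init__ as keyword arguments.
with open(config_path, "w", encoding="utf-8") as fp:
    fp.write(json.dumps({"tags_dict": tags_dict}))

# Round-trip check: the mapping comes back as a plain dict.
with open(config_path, encoding="utf-8") as fp:
    assert json.load(fp)["tags_dict"] == tags_dict
```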
18 changes: 3 additions & 15 deletions src/transformers/models/markuplm/tokenization_markuplm.py
@@ -48,7 +48,6 @@
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
-    "tags_dict": "tags_dict.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -60,10 +59,6 @@
         "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
         "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
     },
-    "tags_dict": {
-        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/tags_dict.json",
-        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/tags_dict.json",
-    },
 }


@@ -261,8 +256,8 @@ def __init__(
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
-        with open(tags_dict, encoding="utf-8") as tags_dict_handle:
-            self.tags_dict = json.load(tags_dict_handle)
+
+        self.tags_dict = tags_dict
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
@@ -425,9 +420,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         merge_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )
-        tags_dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tags_dict"]
-        )
 
         # save vocab_file
         with open(vocab_file, "w", encoding="utf-8") as f:
@@ -447,11 +439,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
                 writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
-        # save tags_dict_file
-        with open(tags_dict_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.tags_dict, ensure_ascii=False))
-
-        return vocab_file, merge_file, tags_dict_file
+        return vocab_file, merge_file
 
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
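
With tags_dict removed from VOCAB_FILES_NAMES, the slow tokenizer's save_vocabulary writes and returns only the two BPE files. A hedged usage sketch (it assumes this branch exposes MarkupLMTokenizer at the top level and that a compatible checkpoint exists; neither is shown in the diff):

```python
# Hedged sketch, not a verified snippet from the repository.
from transformers import MarkupLMTokenizer  # assumed import path on this branch

tokenizer = MarkupLMTokenizer.from_pretrained("microsoft/markuplm-base")
# save_vocabulary now returns (vocab_file, merge_file); there is no
# tags_dict_file in the tuple anymore.
vocab_file, merge_file = tokenizer.save_vocabulary("/tmp/markuplm")
# The mapping is still available on the instance as a plain dict:
print(type(tokenizer.tags_dict))  # <class 'dict'>
```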
15 changes: 3 additions & 12 deletions src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -53,7 +53,6 @@
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
     "tokenizer_file": "tokenizer.json",
-    "tags_dict": "tags_dict.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -65,10 +64,6 @@
         "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
         "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
     },
-    "tags_dict": {
-        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/tags_dict.json",
-        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/tags_dict.json",
-    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -179,8 +174,8 @@ def __init__(
             self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
 
         self.add_prefix_space = add_prefix_space
-        with open(tags_dict, encoding="utf-8") as tags_dict_handle:
-            self.tags_dict = json.load(tags_dict_handle)
+
+        self.tags_dict = tags_dict
 
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
@@ -727,8 +722,4 @@ def create_token_type_ids_from_sequences(
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
 
-        tags_dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tags_dict"]
-        )
-
-        return tuple(files) + (tags_dict_file,)
+        return tuple(files)
9 changes: 5 additions & 4 deletions tests/test_tokenization_markuplm.py
@@ -71,18 +71,19 @@ def setUp(self):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
+        self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
         self.special_tokens_map = {"unk_token": "<unk>"}
 
         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        self.tags_dict = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tags_dict"])
+        self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
 
         with open(self.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
         with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
-        with open(self.tags_dict, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(tags_dict) + "\n")
+        with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps({"tags_dict": self.tags_dict}))
 
     # def get_clean_sequence(self, tokenizer):
     #     html_string = "<html> hello world </html>"
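
The rewritten setUp works because of standard tokenizer loading behavior: from_pretrained reads tokenizer_config.json from the directory and forwards its keys as keyword arguments to __init__, which after this change stores tags_dict as-is. A self-contained sketch of that flow, using stand-ins (TinyTokenizer and from_pretrained_like are illustrative, not transformers APIs):

```python
import json
import os
import tempfile

class TinyTokenizer:
    # Stand-in for the real tokenizer: after this PR, __init__ simply keeps
    # the dict it is given instead of opening a tags_dict.json file.
    def __init__(self, tags_dict=None, **kwargs):
        self.tags_dict = tags_dict

def from_pretrained_like(directory):
    # Mimics the relevant part of from_pretrained: read tokenizer_config.json
    # and forward its keys as init kwargs.
    with open(os.path.join(directory, "tokenizer_config.json"), encoding="utf-8") as fp:
        init_kwargs = json.load(fp)
    return TinyTokenizer(**init_kwargs)

tmpdirname = tempfile.mkdtemp()
with open(os.path.join(tmpdirname, "tokenizer_config.json"), "w", encoding="utf-8") as fp:
    fp.write(json.dumps({"tags_dict": {"a": 0, "abbr": 1, "acronym": 2, "address": 3}}))

tok = from_pretrained_like(tmpdirname)
assert tok.tags_dict["abbr"] == 1
```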
