fix tokenization logic (#4565)
Confirmed that the PR fixes the performance issue. Force-merging to bypass the known CI bug.
sijunhe committed Jan 30, 2023
1 parent d6d597d commit 79d9a37
Showing 1 changed file with 64 additions and 37 deletions.
101 changes: 64 additions & 37 deletions model_zoo/ernie-3.0/utils.py
@@ -59,11 +59,14 @@ def prepare_train_features(examples, tokenizer, args, dynamic_max_length: Option
         max_length = get_dynamic_max_length(
             examples=tokenized_examples, default_max_length=args.max_seq_length, dynamic_max_length=dynamic_max_length
         )
+        # always pad to max_length
+        tokenized_examples = tokenizer(
+            questions, contexts, stride=args.doc_stride, max_length=max_length, padding="max_length", truncation=True
+        )
     else:
-        max_length = args.max_seq_length
-    tokenized_examples = tokenizer(
-        questions, contexts, stride=args.doc_stride, max_length=max_length, padding="max_length", truncation=True
-    )
+        tokenized_examples = tokenizer(
+            questions, contexts, stride=args.doc_stride, max_length=args.max_seq_length, truncation=True
+        )
 
     # Since one example might give us several features if it has a long context, we need a map from a feature to
     # its corresponding example. This key gives us just that.
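The performance issue being fixed: with padding="max_length" on the default path, every feature was padded to args.max_seq_length, so the model always processed full-length sequences even for short inputs. After the fix the default path only truncates, and padding can instead happen per batch, up to the longest feature in that batch. A self-contained sketch of that batch-level padding (hypothetical helper, not code from this repository; pad_token_id=0 is illustrative):

def pad_to_longest(batch_input_ids, pad_token_id=0):
    # Pad each sequence only as far as the longest sequence in this batch.
    batch_max = max(len(ids) for ids in batch_input_ids)
    return [ids + [pad_token_id] * (batch_max - len(ids)) for ids in batch_input_ids]

# Three features of length 23, 57 and 41 are padded to 57 tokens each,
# instead of all being padded to max_seq_length (for example 512) up front.
padded = pad_to_longest([[1] * 23, [1] * 57, [1] * 41])
assert [len(ids) for ids in padded] == [57, 57, 57]
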
@@ -140,11 +143,14 @@ def prepare_validation_features(examples, tokenizer, args, dynamic_max_length: O
         max_length = get_dynamic_max_length(
             examples=tokenized_examples, default_max_length=args.max_seq_length, dynamic_max_length=dynamic_max_length
         )
+        # always pad to max_length
+        tokenized_examples = tokenizer(
+            questions, contexts, stride=args.doc_stride, max_length=max_length, padding="max_length", truncation=True
+        )
     else:
-        max_length = args.max_seq_length
-    tokenized_examples = tokenizer(
-        questions, contexts, stride=args.doc_stride, max_length=max_length, padding="max_length", truncation=True
-    )
+        tokenized_examples = tokenizer(
+            questions, contexts, stride=args.doc_stride, max_length=args.max_seq_length, truncation=True
+        )
     # Since one example might give us several features if it has a long context, we need a map from a feature to
     # its corresponding example. This key gives us just that.
     sample_mapping = tokenized_examples.pop("overflow_to_sample")
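The surrounding context ("one example might give us several features") refers to the sliding-window behavior: a long context is split into overlapping chunks of at most max_length tokens, with doc_stride tokens of overlap, and overflow_to_sample records which original example each chunk came from. A toy illustration of that mapping (hypothetical helper in plain Python, ignoring question and special tokens, and assuming stride means the overlap between consecutive windows):

def window_counts(context_lengths, max_length, stride):
    """Return the overflow_to_sample mapping produced by a sliding window."""
    overflow_to_sample = []
    for sample_idx, length in enumerate(context_lengths):
        # Each extra window advances by (max_length - stride) tokens until the context is covered.
        step = max_length - stride
        extra = max(0, -(-(length - max_length) // step))
        overflow_to_sample.extend([sample_idx] * (1 + extra))
    return overflow_to_sample

# Contexts of 100, 700 and 300 tokens with max_length=384, stride=128 yield 1, 3 and 1 features.
print(window_counts([100, 700, 300], max_length=384, stride=128))  # [0, 1, 1, 1, 2]
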
@@ -315,9 +321,10 @@ def seq_convert_example(
             max_length = get_dynamic_max_length(
                 examples=temp_example, default_max_length=max_seq_length, dynamic_max_length=dynamic_max_length
             )
+            # always pad to max_length
+            example = tokenizer(example["sentence"], max_length=max_length, padding="max_length", truncation=True)
         else:
-            max_length = max_seq_length
-        example = tokenizer(example["sentence"], max_length=max_length, padding="max_length", truncation=True)
+            example = tokenizer(example["sentence"], max_length=max_seq_length, truncation=True)
     elif "sentence1" in example:
         if dynamic_max_length is not None:
             temp_example = tokenizer(
@@ -329,15 +336,21 @@
             max_length = get_dynamic_max_length(
                 examples=temp_example, default_max_length=max_seq_length, dynamic_max_length=dynamic_max_length
             )
+            example = tokenizer(
+                example["sentence1"],
+                text_pair=example["sentence2"],
+                max_length=max_length,
+                padding="max_length",
+                truncation=True,
+            )
         else:
-            max_length = max_seq_length
-        example = tokenizer(
-            example["sentence1"],
-            text_pair=example["sentence2"],
-            max_length=max_length,
-            padding="max_length",
-            truncation=True,
-        )
+            example = tokenizer(
+                example["sentence1"],
+                text_pair=example["sentence2"],
+                max_length=max_seq_length,
+                truncation=True,
+            )
+
     if not is_test:
         if "token_type_ids" in example:
             return {"input_ids": example["input_ids"], "token_type_ids": example["token_type_ids"], "labels": label}
@@ -369,16 +382,23 @@ def token_convert_example(
             max_length = get_dynamic_max_length(
                 examples=tokenized_input, default_max_length=max_seq_length, dynamic_max_length=dynamic_max_length
             )
+            # always pad to max_length
+            tokenized_input = tokenizer(
+                example,
+                is_split_into_words=True,
+                max_length=max_length,
+                padding="max_length",
+                truncation=True,
+                return_length=return_length,
+            )
         else:
-            max_length = max_seq_length
-        tokenized_input = tokenizer(
-            example,
-            is_split_into_words=True,
-            max_length=max_length,
-            padding="max_length",
-            truncation=True,
-            return_length=return_length,
-        )
+            tokenized_input = tokenizer(
+                example,
+                is_split_into_words=True,
+                max_length=max_seq_length,
+                truncation=True,
+                return_length=return_length,
+            )
 
         # -2 for [CLS] and [SEP]
         if len(tokenized_input["input_ids"]) - 2 < len(labels):
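The "# -2 for [CLS] and [SEP]" check right after tokenization clips the word-level labels when truncation shortens the input: two of the len(input_ids) positions belong to the special tokens, so at most len(input_ids) - 2 labels can stay aligned. A small worked example with illustrative numbers:

# Suppose truncation left 128 token ids: [CLS] + 126 real tokens + [SEP].
input_ids = list(range(128))
labels = list(range(200))  # word-level labels for the untruncated sentence

if len(input_ids) - 2 < len(labels):
    labels = labels[: len(input_ids) - 2]
assert len(labels) == 126
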
@@ -406,17 +426,24 @@ def token_convert_example(
             max_length = get_dynamic_max_length(
                 examples=tokenized_input, default_max_length=max_seq_length, dynamic_max_length=dynamic_max_length
             )
+            # always pad to max_length
+            tokenized_input = tokenizer(
+                example["tokens"],
+                max_length=max_length,
+                padding="max_length",
+                truncation=True,
+                is_split_into_words=True,
+                return_length=return_length,
+            )
         else:
-            max_length = max_seq_length
-
-        tokenized_input = tokenizer(
-            example["tokens"],
-            max_length=max_length,
-            padding="max_length",
-            truncation=True,
-            is_split_into_words=True,
-            return_length=return_length,
-        )
+            tokenized_input = tokenizer(
+                example["tokens"],
+                max_length=max_seq_length,
+                truncation=True,
+                is_split_into_words=True,
+                return_length=return_length,
+            )
+
         label_ids = example["ner_tags"]
         if len(tokenized_input["input_ids"]) - 2 < len(label_ids):
             label_ids = label_ids[: len(tokenized_input["input_ids"]) - 2]
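One consequence of dropping padding="max_length" on this token-classification path: input_ids and the per-token label ids now vary in length, so any batch-time padding has to extend both by the same amount, filling the label side with a value the loss ignores. A hypothetical sketch (the sample dict layout and pad ids are illustrative, not taken from this file):

def pad_ner_batch(batch, pad_token_id=0, pad_label_id=-100):
    # Pad input_ids and labels of every sample to the longest sample in the batch,
    # padding both by the same amount so their relative alignment is unchanged.
    batch_max = max(len(sample["input_ids"]) for sample in batch)
    for sample in batch:
        gap = batch_max - len(sample["input_ids"])
        sample["input_ids"] = sample["input_ids"] + [pad_token_id] * gap
        sample["labels"] = sample["labels"] + [pad_label_id] * gap
    return batch
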
