Skip to content

Commit 9299fab

Browse files
committed
Move some regexes into constants, bump to v0.3.4
1 parent 26484b1 commit 9299fab

File tree

2 files changed

+19
-7
lines changed

2 files changed

+19
-7
lines changed

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
unitytranslate_version = 0.3.3
1+
unitytranslate_version = 0.3.4
22

33
# This isn't actually used in the download process, however it's used for the sake of caching the downloaded files.
44
# https://github.com/OpenNMT/CTranslate2/blob/master/python/ctranslate2/version.py

native/src/bpe.rs

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ pub struct BPEConstants {
7272
IS_PUNCT: Regex,
7373
IS_OPEN_QUOTE: Regex,
7474

75+
SYMBOLS: Regex,
76+
NUMBERS: Regex,
77+
S_END: Regex,
78+
COLON: Regex,
79+
OPEN_QUOTES: Regex,
80+
7581
CJK_RANGES: Vec<(usize, usize)>,
7682
}
7783

@@ -159,6 +165,12 @@ impl Default for BPEConstants {
159165
r#"^['"„“`]+$"#
160166
).unwrap(),
161167

168+
SYMBOLS: Regex::new(r"^[?!:;\\%]$").unwrap(),
169+
NUMBERS: Regex::new(r"^[0-9]+$").unwrap(),
170+
S_END: Regex::new(r"s$").unwrap(),
171+
COLON: Regex::new(r"^:$").unwrap(),
172+
OPEN_QUOTES: Regex::new(r"^[„“”]+$").unwrap(),
173+
162174
CJK_RANGES: vec![
163175
(4352, 4607),
164176
(11904, 42191),
@@ -400,7 +412,7 @@ impl Tokenizer for BPETokenizer {
400412
detokenized_text += token;
401413
prepend_space = "".to_string();
402414
} else if self.constants.IS_PUNCT.is_match(token) {
403-
if lang == "fr" && Regex::new(r"^[?!:;\\%]$").unwrap().is_match(token) {
415+
if lang == "fr" && self.constants.SYMBOLS.is_match(token) {
404416
detokenized_text += " ";
405417
}
406418

@@ -410,9 +422,9 @@ impl Tokenizer for BPETokenizer {
410422
detokenized_text += token;
411423
prepend_space = " ".to_string();
412424
} else if lang == "cs" && i > 1
413-
&& Regex::new(r"^[0-9]+$").unwrap().is_match(tokens[tokens.len() - 2])
425+
&& self.constants.NUMBERS.is_match(tokens[tokens.len() - 2])
414426
&& Regex::new(r"^[.,]+$").unwrap().is_match(tokens[tokens.len() - 1])
415-
&& Regex::new(r"^[0-9]+$").unwrap().is_match(token) {
427+
&& self.constants.NUMBERS.is_match(token) {
416428
detokenized_text += token;
417429
prepend_space = " ".to_string();
418430
} else if (lang == "fr" || lang == "it" || lang == "ga") && i <= tokens.len() - 2
@@ -430,7 +442,7 @@ impl Tokenizer for BPETokenizer {
430442
prepend_space = "".to_string();
431443
} else if self.constants.IS_OPEN_QUOTE.is_match(token) {
432444
let mut normalized_quo = token;
433-
if Regex::new(r"^[„“”]+$").unwrap().is_match(token) {
445+
if self.constants.OPEN_QUOTES.is_match(token) {
434446
normalized_quo = &"\"";
435447
}
436448

@@ -445,7 +457,7 @@ impl Tokenizer for BPETokenizer {
445457
}
446458

447459
if quote_counts[normalized_quo] % 2 == 0 {
448-
if lang == "en" && *token == "'" && i > 0 && Regex::new(r"[s]$").unwrap().is_match(tokens[i - 1]) {
460+
if lang == "en" && *token == "'" && i > 0 && self.constants.S_END.is_match(tokens[i - 1]) {
449461
detokenized_text += token;
450462
prepend_space = " ".to_string();
451463
} else {
@@ -458,7 +470,7 @@ impl Tokenizer for BPETokenizer {
458470
prepend_space = " ".to_string();
459471
quote_counts.insert(normalized_quo, *quote_counts.get(normalized_quo).unwrap_or(&0) + 1);
460472
}
461-
} else if lang == "fi" && Regex::new(r"^:$").unwrap().is_match(tokens[i - 1]) && self.constants.FINNISH_REGEX.is_match(token) {
473+
} else if lang == "fi" && self.constants.COLON.is_match(tokens[i - 1]) && self.constants.FINNISH_REGEX.is_match(token) {
462474
detokenized_text += prepend_space.as_str();
463475
detokenized_text += token;
464476
prepend_space = " ".to_string();

0 commit comments

Comments
 (0)