@@ -72,6 +72,12 @@ pub struct BPEConstants {
7272 IS_PUNCT : Regex ,
7373 IS_OPEN_QUOTE : Regex ,
7474
75+ SYMBOLS : Regex ,
76+ NUMBERS : Regex ,
77+ S_END : Regex ,
78+ COLON : Regex ,
79+ OPEN_QUOTES : Regex ,
80+
7581 CJK_RANGES : Vec < ( usize , usize ) > ,
7682}
7783
@@ -159,6 +165,12 @@ impl Default for BPEConstants {
159165 r#"^['"„“`]+$"#
160166 ) . unwrap ( ) ,
161167
168+ SYMBOLS : Regex :: new ( r"^[?!:;\\%]$" ) . unwrap ( ) ,
169+ NUMBERS : Regex :: new ( r"^[0-9]+$" ) . unwrap ( ) ,
170+ S_END : Regex :: new ( r"s$" ) . unwrap ( ) ,
171+ COLON : Regex :: new ( r"^:$" ) . unwrap ( ) ,
172+ OPEN_QUOTES : Regex :: new ( r"^[„“”]+$" ) . unwrap ( ) ,
173+
162174 CJK_RANGES : vec ! [
163175 ( 4352 , 4607 ) ,
164176 ( 11904 , 42191 ) ,
@@ -400,7 +412,7 @@ impl Tokenizer for BPETokenizer {
400412 detokenized_text += token;
401413 prepend_space = "" . to_string ( ) ;
402414 } else if self . constants . IS_PUNCT . is_match ( token) {
403- if lang == "fr" && Regex :: new ( r"^[?!:;\\%]$" ) . unwrap ( ) . is_match ( token) {
415+ if lang == "fr" && self . constants . SYMBOLS . is_match ( token) {
404416 detokenized_text += " " ;
405417 }
406418
@@ -410,9 +422,9 @@ impl Tokenizer for BPETokenizer {
410422 detokenized_text += token;
411423 prepend_space = " " . to_string ( ) ;
412424 } else if lang == "cs" && i > 1
413- && Regex :: new ( r"^[0-9]+$" ) . unwrap ( ) . is_match ( tokens[ tokens. len ( ) - 2 ] )
425+ && self . constants . NUMBERS . is_match ( tokens[ tokens. len ( ) - 2 ] )
414426 && Regex :: new ( r"^[.,]+$" ) . unwrap ( ) . is_match ( tokens[ tokens. len ( ) - 1 ] )
415- && Regex :: new ( r"^[0-9]+$" ) . unwrap ( ) . is_match ( token) {
427+ && self . constants . NUMBERS . is_match ( token) {
416428 detokenized_text += token;
417429 prepend_space = " " . to_string ( ) ;
418430 } else if ( lang == "fr" || lang == "it" || lang == "ga" ) && i <= tokens. len ( ) - 2
@@ -430,7 +442,7 @@ impl Tokenizer for BPETokenizer {
430442 prepend_space = "" . to_string ( ) ;
431443 } else if self . constants . IS_OPEN_QUOTE . is_match ( token) {
432444 let mut normalized_quo = token;
433- if Regex :: new ( r"^[„“”]+$" ) . unwrap ( ) . is_match ( token) {
445+ if self . constants . OPEN_QUOTES . is_match ( token) {
434446 normalized_quo = & "\" " ;
435447 }
436448
@@ -445,7 +457,7 @@ impl Tokenizer for BPETokenizer {
445457 }
446458
447459 if quote_counts[ normalized_quo] % 2 == 0 {
448- if lang == "en" && * token == "'" && i > 0 && Regex :: new ( r"[s]$" ) . unwrap ( ) . is_match ( tokens[ i - 1 ] ) {
460+ if lang == "en" && * token == "'" && i > 0 && self . constants . S_END . is_match ( tokens[ i - 1 ] ) {
449461 detokenized_text += token;
450462 prepend_space = " " . to_string ( ) ;
451463 } else {
@@ -458,7 +470,7 @@ impl Tokenizer for BPETokenizer {
458470 prepend_space = " " . to_string ( ) ;
459471 quote_counts. insert ( normalized_quo, * quote_counts. get ( normalized_quo) . unwrap_or ( & 0 ) + 1 ) ;
460472 }
461- } else if lang == "fi" && Regex :: new ( r"^:$" ) . unwrap ( ) . is_match ( tokens[ i - 1 ] ) && self . constants . FINNISH_REGEX . is_match ( token) {
473+ } else if lang == "fi" && self . constants . COLON . is_match ( tokens[ i - 1 ] ) && self . constants . FINNISH_REGEX . is_match ( token) {
462474 detokenized_text += prepend_space. as_str ( ) ;
463475 detokenized_text += token;
464476 prepend_space = " " . to_string ( ) ;
0 commit comments