From 0f6c218bbb1394d0a3a24a547583212ccb7514b8 Mon Sep 17 00:00:00 2001 From: Peter Xu
Date: Wed, 21 Sep 2022 19:13:37 +0800 Subject: [PATCH 1/2] add utf8 string support for aho corasick --- src/string/aho_corasick.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/string/aho_corasick.rs b/src/string/aho_corasick.rs index e1d5759c491..09a8919ee06 100644 --- a/src/string/aho_corasick.rs +++ b/src/string/aho_corasick.rs @@ -64,7 +64,8 @@ impl AhoCorasick { pub fn search<'a>(&self, s: &'a str) -> Vec<&'a str> { let mut ans = vec![]; let mut cur = Rc::clone(&self.root); - for (i, c) in s.chars().enumerate() { + let mut position: usize = 0; + for (_, c) in s.chars().enumerate() { loop { if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) { cur = Rc::clone(child); @@ -76,8 +77,9 @@ impl AhoCorasick { None => break, } } + position += c.len_utf8(); for &len in &cur.borrow().lengths { - ans.push(&s[i + 1 - len..=i]); + ans.push(&s[position - len..position]); } } ans @@ -95,4 +97,12 @@ mod tests { let res = ac.search("ababcxyzacxy12678acxy6543"); assert_eq!(res, ["abc", "xyz", "acxy", "678", "acxy", "6543",]); } + + #[test] + fn test_aho_corasick_with_utf8() { + let dict = ["abc", "中文", "abc中", "abcd", "xyz", "acxy", "efg", "123", "678", "6543", "ハンバーガー"]; + let ac = AhoCorasick::new(&dict); + let res = ac.search("ababc中xyzacxy12678acxyハンバーガー6543中文"); + assert_eq!(res, ["abc", "abc中", "xyz", "acxy", "678", "acxy", "ハンバーガー", "6543", "中文"]); + } } From 629eb8c44887914bd0849d78d82b3cad9a25bab8 Mon Sep 17 00:00:00 2001 From: Peter Xu
Date: Thu, 22 Sep 2022 19:21:48 +0800 Subject: [PATCH 2/2] fix the cargo fmt build check --- src/string/aho_corasick.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/string/aho_corasick.rs b/src/string/aho_corasick.rs index 09a8919ee06..901d822ddd7 100644 --- a/src/string/aho_corasick.rs +++ b/src/string/aho_corasick.rs @@ -100,9 +100,34 @@ mod tests { #[test] fn test_aho_corasick_with_utf8() { - let dict = ["abc", "中文", "abc中", "abcd", "xyz", "acxy", "efg", "123", "678", "6543", "ハンバーガー"]; + let dict = [ + "abc", + "中文", + "abc中", + "abcd", + "xyz", + "acxy", + "efg", + "123", + "678", + "6543", + "ハンバーガー", + ]; let ac = AhoCorasick::new(&dict); let res = ac.search("ababc中xyzacxy12678acxyハンバーガー6543中文"); - assert_eq!(res, ["abc", "abc中", "xyz", "acxy", "678", "acxy", "ハンバーガー", "6543", "中文"]); + assert_eq!( + res, + [ + "abc", + "abc中", + "xyz", + "acxy", + "678", + "acxy", + "ハンバーガー", + "6543", + "中文" + ] + ); } }