Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Optimize the performance for regexp_replace #16356

Merged
merged 3 commits into from
Jan 10, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 4 additions & 2 deletions be/src/exprs/string_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2818,6 +2818,7 @@ static ColumnPtr regexp_replace_const(re2::RE2* const_re, const Columns& columns

auto size = columns[0]->size();
ColumnBuilder<TYPE_VARCHAR> result(size);
std::string result_str;
for (int row = 0; row < size; ++row) {
if (str_viewer.is_null(row) || rpl_viewer.is_null(row)) {
result.append_null();
Expand All @@ -2827,8 +2828,9 @@ static ColumnPtr regexp_replace_const(re2::RE2* const_re, const Columns& columns
auto rpl_value = rpl_viewer.value(row);
re2::StringPiece rpl_str = re2::StringPiece(rpl_value.get_data(), rpl_value.get_size());
auto str_value = str_viewer.value(row);
std::string result_str(str_value.get_data(), str_value.get_size());
re2::RE2::GlobalReplace(&result_str, *const_re, rpl_str);
re2::StringPiece str_str = re2::StringPiece(str_value.get_data(), str_value.get_size());
result_str.clear();
re2::RE2::GlobalReplace(str_str, *const_re, rpl_str, result_str);
result.append(Slice(result_str.data(), result_str.size()));
}

Expand Down
2 changes: 1 addition & 1 deletion thirdparty/download-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ echo "Finished patching $GLOG_SOURCE"
# re2 patch
cd $TP_SOURCE_DIR/$RE2_SOURCE
if [ ! -f $PATCHED_MARK ]; then
patch -p0 < $TP_PATCH_DIR/re2-2017-05-01.patch
patch -p1 < $TP_PATCH_DIR/re2-2022-12-01.patch
touch $PATCHED_MARK
fi
cd -
Expand Down
100 changes: 100 additions & 0 deletions thirdparty/patches/re2-2022-12-01.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
commit 6bfac7c766bddb2ac9eda2c2acd98009b9da95bd
Author: stdpain <drfeng08@gmail.com>
Date: Mon Jan 9 10:00:35 2023 +0800

add an interface to reuse memory for GlobalReplace

diff --git a/re2/re2.cc b/re2/re2.cc
index b24c6d6..9d4969e 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -461,6 +461,30 @@ bool RE2::Replace(std::string* str,
int RE2::GlobalReplace(std::string* str,
const RE2& re,
const StringPiece& rewrite) {
+ std::string out;
+ int count = _GlobalReplace(*str, re, rewrite, out);
+ if (count > 0) {
+ using std::swap;
+ swap(out, *str);
+ }
+ return count;
+}
+
+int RE2::GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out) {
+ int count = _GlobalReplace(str, re, rewrite, out);
+ if (count == 0) {
+ out.append(str.data(), str.size());
+ }
+ return count;
+}
+
+int RE2::_GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
@@ -468,17 +492,16 @@ int RE2::GlobalReplace(std::string* str,
if (nvec > static_cast<int>(arraysize(vec)))
return false;

- const char* p = str->data();
- const char* ep = p + str->size();
+ const char* p = str.data();
+ const char* ep = p + str.size();
const char* lastend = NULL;
- std::string out;
int count = 0;
while (p <= ep) {
if (maximum_global_replace_count != -1 &&
count >= maximum_global_replace_count)
break;
- if (!re.Match(*str, static_cast<size_t>(p - str->data()),
- str->size(), UNANCHORED, vec, nvec))
+ if (!re.Match(str, static_cast<size_t>(p - str.data()),
+ str.size(), UNANCHORED, vec, nvec))
break;
if (p < vec[0].data())
out.append(p, vec[0].data() - p);
@@ -523,8 +546,6 @@ int RE2::GlobalReplace(std::string* str,

if (p < ep)
out.append(p, ep - p);
- using std::swap;
- swap(out, *str);
return count;
}

diff --git a/re2/re2.h b/re2/re2.h
index 1d82518..7da0922 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -485,6 +485,11 @@ class RE2 {
const RE2& re,
const StringPiece& rewrite);

+ static int GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out);
+
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
@@ -767,6 +772,11 @@ class RE2 {

re2::Prog* ReverseProg() const;

+ static int _GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out);
+
// First cache line is relatively cold fields.
const std::string* pattern_; // string regular expression
Options options_; // option flags
8 changes: 4 additions & 4 deletions thirdparty/vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,10 @@ CURL_SOURCE=curl-7.79.0
CURL_MD5SUM="b40e4dc4bbc9e109c330556cd58c8ec8"

# RE2
RE2_DOWNLOAD="https://github.com/google/re2/archive/2017-05-01.tar.gz"
RE2_NAME=re2-2017-05-01.tar.gz
RE2_SOURCE=re2-2017-05-01
RE2_MD5SUM="4aa65a0b22edacb7ddcd7e4aec038dcf"
RE2_DOWNLOAD="https://github.com/google/re2/archive/refs/tags/2022-12-01.tar.gz"
RE2_NAME=re2-2022-12-01.tar.gz
RE2_SOURCE=re2-2022-12-01
RE2_MD5SUM="f25d7b06a3e7747ecbb2f12d48be61cd"

# boost
BOOST_DOWNLOAD="https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz"
Expand Down