Skip to content

Commit

Permalink
[cherry_pick](like) pick doris for regexp and like 17953 18351 (apach…
Browse files Browse the repository at this point in the history
…e#1586)

* [Bug][Fix] Fix regexp function core dump (DCHECK failure) and incorrect result (apache#17953)

CREATE TABLE `test` (
`name` varchar(64) NULL,
`age` int(11) NULL
) ENGINE=OLAP
DUPLICATE KEY(`name`)
COMMENT 'OLAP'
DISTRIBUTED BY HASH(`name`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"in_memory" = "false",
"storage_format" = "V2",
"disable_auto_compaction" = "false"
);
insert into `test` values ("lemon",1),("tom",2);

select a.name regexp concat('^', a.name) from test a;

* [fix](like) fix wrong result of like pattern with backslash (apache#18351)

---------

Co-authored-by: HappenLee <happenlee@hotmail.com>
Co-authored-by: TengJianPing <18241664+jacktengg@users.noreply.github.com>
  • Loading branch information
3 people authored and gavinchou committed Apr 4, 2023
1 parent bffb01f commit ec5a362
Show file tree
Hide file tree
Showing 58 changed files with 424 additions and 16 deletions.
109 changes: 93 additions & 16 deletions be/src/vec/functions/like.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");

// Like patterns
static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)");
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)");
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\%)|(\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)");
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)");
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");

Status LikeSearchState::clone(LikeSearchState& cloned) {
cloned.escape_char = escape_char;
Expand Down Expand Up @@ -211,11 +211,11 @@ Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const

Status FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRef& val,
const StringValue& pattern, unsigned char* result) {
std::string_view re_pattern(pattern.ptr, pattern.len);
std::string re_pattern(pattern.ptr, pattern.len);

hs_database_t* database = nullptr;
hs_scratch_t* scratch = nullptr;
RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.data(), &database, &scratch));
RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));

auto ret = hs_scan(database, val.data, val.size, 0, scratch, state->hs_match_handler,
(void*)result);
Expand Down Expand Up @@ -248,11 +248,11 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS

Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const ColumnString& val,
const StringValue& pattern, ColumnUInt8::Container& result) {
std::string_view re_pattern(pattern.ptr, pattern.len);
std::string re_pattern(pattern.ptr, pattern.len);

hs_database_t* database = nullptr;
hs_scratch_t* scratch = nullptr;
RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.data(), &database, &scratch));
RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));

auto sz = val.size();
for (size_t i = 0; i < sz; i++) {
Expand Down Expand Up @@ -293,11 +293,11 @@ Status FunctionLikeBase::regexp_fn_predicate(LikeSearchState* state,
const StringValue& pattern,
ColumnUInt8::Container& result, uint16_t* sel,
size_t sz) {
std::string_view re_pattern(pattern.ptr, pattern.len);
std::string re_pattern(pattern.ptr, pattern.len);

hs_database_t* database = nullptr;
hs_scratch_t* scratch = nullptr;
RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.data(), &database, &scratch));
RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));

auto data_ptr = reinterpret_cast<const StringRef*>(val.get_data().data());
for (size_t i = 0; i < sz; i++) {
Expand Down Expand Up @@ -368,10 +368,13 @@ Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
const auto pattern_col = block.get_by_position(arguments[1]).column;

if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
DCHECK_EQ(str_patterns->size(), 1);
const auto& pattern_val = str_patterns->get_data_at(0);
RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
&state->search_state));
for (int i = 0; i < input_rows_count; i++) {
const auto pattern_val = str_patterns->get_data_at(i);
const auto value_val = values->get_data_at(i);
(state->scalar_function)(
const_cast<vectorized::LikeSearchState*>(&state->search_state), value_val,
pattern_val, &vec_res[i]);
}
} else if (const auto* const_patterns =
check_and_get_column<ColumnConst>(pattern_col.get())) {
const auto& pattern_val = const_patterns->get_data_at(0);
Expand Down Expand Up @@ -507,7 +510,7 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
}

// add $ to pattern tail to match line tail
if (pattern.size() > 0 && pattern[pattern.size() - 1] != '%') {
if (pattern.size() > 0 && re_pattern->back() != '*') {
re_pattern->append("$");
}
}
Expand All @@ -518,7 +521,8 @@ void FunctionLike::remove_escape_character(std::string* search_string) {
int len = tmp_search_string.length();
for (int i = 0; i < len;) {
if (tmp_search_string[i] == '\\' && i + 1 < len &&
(tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_')) {
(tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
tmp_search_string[i + 1] == '\\')) {
search_string->append(1, tmp_search_string[i + 1]);
i += 2;
} else {
Expand All @@ -528,6 +532,38 @@ void FunctionLike::remove_escape_character(std::string* search_string) {
}
}

bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
if (!re.ok()) {
return false;
}

std::vector<RE2::Arg> arguments;
std::vector<RE2::Arg*> arguments_ptrs;
std::size_t args_count = re.NumberOfCapturingGroups();
arguments.resize(args_count);
arguments_ptrs.resize(args_count);
results.resize(args_count);
for (std::size_t i = 0; i < args_count; ++i) {
arguments[i] = &results[i];
arguments_ptrs[i] = &arguments[i];
}

return RE2::FullMatchN(str, re, arguments_ptrs.data(), args_count);
}

// Debug helper: logs `str` and the pattern `re` (labelled with `pattern_name`)
// together with their sizes, then tries a full match via re2_full_match().
// On a match, every captured group is logged with its index and size;
// otherwise "no match" is logged. All output goes through VLOG_DEBUG.
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
    std::vector<std::string> results;
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
               << ": " << re.pattern() << ", size: " << re.pattern().size();
    if (!re2_full_match(str, re, results)) {
        VLOG_DEBUG << "no match";
        return;
    }
    // Use an unsigned index so the bound check against results.size() does not
    // mix signed and unsigned operands.
    for (std::size_t i = 0; i < results.size(); ++i) {
        VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
    }
}

Status FunctionLike::prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope != FunctionContext::THREAD_LOCAL) {
return Status::OK();
Expand All @@ -545,32 +581,73 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function
state->search_state.pattern_str = pattern_str;
std::string search_string;
if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
if (VLOG_DEBUG_IS_ON) {
verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
VLOG_DEBUG << "search_string : " << search_string
<< ", size: " << search_string.size();
}
remove_escape_character(&search_string);
if (VLOG_DEBUG_IS_ON) {
VLOG_DEBUG << "search_string escape removed: " << search_string
<< ", size: " << search_string.size();
}
state->search_state.set_search_string(search_string);
state->function = constant_equals_fn;
state->predicate_like_function = constant_equals_fn_predicate;
state->scalar_function = constant_equals_fn_scalar;
} else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
if (VLOG_DEBUG_IS_ON) {
verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
VLOG_DEBUG << "search_string : " << search_string
<< ", size: " << search_string.size();
}
remove_escape_character(&search_string);
if (VLOG_DEBUG_IS_ON) {
VLOG_DEBUG << "search_string escape removed: " << search_string
<< ", size: " << search_string.size();
}
state->search_state.set_search_string(search_string);
state->function = constant_starts_with_fn;
state->predicate_like_function = constant_starts_with_fn_predicate;
state->scalar_function = constant_starts_with_fn_scalar;
} else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
if (VLOG_DEBUG_IS_ON) {
verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
VLOG_DEBUG << "search_string : " << search_string
<< ", size: " << search_string.size();
}
remove_escape_character(&search_string);
if (VLOG_DEBUG_IS_ON) {
VLOG_DEBUG << "search_string escape removed: " << search_string
<< ", size: " << search_string.size();
}
state->search_state.set_search_string(search_string);
state->function = constant_ends_with_fn;
state->predicate_like_function = constant_ends_with_fn_predicate;
state->scalar_function = constant_ends_with_fn_scalar;
} else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
if (VLOG_DEBUG_IS_ON) {
verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
VLOG_DEBUG << "search_string : " << search_string
<< ", size: " << search_string.size();
}
remove_escape_character(&search_string);
if (VLOG_DEBUG_IS_ON) {
VLOG_DEBUG << "search_string escape removed: " << search_string
<< ", size: " << search_string.size();
}
state->search_state.set_search_string(search_string);
state->function = constant_substring_fn;
state->predicate_like_function = constant_substring_fn_predicate;
state->scalar_function = constant_substring_fn_scalar;
} else {
std::string re_pattern;
convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
if (VLOG_DEBUG_IS_ON) {
VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
<< ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
<< ", size: " << re_pattern.size();
}

hs_database_t* database = nullptr;
hs_scratch_t* scratch = nullptr;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
%test
te%st
test%
%test%
%te%s%
%tes\%
\test
\\test
test\test
test\
test\\
\test\
\tes\t\
test\\test
_test
te_st
test_
_test_
_te_st_
tes*t
tes?t
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !analytic_query --
4627 1830 257.82 4627 4627 1830 4627 257.82 257.82 5.7.99 60
4535 1201 218.04 4535 4535 1201 4535 218.04 218.04 5.7.99 76
2581 1971 931.05 2581 2581 1971 2581 931.05 931.05 5.7.99 4
3554 317 332.0 3554 3554 317 3554 332.0 332.0 5.7.99 18
5517 1658 968.84 5517 5517 1658 5517 968.84 968.84 5.7.99 82
8761 1053 106.7 8761 8761 1053 8761 106.7 106.7 5.7.99 24
2972 57 123.11 2972 2972 57 2972 123.11 123.11 5.7.99 58
717 254 564.77 717 717 254 717 564.77 564.77 5.7.99 82
1674 705 102.28 1674 1674 705 1674 102.28 102.28 5.7.99 38
796 971 318.79 796 796 971 796 318.79 318.79 5.7.99 72
6707 1634 372.29 6707 6707 1634 6707 372.29 372.29 5.7.99 35
4049 1872 447.75 4049 4049 1872 4049 447.75 447.75 5.7.99 73
8067 1376 912.31 8067 8067 1376 8067 912.31 912.31 5.7.99 53
2313 1214 438.22 2313 2313 1214 2313 438.22 438.22 5.7.99 52
5327 73 108.96 5327 5327 73 5327 108.96 108.96 5.7.99 24
5500 1821 882.5 5500 5500 1821 5500 882.5 882.5 5.7.99 51
1798 433 128.56 1798 1798 433 1798 128.56 128.56 5.7.99 92
5475 39 532.26 5475 5475 39 5475 532.26 532.26 5.7.99 90
6054 1606 486.42 6054 6054 1606 6054 486.42 486.42 5.7.99 89
6875 1082 826.47 6875 6875 1082 6875 826.47 826.47 5.7.99 53
3180 1873 629.18 3180 3180 1873 3180 629.18 629.18 5.7.99 17
3391 1793 588.68 3391 3391 1793 3391 588.68 588.68 5.7.99 78
517 964 517.38 517 517 964 517 517.38 517.38 5.7.99 65
4580 97 761.41 4580 4580 97 4580 761.41 761.41 5.7.99 73
8071 1390 612.13 8071 8071 1390 8071 612.13 612.13 5.7.99 91
7236 736 872.53 7236 7236 736 7236 872.53 872.53 5.7.99 37
3126 1403 173.62 3126 3126 1403 3126 173.62 173.62 5.7.99 21
9755 1053 189.42 9755 9755 1053 9755 189.42 189.42 5.7.99 54
9702 26 821.89 9702 9702 26 9702 821.89 821.89 5.7.99 2
2356 1188 841.79 2356 2356 1188 2356 841.79 841.79 5.7.99 89
626 747 91.87 626 626 747 626 91.87 91.87 5.7.99 80
1285 1518 335.15 1285 1285 1518 1285 335.15 335.15 5.7.99 99
8316 959 435.36 8316 8316 959 8316 435.36 435.36 5.7.99 62
2338 1228 658.94 2338 2338 1228 2338 658.94 658.94 5.7.99 29
8112 1860 215.52 8112 8112 1860 8112 215.52 215.52 5.7.99 4
8915 1134 839.18 8915 8915 1134 8915 839.18 839.18 5.7.99 71
1303 772 942.93 1303 1303 772 1303 942.93 942.93 5.7.99 5
3238 648 69.51 3238 3238 648 3238 69.51 69.51 5.7.99 80
2205 850 219.21 2205 2205 850 2205 219.21 219.21 5.7.99 17
1064 1589 439.88 1064 1064 1589 1064 439.88 439.88 5.7.99 30
3224 1958 829.21 3224 3224 1958 3224 829.21 829.21 5.7.99 3
7645 252 275.23 7645 7645 252 7645 275.23 275.23 5.7.99 34
1550 999 136.14 1550 1550 999 1550 136.14 136.14 5.7.99 2
3896 1292 924.38 3896 3896 1292 3896 924.38 924.38 5.7.99 4
7278 972 967.71 7278 7278 972 7278 967.71 967.71 5.7.99 75
753 1365 871.48 753 753 1365 753 871.48 871.48 5.7.99 80
5985 663 332.39 5985 5985 663 5985 332.39 332.39 5.7.99 57
7347 1747 461.67 7347 7347 1747 7347 461.67 461.67 5.7.99 90
352 536 143.93 352 352 536 352 143.93 143.93 5.7.99 97
2261 1794 157.22 2261 2261 1794 2261 157.22 157.22 5.7.99 95
3793 503 68.79 3793 3793 503 3793 68.79 68.79 5.7.99 34
282 1423 800.36 282 282 1423 282 800.36 800.36 5.7.99 41
1567 1809 991.6 1567 1567 1809 1567 991.6 991.6 5.7.99 39
7407 922 457.36 7407 7407 922 7407 457.36 457.36 5.7.99 91
5827 1159 645.53 5827 5827 1159 5827 645.53 645.53 5.7.99 68
5472 1602 547.06 5472 5472 1602 5472 547.06 547.06 5.7.99 85
6971 723 108.21 6971 6971 723 6971 108.21 108.21 5.7.99 20
6443 598 265.4 6443 6443 598 6443 265.4 265.4 5.7.99 99
3242 1392 673.51 3242 3242 1392 3242 673.51 673.51 5.7.99 7
2113 1953 148.78 2113 2113 1953 2113 148.78 148.78 5.7.99 86
7244 951 709.23 7244 7244 951 7244 709.23 709.23 5.7.99 52
2894 1367 951.21 2894 2894 1367 2894 951.21 951.21 5.7.99 82
9489 1873 405.31 9489 9489 1873 9489 405.31 405.31 5.7.99 3
2708 1566 571.65 2708 2708 1566 2708 571.65 571.65 5.7.99 87
1019 122 451.29 1019 1019 122 1019 451.29 451.29 5.7.99 23
1821 1906 386.55 1821 1821 1906 1821 386.55 386.55 5.7.99 39
5058 1712 338.6 5058 5058 1712 5058 338.6 338.6 5.7.99 13
1695 1776 138.16 1695 1695 1776 1695 138.16 138.16 5.7.99 77
3058 343 18.48 3058 3058 343 3058 18.48 18.48 5.7.99 44
6105 516 600.29 6105 6105 516 6105 600.29 600.29 5.7.99 17
7441 289 519.98 7441 7441 289 7441 519.98 519.98 5.7.99 71
341 889 931.12 341 341 889 341 931.12 931.12 5.7.99 23
1168 1301 624.73 1168 1168 1301 1168 624.73 624.73 5.7.99 16
2307 119 473.64 2307 2307 119 2307 473.64 473.64 5.7.99 20
4692 113 141.48 4692 4692 113 4692 141.48 141.48 5.7.99 92
1726 1111 22.72 1726 1726 1111 1726 22.72 22.72 5.7.99 84

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_1 --
%te%s%
%tes\\%
%test%
test%

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_2 --
%te%s%
%tes\\%
%test%
test%

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_3 --
%tes\\%
\\\\test
\\tes\\t\\
\\test
\\test\\
test\\
test\\\\
test\\\\test
test\\test

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_4 --
%tes\\%
\\\\test
\\tes\\t\\
\\test
\\test\\
test\\
test\\\\
test\\\\test
test\\test

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_5 --
%tes\\%

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_6 --
%tes\\%

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_7 --
\\\\test
test\\\\
test\\\\test

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_8 --
\\\\test
test\\\\
test\\\\test

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_9 --

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_begin_1 --

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_begin_2 --

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !like_backslash_begin_3 --
\\\\test
\\tes\\t\\
\\test
\\test\\

Loading

0 comments on commit ec5a362

Please sign in to comment.