-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: leorishdu <18771113323@163.com>
- Loading branch information
1 parent
5ac845e
commit 5536c53
Showing
7 changed files
with
233 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "exprs/gin_functions.h" | ||
|
||
#include <CLucene.h> | ||
#include <CLucene/analysis/LanguageBasedAnalyzer.h> | ||
|
||
#include <boost/locale/encoding_utf.hpp> | ||
|
||
#include "column/array_column.h" | ||
#include "column/column_viewer.h" | ||
#include "column/datum.h" | ||
#include "util/faststring.h" | ||
|
||
namespace starrocks { | ||
|
||
// Allocates the thread-local CLucene analyzer used by tokenize().
// The first argument must be a constant VARCHAR naming the tokenizer:
// "english" (whitespace/letter split), "standard" (Unicode-aware
// StandardAnalyzer) or "chinese" (CJK bigram analyzer).
// Ownership of the analyzer passes to the function state and is
// released in tokenize_close().
Status GinFunctions::tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto column = context->get_constant_column(0);
    // Fix: get_constant_column() returns null when the argument is not a
    // constant; fail gracefully instead of dereferencing a null column.
    if (column == nullptr) {
        return Status::InvalidArgument("tokenize method argument must be a constant string");
    }
    auto method_col = ColumnHelper::get_const_value<TYPE_VARCHAR>(column);
    std::string method = method_col.to_string();

    // Fix: initialize the pointer; never leave a raw pointer indeterminate.
    lucene::analysis::Analyzer* analyzer = nullptr;

    if (method == "english") {
        analyzer = _CLNEW lucene::analysis::SimpleAnalyzer();
    } else if (method == "standard") {
        analyzer = _CLNEW lucene::analysis::standard::StandardAnalyzer();
    } else if (method == "chinese") {
        auto* canalyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
        canalyzer->setLanguage(L"cjk"); // bigram tokenization for CJK text
        canalyzer->setStem(false);
        analyzer = canalyzer;
    } else {
        return Status::NotSupported("Unknown method " + method);
    }

    context->set_function_state(scope, analyzer);

    return Status::OK();
}
// Frees the thread-local analyzer allocated in tokenize_prepare().
// Safe to call when no analyzer was installed (deleting null is a no-op).
Status GinFunctions::tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }
    void* state = context->get_function_state(FunctionContext::THREAD_LOCAL);
    delete reinterpret_cast<lucene::analysis::Analyzer*>(state);
    return Status::OK();
}
|
||
// tokenize('<method>', text) -> ARRAY<VARCHAR>
// Splits every row of the input string column into tokens using the
// analyzer installed by tokenize_prepare(). NULL or empty inputs produce
// a NULL array row; zero-length tokens emitted by the analyzer are dropped.
StatusOr<ColumnPtr> GinFunctions::tokenize(FunctionContext* context, const starrocks::Columns& columns) {
    auto* analyzer =
            reinterpret_cast<lucene::analysis::Analyzer*>(context->get_function_state(FunctionContext::THREAD_LOCAL));

    if (columns.size() != 2) {
        return Status::InvalidArgument("Tokenize function only call by tokenize('<index_type>', str_column)");
    }

    ColumnViewer<TYPE_VARCHAR> value_viewer(columns[1]);
    const size_t num_rows = value_viewer.size();

    // Running end-offset into the flattened token column; entry i of
    // array_offsets is where row i's tokens begin.
    // Fix: uint32_t matches UInt32Column's element type (was a signed int
    // mixed with unsigned appends).
    uint32_t offset = 0;
    UInt32Column::Ptr array_offsets = UInt32Column::create();
    array_offsets->reserve(num_rows + 1);

    // Flattened token strings of all rows.
    BinaryColumn::Ptr array_binary_column = BinaryColumn::create();

    // Per-row null flags for the result array column.
    NullColumnPtr null_array = NullColumn::create();

    // Fix: size_t loop index — no signed/unsigned comparison with num_rows.
    for (size_t row = 0; row < num_rows; ++row) {
        array_offsets->append(offset);

        if (value_viewer.is_null(row) || value_viewer.value(row).empty()) {
            null_array->append(1);
            continue;
        }
        null_array->append(0);

        auto data = value_viewer.value(row);
        std::string slice_str(data.data, data.get_size());
        // CLucene works on wide characters; convert UTF-8 -> TCHAR.
        std::wstring wstr = boost::locale::conv::utf_to_utf<TCHAR>(slice_str);
        lucene::util::StringReader reader(wstr.c_str(), wstr.size(), /*deleteValue=*/false);
        // Reusable stream is owned by the analyzer; do not delete it here.
        auto stream = analyzer->reusableTokenStream(L"", &reader);
        lucene::analysis::Token token;
        while (stream->next(&token)) {
            if (token.termLength() != 0) {
                offset++;
                std::string str =
                        boost::locale::conv::utf_to_utf<char>(std::wstring(token.termBuffer(), token.termLength()));
                array_binary_column->append(Slice(std::move(str)));
            }
        }
    }
    array_offsets->append(offset);
    // Token elements themselves are never null, hence the all-zero null map.
    auto result_array = ArrayColumn::create(NullableColumn::create(array_binary_column, NullColumn::create(offset, 0)),
                                            array_offsets);
    return NullableColumn::create(result_array, null_array);
}
|
||
} // namespace starrocks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
|
||
#pragma once | ||
|
||
#include "exprs/builtin_functions.h" | ||
#include "exprs/function_helper.h" | ||
#include <CLucene.h> | ||
|
||
namespace starrocks { | ||
|
||
// Scalar functions backing the generalized inverted (GIN) index support:
// tokenize('<method>', text) splits a VARCHAR into an ARRAY<VARCHAR> of
// tokens using a CLucene analyzer ("english", "standard" or "chinese").
class GinFunctions {
public:
    // Builds the thread-local CLucene analyzer selected by the constant
    // first argument; stored in the function state for tokenize().
    static Status tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope);

    // Releases the analyzer allocated in tokenize_prepare().
    static Status tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope);

    // tokenize(method, text) -> ARRAY<VARCHAR>; NULL/empty input yields NULL.
    DEFINE_VECTORIZED_FN(tokenize);

};
|
||
} // namespace starrocks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
-- name: test_tokenize_function | ||
SELECT tokenize('english', 'Today is saturday'); | ||
-- result: | ||
["today","is","saturday"] | ||
-- !result | ||
SELECT tokenize('standard', 'Привет, мир'); | ||
-- result: | ||
["привет","мир"] | ||
-- !result | ||
SELECT tokenize('chinese', '中华人民共和国'); | ||
-- result: | ||
["中华","华人","人民","民共","共和","和国"] | ||
-- !result | ||
|
||
CREATE TABLE `t_tokenized_table` ( | ||
`id` bigint(20) NOT NULL COMMENT "", | ||
`english_text` varchar(255) NULL COMMENT "", | ||
`standard_text` varchar(255) NULL COMMENT "", | ||
`chinese_text` varchar(255) NULL COMMENT "" | ||
) ENGINE=OLAP | ||
DUPLICATE KEY(`id`) | ||
DISTRIBUTED BY HASH(`id`) BUCKETS 1 | ||
PROPERTIES ( | ||
"replication_num" = "1", | ||
"enable_persistent_index" = "false", | ||
"replicated_storage" = "false", | ||
"compression" = "LZ4" | ||
); | ||
-- result: | ||
-- !result | ||
|
||
INSERT INTO t_tokenized_table VALUES | ||
(1, 'hello', 'Привет', '你好'), | ||
(2, 'hello world', 'Привет, мир', '你好世界'), | ||
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上'); | ||
-- result: | ||
-- !result | ||
|
||
select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id; | ||
-- result: | ||
1 ["hello"] ["привет"] ["你好"] | ||
2 ["hello","world"] ["привет","мир"] ["你好","好世","世界"] | ||
3 ["shanghai","tap","water","comes","from","the","sea"] ["водопроводная","вода","в","шанхае","поступает","из","моря"] ["上海","海自","自来","来水","水来","来自","自海","海上"] | ||
-- !result | ||
|
||
DROP TABLE t_tokenized_table; | ||
-- result: | ||
-- !result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
-- name: test_tokenize_function | ||
SELECT tokenize('english', 'Today is saturday'); | ||
SELECT tokenize('standard', 'hello world'); | ||
SELECT tokenize('chinese', '中华人民共和国'); | ||
|
||
CREATE TABLE `t_tokenized_table` ( | ||
`id` bigint(20) NOT NULL COMMENT "", | ||
`english_text` varchar(255) NULL COMMENT "", | ||
`standard_text` varchar(255) NULL COMMENT "", | ||
`chinese_text` varchar(255) NULL COMMENT "" | ||
) ENGINE=OLAP | ||
DUPLICATE KEY(`id`) | ||
DISTRIBUTED BY HASH(`id`) BUCKETS 1 | ||
PROPERTIES ( | ||
"replication_num" = "1", | ||
"enable_persistent_index" = "false", | ||
"replicated_storage" = "false", | ||
"compression" = "LZ4" | ||
); | ||
|
||
INSERT INTO t_tokenized_table VALUES | ||
(1, 'hello', 'Привет', '你好'), | ||
(2, 'hello world', 'Привет, мир', '你好世界'), | ||
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上'); | ||
|
||
select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id; | ||
|
||
DROP TABLE t_tokenized_table; |