[Enhancement] Support tokenize function
Signed-off-by: leorishdu <18771113323@163.com>
dujijun007 committed May 6, 2024
1 parent 5ac845e commit 49e7d23
Showing 7 changed files with 232 additions and 1 deletion.
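In short: this commit adds a scalar builtin tokenize('<analyzer>', <varchar>) returning ARRAY<VARCHAR>, where the analyzer is one of 'english' (CLucene SimpleAnalyzer), 'standard' (StandardAnalyzer), or 'chinese' (CLucene's CJK analyzer). See the regression tests at the end of the diff for usage.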
1 change: 1 addition & 0 deletions be/src/exprs/CMakeLists.txt
@@ -105,6 +105,7 @@ set(EXPR_FILES
dictionary_get_expr.cpp
ngram.cpp
match_expr.cpp
gin_functions.cpp
)

add_library(Exprs ${EXPR_FILES})
116 changes: 116 additions & 0 deletions be/src/exprs/gin_functions.cpp
@@ -0,0 +1,116 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "exprs/gin_functions.h"

#include <CLucene.h>
#include <CLucene/analysis/LanguageBasedAnalyzer.h>

#include <boost/locale/encoding_utf.hpp>

#include "column/array_column.h"
#include "column/column_viewer.h"
#include "column/datum.h"
#include "util/faststring.h"

namespace starrocks {

Status GinFunctions::tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope != FunctionContext::THREAD_LOCAL) {
return Status::OK();
}

auto column = context->get_constant_column(0);
auto method_col = ColumnHelper::get_const_value<TYPE_VARCHAR>(column);
std::string method = method_col.to_string();

lucene::analysis::Analyzer* analyzer;

if (method == "english") {
analyzer = _CLNEW lucene::analysis::SimpleAnalyzer();
} else if (method == "standard") {
analyzer = _CLNEW lucene::analysis::standard::StandardAnalyzer();
} else if (method == "chinese") {
auto* canalyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
canalyzer->setLanguage(L"cjk");
canalyzer->setStem(false);
analyzer = canalyzer;
} else {
return Status::NotSupported("Unknown method " + method);
}

context->set_function_state(scope, analyzer);

return Status::OK();
}

Status GinFunctions::tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope == FunctionContext::THREAD_LOCAL) {
auto* analyzer = reinterpret_cast<lucene::analysis::Analyzer*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
delete analyzer;
}
return Status::OK();
}

StatusOr<ColumnPtr> GinFunctions::tokenize(FunctionContext* context, const starrocks::Columns& columns) {
auto* analyzer =
reinterpret_cast<lucene::analysis::Analyzer*>(context->get_function_state(FunctionContext::THREAD_LOCAL));

if (columns.size() != 2) {
return Status::InvalidArgument("tokenize must be called as tokenize('<method>', str_column)");
}

ColumnViewer<TYPE_VARCHAR> value_viewer(columns[1]);
size_t num_rows = value_viewer.size();

// Array offsets: one entry per row, plus a final sentinel appended after the loop
uint32_t offset = 0;
UInt32Column::Ptr array_offsets = UInt32Column::create();
array_offsets->reserve(num_rows + 1);

// Flat binary column holding every token across all rows
BinaryColumn::Ptr array_binary_column = BinaryColumn::create();

NullColumnPtr null_array = NullColumn::create();

for (size_t row = 0; row < num_rows; ++row) {
array_offsets->append(offset);

if (value_viewer.is_null(row) || value_viewer.value(row).empty()) {
null_array->append(1);
} else {
null_array->append(0);
auto data = value_viewer.value(row);
std::string slice_str(data.data, data.get_size());
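// CLucene consumes wide (TCHAR) text, so convert the UTF-8 slice first.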
std::wstring wstr = boost::locale::conv::utf_to_utf<TCHAR>(slice_str);
lucene::util::StringReader reader(wstr.c_str(), wstr.size(), false);
auto stream = analyzer->reusableTokenStream(L"", &reader);
lucene::analysis::Token token;
while (stream->next(&token)) {
if (token.termLength() != 0) {
offset++;
std::string str =
boost::locale::conv::utf_to_utf<char>(std::wstring(token.termBuffer(), token.termLength()));
array_binary_column->append(Slice(std::move(str)));
}
}
}
}
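// Final sentinel: offsets ends up with num_rows + 1 entries.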
array_offsets->append(offset);
auto result_array = ArrayColumn::create(NullableColumn::create(array_binary_column, NullColumn::create(offset, 0)),
array_offsets);
return NullableColumn::create(result_array, null_array);
}

} // namespace starrocks
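
Aside: the ArrayColumn built above uses an offset encoding — row i of the result is the half-open slice elements[offsets[i], offsets[i+1]) of the flat binary column. A minimal standalone C++ sketch of that layout (std:: types only, illustrative data, not engine code):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Flat token storage plus per-row offsets, mirroring what tokenize() builds.
    // Inputs: row 0 = "hello world", row 1 = NULL, row 2 = "foo bar baz".
    std::vector<std::string> elements = {"hello", "world", "foo", "bar", "baz"};
    std::vector<uint32_t> offsets = {0, 2, 2, 5}; // num_rows + 1 entries
    std::vector<bool> row_is_null = {false, true, false};

    for (size_t row = 0; row + 1 < offsets.size(); ++row) {
        if (row_is_null[row]) {
            std::cout << "row " << row << ": NULL\n";
            continue;
        }
        std::cout << "row " << row << ": [";
        for (uint32_t i = offsets[row]; i < offsets[row + 1]; ++i) {
            std::cout << (i == offsets[row] ? "" : ",") << '"' << elements[i] << '"';
        }
        std::cout << "]\n";
    }
    return 0;
}

Running it prints row 0: ["hello","world"], row 1: NULL, row 2: ["foo","bar","baz"] — the same shape the SQL results below show.
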
34 changes: 34 additions & 0 deletions be/src/exprs/gin_functions.h
@@ -0,0 +1,34 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <CLucene.h>

#include "exprs/builtin_functions.h"
#include "exprs/function_helper.h"

namespace starrocks {

class GinFunctions {
public:
// tokenize('<analyzer>', <varchar>) -> ARRAY<VARCHAR>; prepare/close manage the per-thread CLucene analyzer
static Status tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope);

static Status tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope);

DEFINE_VECTORIZED_FN(tokenize);
};

} // namespace starrocks
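
Lifecycle note: tokenize_prepare runs once per thread (THREAD_LOCAL scope) and stashes a CLucene analyzer in the function state; tokenize reuses that analyzer for every batch it evaluates, and tokenize_close deletes it. The three symbols are bound together by the functions.py registration below.
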
5 changes: 4 additions & 1 deletion gensrc/script/functions.py
@@ -1230,5 +1230,8 @@
[170501, 'named_struct', True, False, 'ANY_STRUCT', ['ANY_ELEMENT', "..."], 'StructFunctions::named_struct'],

# user function
[180000, 'is_role_in_session', True, False, 'BOOLEAN', ['VARCHAR'], 'nullptr']
[180000, 'is_role_in_session', True, False, 'BOOLEAN', ['VARCHAR'], 'nullptr'],

# gin functions
[190000, 'tokenize', True, False, 'ARRAY_VARCHAR', ['VARCHAR', 'VARCHAR'], 'GinFunctions::tokenize', 'GinFunctions::tokenize_prepare', 'GinFunctions::tokenize_close']
]
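
A reading of the new registration row, hedged since the column semantics are defined elsewhere in this script: 190000 is the function id, tokenize the SQL name, ARRAY_VARCHAR the return type, ['VARCHAR', 'VARCHAR'] the argument types (analyzer name, input string), and the three trailing symbols bind the evaluation, prepare, and close entry points implemented in gin_functions.cpp. The two booleans follow the same flag pattern as the surrounding entries (e.g. is_role_in_session above).
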
1 change: 1 addition & 0 deletions gensrc/script/gen_functions.py
@@ -88,6 +88,7 @@
#include "exprs/grouping_sets_functions.h"
#include "exprs/es_functions.h"
#include "exprs/utility_functions.h"
#include "exprs/gin_functions.h"
namespace starrocks {
48 changes: 48 additions & 0 deletions test/sql/test_inverted_index/R/test_tokenize
@@ -0,0 +1,48 @@
-- name: test_tokenize_function
SELECT tokenize('english', 'Today is saturday');
-- result:
["today","is","saturday"]
-- !result
SELECT tokenize('standard', 'Привет, мир');
-- result:
["привет","мир"]
-- !result
SELECT tokenize('chinese', '中华人民共和国');
-- result:
["中华","华人","人民","民共","共和","和国"]
-- !result

CREATE TABLE `t_tokenized_table` (
`id` bigint(20) NOT NULL COMMENT "",
`english_text` varchar(255) NULL COMMENT "",
`standard_text` varchar(255) NULL COMMENT "",
`chinese_text` varchar(255) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"enable_persistent_index" = "false",
"replicated_storage" = "false",
"compression" = "LZ4"
);
-- result:
-- !result

INSERT INTO t_tokenized_table VALUES
(1, 'hello', 'Привет', '你好'),
(2, 'hello world', 'Привет, мир', '你好世界'),
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上');
-- result:
-- !result

select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id;
-- result:
1 ["hello"] ["привет"] ["你好"]
2 ["hello","world"] ["привет","мир"] ["你好","好世","世界"]
3 ["shanghai","tap","water","comes","from","the","sea"] ["водопроводная","вода","в","шанхае","поступает","из","моря"] ["上海","海自","自来","来水","水来","来自","自海","海上"]
-- !result

DROP TABLE t_tokenized_table;
-- result:
-- !result
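
The overlapping two-character tokens in the 'chinese' results (中华, 华人, 人民, …) come from the CJK analyzer selected in tokenize_prepare (setLanguage(L"cjk")), which emits character bigrams rather than dictionary words.
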
28 changes: 28 additions & 0 deletions test/sql/test_inverted_index/T/test_tokenize
@@ -0,0 +1,28 @@
-- name: test_tokenize_function
SELECT tokenize('english', 'Today is saturday');
SELECT tokenize('standard', 'Привет, мир');
SELECT tokenize('chinese', '中华人民共和国');

CREATE TABLE `t_tokenized_table` (
`id` bigint(20) NOT NULL COMMENT "",
`english_text` varchar(255) NULL COMMENT "",
`standard_text` varchar(255) NULL COMMENT "",
`chinese_text` varchar(255) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"enable_persistent_index" = "false",
"replicated_storage" = "false",
"compression" = "LZ4"
);

INSERT INTO t_tokenized_table VALUES
(1, 'hello', 'Привет', '你好'),
(2, 'hello world', 'Привет, мир', '你好世界'),
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上');

select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id;

DROP TABLE t_tokenized_table;
