[Enhancement] Support tokenize function #45119

Open · wants to merge 2 commits into main
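
Summary of the change (drawn from the diff and tests below): this PR adds a tokenize(<tokenizer>, <text>) scalar function backed by CLucene analyzers. The first argument selects the tokenizer (english, standard, or chinese), and the result is an ARRAY<VARCHAR> of tokens; a NULL or empty input produces a NULL array. For example, as recorded in the SQL tests:

```sql
SELECT tokenize('english', 'Today is saturday');
-- ["today","is","saturday"]

SELECT tokenize('chinese', '中华人民共和国');
-- ["中华","华人","人民","民共","共和","和国"]
```
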
1 change: 1 addition & 0 deletions be/src/exprs/CMakeLists.txt
@@ -105,6 +105,7 @@ set(EXPR_FILES
dictionary_get_expr.cpp
ngram.cpp
match_expr.cpp
gin_functions.cpp
)

add_library(Exprs ${EXPR_FILES})
116 changes: 116 additions & 0 deletions be/src/exprs/gin_functions.cpp
@@ -0,0 +1,116 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "exprs/gin_functions.h"

#include <CLucene.h>
#include <CLucene/analysis/LanguageBasedAnalyzer.h>

#include <boost/locale/encoding_utf.hpp>

#include "column/array_column.h"
#include "column/column_viewer.h"
#include "column/datum.h"
#include "util/faststring.h"

namespace starrocks {

Status GinFunctions::tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope != FunctionContext::THREAD_LOCAL) {
return Status::OK();
}

// The tokenizer name (argument 0) is expected to be a constant column.
auto column = context->get_constant_column(0);
auto method_col = ColumnHelper::get_const_value<TYPE_VARCHAR>(column);
std::string method = method_col.to_string();

lucene::analysis::Analyzer* analyzer;

if (method == "english") {
// SimpleAnalyzer splits on non-letter characters and lowercases tokens.
analyzer = _CLNEW lucene::analysis::SimpleAnalyzer();
} else if (method == "standard") {
analyzer = _CLNEW lucene::analysis::standard::StandardAnalyzer();
} else if (method == "chinese") {
// The CJK analyzer emits overlapping bigrams (see the chinese test cases below).
auto* canalyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
canalyzer->setLanguage(L"cjk");
canalyzer->setStem(false);
analyzer = canalyzer;
} else {
return Status::NotSupported("Unknown method " + method);
}

context->set_function_state(scope, analyzer);

return Status::OK();
}

Status GinFunctions::tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope == FunctionContext::THREAD_LOCAL) {
auto* analyzer = reinterpret_cast<lucene::analysis::Analyzer*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
delete analyzer;
}
return Status::OK();
}

StatusOr<ColumnPtr> GinFunctions::tokenize(FunctionContext* context, const starrocks::Columns& columns) {
auto* analyzer =
reinterpret_cast<lucene::analysis::Analyzer*>(context->get_function_state(FunctionContext::THREAD_LOCAL));

if (columns.size() != 2) {
return Status::InvalidArgument("tokenize must be called as tokenize('<tokenizer>', str_column)");
}

ColumnViewer<TYPE_VARCHAR> value_viewer(columns[1]);
size_t num_rows = value_viewer.size();

// Offsets into the flattened element column; row i spans [offsets[i], offsets[i+1]).
int offset = 0;
UInt32Column::Ptr array_offsets = UInt32Column::create();
array_offsets->reserve(num_rows + 1);

// Flattened array elements: one entry per token across all rows.
BinaryColumn::Ptr array_binary_column = BinaryColumn::create();

// Outer null map: NULL or empty inputs produce a null array.
NullColumnPtr null_array = NullColumn::create();

for (size_t row = 0; row < num_rows; ++row) {
array_offsets->append(offset);

if (value_viewer.is_null(row) || value_viewer.value(row).empty()) {
null_array->append(1);
} else {
null_array->append(0);
auto data = value_viewer.value(row);
std::string slice_str(data.data, data.get_size());
std::wstring wstr = boost::locale::conv::utf_to_utf<TCHAR>(slice_str);
lucene::util::StringReader reader(wstr.c_str(), wstr.size(), false);
auto stream = analyzer->reusableTokenStream(L"", &reader);
lucene::analysis::Token token;
while (stream->next(&token)) {
if (token.termLength() != 0) {
offset++;
std::string str =
boost::locale::conv::utf_to_utf<char>(std::wstring(token.termBuffer(), token.termLength()));
array_binary_column->append(Slice(std::move(str)));
}
}
}
}
array_offsets->append(offset);
auto result_array = ArrayColumn::create(NullableColumn::create(array_binary_column, NullColumn::create(offset, 0)),
array_offsets);
return NullableColumn::create(result_array, null_array);
}

} // namespace starrocks
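
To make the offset and null-map bookkeeping above concrete, here is a worked sketch of the resulting column layout for a hypothetical three-row input (illustrative values, not from the tests):

```cpp
// input rows: ["hello world", NULL, "foo"]
// elements:   ["hello", "world", "foo"]  // flattened BinaryColumn of all tokens
// offsets:    [0, 2, 2, 3]               // row i spans [offsets[i], offsets[i+1])
// null map:   [0, 1, 0]                  // outer NullableColumn: NULL/empty rows yield a NULL array
```

Null and empty rows still get an offsets entry, so the offsets column always holds num_rows + 1 values and such rows simply span zero elements.
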
34 changes: 34 additions & 0 deletions be/src/exprs/gin_functions.h
@@ -0,0 +1,34 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <CLucene.h>

#include "exprs/builtin_functions.h"
#include "exprs/function_helper.h"

namespace starrocks {

class GinFunctions {
public:
// tokenize: prepare/close lifecycle hooks plus the vectorized entry point.
static Status tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope);

static Status tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope);

DEFINE_VECTORIZED_FN(tokenize);
};

} // namespace starrocks
1 change: 1 addition & 0 deletions be/test/CMakeLists.txt
@@ -106,6 +106,7 @@ set(EXEC_FILES
./exprs/encryption_functions_test.cpp
./exprs/function_call_expr_test.cpp
./exprs/geography_functions_test.cpp
./exprs/gin_functions_test.cpp
./exprs/hash_functions_test.cpp
./exprs/hyperloglog_functions_test.cpp
./exprs/if_expr_test.cpp
70 changes: 70 additions & 0 deletions be/test/exprs/gin_functions_test.cpp
@@ -0,0 +1,70 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "exprs/gin_functions.h"

#include <glog/logging.h>
#include <gtest/gtest.h>

#include "column/array_column.h"
#include "exprs/mock_vectorized_expr.h"

namespace starrocks {

class GinFunctionsTest : public ::testing::Test {
public:
void SetUp() override {}
};

TEST_F(GinFunctionsTest, tokenizeTest) {
{
std::unique_ptr<FunctionContext> ctx(FunctionContext::create_test_context());
Columns columns;

auto tokenizer = BinaryColumn::create();
tokenizer->append("error_tokenizer");
columns.emplace_back(ConstColumn::create(tokenizer));

ctx->set_constant_columns(columns);

ASSERT_FALSE(GinFunctions::tokenize_prepare(ctx.get(), FunctionContext::THREAD_LOCAL).ok());
}
{
std::unique_ptr<FunctionContext> ctx(FunctionContext::create_test_context());
Columns columns;

auto tokenizer = BinaryColumn::create();
auto content = BinaryColumn::create();

tokenizer->append("english");
content->append("hello world");
columns.emplace_back(ConstColumn::create(tokenizer));
columns.emplace_back(ConstColumn::create(content));
ctx->set_constant_columns(columns);
ASSERT_TRUE(GinFunctions::tokenize_prepare(ctx.get(), FunctionContext::THREAD_LOCAL).ok());
ColumnPtr result = GinFunctions::tokenize(ctx.get(), columns).value();
ASSERT_TRUE(GinFunctions::tokenize_close(ctx.get(), FunctionContext::THREAD_LOCAL).ok());

columns.clear();
auto nullable_result = ColumnHelper::as_column<NullableColumn>(result);
auto v = ColumnHelper::as_column<ArrayColumn>(nullable_result->data_column());

auto res_array = v->get(0).get_array();

ASSERT_EQ("hello", res_array[0].get_slice().to_string());
ASSERT_EQ("world", res_array[1].get_slice().to_string());
}
}

} // namespace starrocks
5 changes: 4 additions & 1 deletion gensrc/script/functions.py
@@ -1230,5 +1230,8 @@
[170501, 'named_struct', True, False, 'ANY_STRUCT', ['ANY_ELEMENT', "..."], 'StructFunctions::named_struct'],

# user function
[180000, 'is_role_in_session', True, False, 'BOOLEAN', ['VARCHAR'], 'nullptr']
[180000, 'is_role_in_session', True, False, 'BOOLEAN', ['VARCHAR'], 'nullptr'],

# gin functions
[190000, 'tokenize', True, False, 'ARRAY_VARCHAR', ['VARCHAR', 'VARCHAR'], 'GinFunctions::tokenize', 'GinFunctions::tokenize_prepare', 'GinFunctions::tokenize_close']
]
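
This entry exposes the function to SQL with signature (VARCHAR, VARCHAR) -> ARRAY<VARCHAR> and wires it to the vectorized entry point along with its prepare/close lifecycle hooks. A minimal smoke check, matching the expected results recorded in the tests below:

```sql
SELECT tokenize('standard', 'Привет, мир');
-- ["привет","мир"]
```
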
1 change: 1 addition & 0 deletions gensrc/script/gen_functions.py
@@ -88,6 +88,7 @@
#include "exprs/grouping_sets_functions.h"
#include "exprs/es_functions.h"
#include "exprs/utility_functions.h"
#include "exprs/gin_functions.h"

namespace starrocks {

48 changes: 48 additions & 0 deletions test/sql/test_inverted_index/R/test_tokenize
@@ -0,0 +1,48 @@
-- name: test_tokenize_function
SELECT tokenize('english', 'Today is saturday');
-- result:
["today","is","saturday"]
-- !result
SELECT tokenize('standard', 'Привет, мир');
-- result:
["привет","мир"]
-- !result
SELECT tokenize('chinese', '中华人民共和国');
-- result:
["中华","华人","人民","民共","共和","和国"]
-- !result

CREATE TABLE `t_tokenized_table` (
`id` bigint(20) NOT NULL COMMENT "",
`english_text` varchar(255) NULL COMMENT "",
`standard_text` varchar(255) NULL COMMENT "",
`chinese_text` varchar(255) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"enable_persistent_index" = "false",
"replicated_storage" = "false",
"compression" = "LZ4"
);
-- result:
-- !result

INSERT INTO t_tokenized_table VALUES
(1, 'hello', 'Привет', '你好'),
(2, 'hello world', 'Привет, мир', '你好世界'),
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上');
-- result:
-- !result

select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id;
-- result:
1 ["hello"] ["привет"] ["你好"]
2 ["hello","world"] ["привет","мир"] ["你好","好世","世界"]
3 ["shanghai","tap","water","comes","from","the","sea"] ["водопроводная","вода","в","шанхае","поступает","из","моря"] ["上海","海自","自来","来水","水来","来自","自海","海上"]
-- !result

DROP TABLE t_tokenized_table;
-- result:
-- !result
28 changes: 28 additions & 0 deletions test/sql/test_inverted_index/T/test_tokenize
@@ -0,0 +1,28 @@
-- name: test_tokenize_function
SELECT tokenize('english', 'Today is saturday');
SELECT tokenize('standard', 'Привет, мир');
SELECT tokenize('chinese', '中华人民共和国');

CREATE TABLE `t_tokenized_table` (
`id` bigint(20) NOT NULL COMMENT "",
`english_text` varchar(255) NULL COMMENT "",
`standard_text` varchar(255) NULL COMMENT "",
`chinese_text` varchar(255) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"enable_persistent_index" = "false",
"replicated_storage" = "false",
"compression" = "LZ4"
);

INSERT INTO t_tokenized_table VALUES
(1, 'hello', 'Привет', '你好'),
(2, 'hello world', 'Привет, мир', '你好世界'),
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上');

select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id;

DROP TABLE t_tokenized_table;