-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: leorishdu <18771113323@163.com>
- Loading branch information
1 parent
5ac845e
commit 5536c53
Showing
7 changed files
with
233 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "exprs/gin_functions.h" | ||
|
||
#include <CLucene.h> | ||
#include <CLucene/analysis/LanguageBasedAnalyzer.h> | ||
|
||
#include <boost/locale/encoding_utf.hpp> | ||
|
||
#include "column/array_column.h" | ||
#include "column/column_viewer.h" | ||
#include "column/datum.h" | ||
#include "util/faststring.h" | ||
|
||
namespace starrocks { | ||
|
||
// Allocates the thread-local CLucene analyzer used by tokenize().
// The first argument must be a constant VARCHAR naming the tokenizer:
// "english" (whitespace/letter split), "standard" (Unicode-aware
// StandardAnalyzer) or "chinese" (CJK bigram analyzer).
// Ownership of the analyzer passes to the function state and is
// released in tokenize_close().
Status GinFunctions::tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto column = context->get_constant_column(0);
    // Fix: get_constant_column() returns null when the argument is not a
    // constant; fail gracefully instead of dereferencing a null column.
    if (column == nullptr) {
        return Status::InvalidArgument("tokenize method argument must be a constant string");
    }
    auto method_col = ColumnHelper::get_const_value<TYPE_VARCHAR>(column);
    std::string method = method_col.to_string();

    // Fix: initialize the pointer; never leave a raw pointer indeterminate.
    lucene::analysis::Analyzer* analyzer = nullptr;

    if (method == "english") {
        analyzer = _CLNEW lucene::analysis::SimpleAnalyzer();
    } else if (method == "standard") {
        analyzer = _CLNEW lucene::analysis::standard::StandardAnalyzer();
    } else if (method == "chinese") {
        auto* canalyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
        canalyzer->setLanguage(L"cjk"); // bigram tokenization for CJK text
        canalyzer->setStem(false);
        analyzer = canalyzer;
    } else {
        return Status::NotSupported("Unknown method " + method);
    }

    context->set_function_state(scope, analyzer);

    return Status::OK();
}
// Frees the thread-local analyzer allocated in tokenize_prepare().
// Safe to call when no analyzer was installed (deleting null is a no-op).
Status GinFunctions::tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }
    void* state = context->get_function_state(FunctionContext::THREAD_LOCAL);
    delete reinterpret_cast<lucene::analysis::Analyzer*>(state);
    return Status::OK();
}
|
||
// tokenize('<method>', text) -> ARRAY<VARCHAR>
// Splits every row of the input string column into tokens using the
// analyzer installed by tokenize_prepare(). NULL or empty inputs produce
// a NULL array row; zero-length tokens emitted by the analyzer are dropped.
StatusOr<ColumnPtr> GinFunctions::tokenize(FunctionContext* context, const starrocks::Columns& columns) {
    auto* analyzer =
            reinterpret_cast<lucene::analysis::Analyzer*>(context->get_function_state(FunctionContext::THREAD_LOCAL));

    if (columns.size() != 2) {
        return Status::InvalidArgument("Tokenize function only call by tokenize('<index_type>', str_column)");
    }

    ColumnViewer<TYPE_VARCHAR> value_viewer(columns[1]);
    const size_t num_rows = value_viewer.size();

    // Running end-offset into the flattened token column; entry i of
    // array_offsets is where row i's tokens begin.
    // Fix: uint32_t matches UInt32Column's element type (was a signed int
    // mixed with unsigned appends).
    uint32_t offset = 0;
    UInt32Column::Ptr array_offsets = UInt32Column::create();
    array_offsets->reserve(num_rows + 1);

    // Flattened token strings of all rows.
    BinaryColumn::Ptr array_binary_column = BinaryColumn::create();

    // Per-row null flags for the result array column.
    NullColumnPtr null_array = NullColumn::create();

    // Fix: size_t loop index — no signed/unsigned comparison with num_rows.
    for (size_t row = 0; row < num_rows; ++row) {
        array_offsets->append(offset);

        if (value_viewer.is_null(row) || value_viewer.value(row).empty()) {
            null_array->append(1);
            continue;
        }
        null_array->append(0);

        auto data = value_viewer.value(row);
        std::string slice_str(data.data, data.get_size());
        // CLucene works on wide characters; convert UTF-8 -> TCHAR.
        std::wstring wstr = boost::locale::conv::utf_to_utf<TCHAR>(slice_str);
        lucene::util::StringReader reader(wstr.c_str(), wstr.size(), /*deleteValue=*/false);
        // Reusable stream is owned by the analyzer; do not delete it here.
        auto stream = analyzer->reusableTokenStream(L"", &reader);
        lucene::analysis::Token token;
        while (stream->next(&token)) {
            if (token.termLength() != 0) {
                offset++;
                std::string str =
                        boost::locale::conv::utf_to_utf<char>(std::wstring(token.termBuffer(), token.termLength()));
                array_binary_column->append(Slice(std::move(str)));
            }
        }
    }
    array_offsets->append(offset);
    // Token elements themselves are never null, hence the all-zero null map.
    auto result_array = ArrayColumn::create(NullableColumn::create(array_binary_column, NullColumn::create(offset, 0)),
                                            array_offsets);
    return NullableColumn::create(result_array, null_array);
}
|
||
} // namespace starrocks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
|
||
#pragma once | ||
|
||
#include "exprs/builtin_functions.h" | ||
#include "exprs/function_helper.h" | ||
#include <CLucene.h> | ||
|
||
namespace starrocks { | ||
|
||
// Scalar functions backing the generalized inverted (GIN) index support:
// tokenize('<method>', text) splits a VARCHAR into an ARRAY<VARCHAR> of
// tokens using a CLucene analyzer ("english", "standard" or "chinese").
class GinFunctions {
public:
    // Builds the thread-local CLucene analyzer selected by the constant
    // first argument; stored in the function state for tokenize().
    static Status tokenize_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope);

    // Releases the analyzer allocated in tokenize_prepare().
    static Status tokenize_close(FunctionContext* context, FunctionContext::FunctionStateScope scope);

    // tokenize(method, text) -> ARRAY<VARCHAR>; NULL/empty input yields NULL.
    DEFINE_VECTORIZED_FN(tokenize);

};
|
||
} // namespace starrocks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
-- name: test_tokenize_function | ||
SELECT tokenize('english', 'Today is saturday'); | ||
-- result: | ||
["today","is","saturday"] | ||
-- !result | ||
SELECT tokenize('standard', 'Привет, мир'); | ||
-- result: | ||
["привет","мир"] | ||
-- !result | ||
SELECT tokenize('chinese', '中华人民共和国'); | ||
-- result: | ||
["中华","华人","人民","民共","共和","和国"] | ||
-- !result | ||
|
||
CREATE TABLE `t_tokenized_table` ( | ||
`id` bigint(20) NOT NULL COMMENT "", | ||
`english_text` varchar(255) NULL COMMENT "", | ||
`standard_text` varchar(255) NULL COMMENT "", | ||
`chinese_text` varchar(255) NULL COMMENT "" | ||
) ENGINE=OLAP | ||
DUPLICATE KEY(`id`) | ||
DISTRIBUTED BY HASH(`id`) BUCKETS 1 | ||
PROPERTIES ( | ||
"replication_num" = "1", | ||
"enable_persistent_index" = "false", | ||
"replicated_storage" = "false", | ||
"compression" = "LZ4" | ||
); | ||
-- result: | ||
-- !result | ||
|
||
INSERT INTO t_tokenized_table VALUES | ||
(1, 'hello', 'Привет', '你好'), | ||
(2, 'hello world', 'Привет, мир', '你好世界'), | ||
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上'); | ||
-- result: | ||
-- !result | ||
|
||
select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id; | ||
-- result: | ||
1 ["hello"] ["привет"] ["你好"] | ||
2 ["hello","world"] ["привет","мир"] ["你好","好世","世界"] | ||
3 ["shanghai","tap","water","comes","from","the","sea"] ["водопроводная","вода","в","шанхае","поступает","из","моря"] ["上海","海自","自来","来水","水来","来自","自海","海上"] | ||
-- !result | ||
|
||
DROP TABLE t_tokenized_table; | ||
-- result: | ||
-- !result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
-- name: test_tokenize_function | ||
SELECT tokenize('english', 'Today is saturday'); | ||
SELECT tokenize('standard', 'hello world'); | ||
SELECT tokenize('chinese', '中华人民共和国'); | ||
|
||
CREATE TABLE `t_tokenized_table` ( | ||
`id` bigint(20) NOT NULL COMMENT "", | ||
`english_text` varchar(255) NULL COMMENT "", | ||
`standard_text` varchar(255) NULL COMMENT "", | ||
`chinese_text` varchar(255) NULL COMMENT "" | ||
) ENGINE=OLAP | ||
DUPLICATE KEY(`id`) | ||
DISTRIBUTED BY HASH(`id`) BUCKETS 1 | ||
PROPERTIES ( | ||
"replication_num" = "1", | ||
"enable_persistent_index" = "false", | ||
"replicated_storage" = "false", | ||
"compression" = "LZ4" | ||
); | ||
|
||
INSERT INTO t_tokenized_table VALUES | ||
(1, 'hello', 'Привет', '你好'), | ||
(2, 'hello world', 'Привет, мир', '你好世界'), | ||
(3, 'Shanghai tap water comes from the sea', 'Водопроводная вода в Шанхае поступает из моря', '上海自来水来自海上'); | ||
|
||
select id, tokenize('english', english_text), tokenize('standard', standard_text), tokenize('chinese', chinese_text) from t_tokenized_table order by id; | ||
|
||
DROP TABLE t_tokenized_table; |