From 8cfc4b19e9b3f4602b5d9d73d64e5f284d020f11 Mon Sep 17 00:00:00 2001 From: Fedor Indutnyy Date: Tue, 24 Jan 2023 17:39:46 -0800 Subject: [PATCH] Database#createFTS5Tokenizer API --- docs/api.md | 46 +++++ lib/database.js | 3 + lib/methods/createFTS5Tokenizer.js | 24 +++ src/better_sqlite3.cpp | 302 +++++++++++++++++++++++------ src/better_sqlite3.hpp | 240 ++++++++++++++--------- src/better_sqlite3.lzz | 2 + src/objects/database.lzz | 37 ++++ src/objects/tokenizer.lzz | 158 +++++++++++++++ test/38.database.tokenizer.js | 72 +++++++ 9 files changed, 729 insertions(+), 155 deletions(-) create mode 100644 lib/methods/createFTS5Tokenizer.js create mode 100644 src/objects/tokenizer.lzz create mode 100644 test/38.database.tokenizer.js diff --git a/docs/api.md b/docs/api.md index 8c074d1a..29bd989f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -17,6 +17,7 @@ - [Database#aggregate()](#aggregatename-options---this) - [Database#table()](#tablename-definition---this) - [Database#loadExtension()](#loadextensionpath-entrypoint---this) +- [Database#createFTS5Tokenizer()](#createfts5tokenizername-factory---this) - [Database#exec()](#execstring---this) - [Database#close()](#close---this) - [Properties](#properties) @@ -372,6 +373,51 @@ It's your responsibility to make sure the extensions you load are compiled/linke db.loadExtension('./my-extensions/compress.so'); ``` +### .createFTS5Tokenizer(*name*, *factory*) -> *this* + +Creates a custom JavaScript-based tokenizer for [FTS5](https://www.sqlite.org/fts5.html#tokenizers). + +One of the main use cases for such a tokenizer would be to add support for CJK characters and non-Latin locales to FTS5. 
As an example, this could be done with the `Intl.Segmenter` API (here `segmenter` is an instance such as `new Intl.Segmenter([], { granularity: 'word' })`): +```js +db.createFTS5Tokenizer('js_tokenizer', class Tokenizer { + constructor(params) { + // params will be ["param1", "param2"] + } + + run(str) { + const result = []; + let off = 0; + for (const seg of segmenter.segment(str)) { + const len = Buffer.byteLength(seg.segment); + if (seg.isWordLike) { + // Remove diacritic symbols + const normalized = seg.segment.normalize('NFD') + .replace(/[\u0300-\u036f]/g, ''); + result.push( + // Segment start byte offset + off, + // Segment end byte offset + off + len, + // Normalized token text, or `null` to reuse the original bytes (optimization) + normalized === seg.segment ? null : normalized, + ); + } + off += len; + } + return result; + } +}); + +db.exec(` + CREATE VIRTUAL TABLE fts_table USING fts5( + body, + tokenize='js_tokenizer param1 param2' + ); + + INSERT INTO fts_table(body) VALUES ('hello world'); +`); +``` + ### .exec(*string*) -> *this* Executes the given SQL string. Unlike [prepared statements](#preparestring---statement), this can execute strings that contain multiple SQL statements. This function performs worse and is less safe than using [prepared statements](#preparestring---statement). You should only use this method when you need to execute SQL from an external source (usually a file). If an error occurs, execution stops and further statements are not executed. You must rollback changes manually. 
diff --git a/lib/database.js b/lib/database.js index aea774d3..ce01ed94 100644 --- a/lib/database.js +++ b/lib/database.js @@ -65,6 +65,8 @@ function Database(filenameGiven, options) { }); } +function noop() {} + const wrappers = require('./methods/wrappers'); Database.prototype.prepare = wrappers.prepare; Database.prototype.transaction = require('./methods/transaction'); @@ -74,6 +76,7 @@ Database.prototype.serialize = require('./methods/serialize'); Database.prototype.function = require('./methods/function'); Database.prototype.aggregate = require('./methods/aggregate'); Database.prototype.table = require('./methods/table'); +Database.prototype.createFTS5Tokenizer = require('./methods/createFTS5Tokenizer'); Database.prototype.loadExtension = wrappers.loadExtension; Database.prototype.exec = wrappers.exec; Database.prototype.close = wrappers.close; diff --git a/lib/methods/createFTS5Tokenizer.js b/lib/methods/createFTS5Tokenizer.js new file mode 100644 index 00000000..cc05c514 --- /dev/null +++ b/lib/methods/createFTS5Tokenizer.js @@ -0,0 +1,24 @@ +'use strict'; +const { cppdb } = require('../util'); + +module.exports = function createFTS5Tokenizer(name, factory) { + // Validate arguments: `name` is the tokenizer name, `factory` is constructed once per tokenizer instance + if (typeof name !== 'string') throw new TypeError('Expected first argument to be a string'); + if (!name) throw new TypeError('Tokenizer name cannot be an empty string'); + if (typeof factory !== 'function') throw new TypeError('Expected second argument to be a constructor'); + + this[cppdb].createFTS5Tokenizer(name, function create(params) { + const instance = new factory(params); + + function run(str) { + if (typeof instance.run !== 'function') { + // Returning undefined makes the C++ side throw a TypeError (it expects an array) + return; + } + return instance.run(str); + } + + return run; + }); + return this; +}; diff --git a/src/better_sqlite3.cpp b/src/better_sqlite3.cpp index b560b9d6..d75d4c77 100644 --- a/src/better_sqlite3.cpp +++ b/src/better_sqlite3.cpp @@ -17,7 +17,7 @@ ctx->Exit(); return proto->StrictEquals(baseProto) || 
proto->StrictEquals(v8::Null(isolate)); } -#line 67 "./src/better_sqlite3.lzz" +#line 69 "./src/better_sqlite3.lzz" NODE_MODULE_INIT(/* exports, context */) { v8::Isolate* isolate = context->GetIsolate(); v8::HandleScope scope(isolate); @@ -321,45 +321,46 @@ v8::Local Database::Init (v8::Isolate * isolate, v8::Local GetFunction( isolate -> GetCurrentContext ( ) ).ToLocalChecked(); } -#line 24 "./src/objects/database.lzz" +#line 25 "./src/objects/database.lzz" bool Database::CompareDatabase::operator () (Database const * const a, Database const * const b) const -#line 24 "./src/objects/database.lzz" +#line 25 "./src/objects/database.lzz" { return a < b; } -#line 29 "./src/objects/database.lzz" +#line 30 "./src/objects/database.lzz" bool Database::CompareStatement::operator () (Statement const * const a, Statement const * const b) const -#line 29 "./src/objects/database.lzz" +#line 30 "./src/objects/database.lzz" { return Statement::Compare(a, b); } -#line 34 "./src/objects/database.lzz" +#line 35 "./src/objects/database.lzz" bool Database::CompareBackup::operator () (Backup const * const a, Backup const * const b) const -#line 34 "./src/objects/database.lzz" +#line 35 "./src/objects/database.lzz" { return Backup::Compare(a, b); } -#line 40 "./src/objects/database.lzz" +#line 41 "./src/objects/database.lzz" void Database::ThrowDatabaseError () -#line 40 "./src/objects/database.lzz" +#line 41 "./src/objects/database.lzz" { if (was_js_error) was_js_error = false; else ThrowSqliteError(addon, db_handle); } -#line 44 "./src/objects/database.lzz" +#line 45 "./src/objects/database.lzz" void Database::ThrowSqliteError (Addon * addon, sqlite3 * db_handle) -#line 44 "./src/objects/database.lzz" +#line 45 "./src/objects/database.lzz" { assert(db_handle != NULL); ThrowSqliteError(addon, sqlite3_errmsg(db_handle), sqlite3_extended_errcode(db_handle)); } -#line 48 "./src/objects/database.lzz" +#line 49 "./src/objects/database.lzz" void Database::ThrowSqliteError (Addon * addon, 
char const * message, int code) -#line 48 "./src/objects/database.lzz" +#line 49 "./src/objects/database.lzz" { assert(message != NULL); assert((code & 0xff) != SQLITE_OK); @@ -374,9 +375,9 @@ void Database::ThrowSqliteError (Addon * addon, char const * message, int code) ->NewInstance( isolate -> GetCurrentContext ( ) , 2, args) .ToLocalChecked()); } -#line 64 "./src/objects/database.lzz" +#line 65 "./src/objects/database.lzz" bool Database::Log (v8::Isolate * isolate, sqlite3_stmt * handle) -#line 64 "./src/objects/database.lzz" +#line 65 "./src/objects/database.lzz" { assert(was_js_error == false); if (!has_logger) return false; @@ -388,9 +389,9 @@ bool Database::Log (v8::Isolate * isolate, sqlite3_stmt * handle) if (expanded) sqlite3_free(expanded); return was_js_error; } -#line 107 "./src/objects/database.lzz" +#line 108 "./src/objects/database.lzz" void Database::CloseHandles () -#line 107 "./src/objects/database.lzz" +#line 108 "./src/objects/database.lzz" { if (open) { open = false; @@ -402,25 +403,25 @@ void Database::CloseHandles () assert(status == SQLITE_OK); ((void)status); } } -#line 119 "./src/objects/database.lzz" +#line 120 "./src/objects/database.lzz" Database::~ Database () -#line 119 "./src/objects/database.lzz" +#line 120 "./src/objects/database.lzz" { if (open) addon->dbs.erase(this); CloseHandles(); } -#line 126 "./src/objects/database.lzz" +#line 127 "./src/objects/database.lzz" Database::Database (v8::Isolate * isolate, Addon * addon, sqlite3 * db_handle, v8::Local logger) -#line 131 "./src/objects/database.lzz" +#line 132 "./src/objects/database.lzz" : node::ObjectWrap (), db_handle (db_handle), open (true), busy (false), safe_ints (false), unsafe_mode (false), was_js_error (false), has_logger (logger->IsFunction()), iterators (0), addon (addon), logger (isolate, logger), stmts (), backups () -#line 144 "./src/objects/database.lzz" +#line 145 "./src/objects/database.lzz" { assert(db_handle != NULL); addon->dbs.insert(this); } -#line 149 
"./src/objects/database.lzz" +#line 150 "./src/objects/database.lzz" void Database::JS_new (v8::FunctionCallbackInfo const & info) -#line 149 "./src/objects/database.lzz" +#line 150 "./src/objects/database.lzz" { assert(info.IsConstructCall()); if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsString ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a string" ) ; v8 :: Local < v8 :: String > filename = ( info [ 0 ] . As < v8 :: String > ( ) ) ; @@ -472,9 +473,9 @@ void Database::JS_new (v8::FunctionCallbackInfo const & info) info.GetReturnValue().Set(info.This()); } -#line 201 "./src/objects/database.lzz" +#line 202 "./src/objects/database.lzz" void Database::JS_prepare (v8::FunctionCallbackInfo const & info) -#line 201 "./src/objects/database.lzz" +#line 202 "./src/objects/database.lzz" { if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsString ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a string" ) ; v8 :: Local < v8 :: String > source = ( info [ 0 ] . As < v8 :: String > ( ) ) ; if ( info . Length ( ) <= ( 1 ) || ! info [ 1 ] -> IsObject ( ) ) return ThrowTypeError ( "Expected " "second" " argument to be " "an object" ) ; v8 :: Local < v8 :: Object > database = ( info [ 1 ] . As < v8 :: Object > ( ) ) ; @@ -490,9 +491,9 @@ void Database::JS_prepare (v8::FunctionCallbackInfo const & info) addon->privileged_info = NULL; if (!maybeStatement.IsEmpty()) info.GetReturnValue().Set(maybeStatement.ToLocalChecked()); } -#line 217 "./src/objects/database.lzz" +#line 218 "./src/objects/database.lzz" void Database::JS_exec (v8::FunctionCallbackInfo const & info) -#line 217 "./src/objects/database.lzz" +#line 218 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsString ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a string" ) ; v8 :: Local < v8 :: String > source = ( info [ 0 ] . 
As < v8 :: String > ( ) ) ; @@ -532,9 +533,9 @@ void Database::JS_exec (v8::FunctionCallbackInfo const & info) db->ThrowDatabaseError(); } } -#line 257 "./src/objects/database.lzz" +#line 258 "./src/objects/database.lzz" void Database::JS_backup (v8::FunctionCallbackInfo const & info) -#line 257 "./src/objects/database.lzz" +#line 258 "./src/objects/database.lzz" { if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsObject ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "an object" ) ; v8 :: Local < v8 :: Object > database = ( info [ 0 ] . As < v8 :: Object > ( ) ) ; if ( info . Length ( ) <= ( 1 ) || ! info [ 1 ] -> IsString ( ) ) return ThrowTypeError ( "Expected " "second" " argument to be " "a string" ) ; v8 :: Local < v8 :: String > attachedName = ( info [ 1 ] . As < v8 :: String > ( ) ) ; @@ -552,9 +553,9 @@ void Database::JS_backup (v8::FunctionCallbackInfo const & info) addon->privileged_info = NULL; if (!maybeBackup.IsEmpty()) info.GetReturnValue().Set(maybeBackup.ToLocalChecked()); } -#line 275 "./src/objects/database.lzz" +#line 276 "./src/objects/database.lzz" void Database::JS_serialize (v8::FunctionCallbackInfo const & info) -#line 275 "./src/objects/database.lzz" +#line 276 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsString ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a string" ) ; v8 :: Local < v8 :: String > attachedName = ( info [ 0 ] . 
As < v8 :: String > ( ) ) ; @@ -576,9 +577,9 @@ void Database::JS_serialize (v8::FunctionCallbackInfo const & info node::Buffer::New(isolate, reinterpret_cast(data), length, FreeSerialization, NULL).ToLocalChecked() ); } -#line 297 "./src/objects/database.lzz" +#line 298 "./src/objects/database.lzz" void Database::JS_function (v8::FunctionCallbackInfo const & info) -#line 297 "./src/objects/database.lzz" +#line 298 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsFunction ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a function" ) ; v8 :: Local < v8 :: Function > fn = ( info [ 0 ] . As < v8 :: Function > ( ) ) ; @@ -602,9 +603,9 @@ void Database::JS_function (v8::FunctionCallbackInfo const & info) db->ThrowDatabaseError(); } } -#line 321 "./src/objects/database.lzz" +#line 322 "./src/objects/database.lzz" void Database::JS_aggregate (v8::FunctionCallbackInfo const & info) -#line 321 "./src/objects/database.lzz" +#line 322 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if ( info . Length ( ) <= ( 0 ) ) return ThrowTypeError ( "Expected a " "first" " argument" ) ; v8 :: Local < v8 :: Value > start = info [ 0 ] ; @@ -633,9 +634,9 @@ void Database::JS_aggregate (v8::FunctionCallbackInfo const & info db->ThrowDatabaseError(); } } -#line 350 "./src/objects/database.lzz" +#line 351 "./src/objects/database.lzz" void Database::JS_table (v8::FunctionCallbackInfo const & info) -#line 350 "./src/objects/database.lzz" +#line 351 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsFunction ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a function" ) ; v8 :: Local < v8 :: Function > factory = ( info [ 0 ] . 
As < v8 :: Function > ( ) ) ; @@ -655,9 +656,9 @@ void Database::JS_table (v8::FunctionCallbackInfo const & info) } db->busy = false; } -#line 370 "./src/objects/database.lzz" +#line 371 "./src/objects/database.lzz" void Database::JS_loadExtension (v8::FunctionCallbackInfo const & info) -#line 370 "./src/objects/database.lzz" +#line 371 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); v8::Local entryPoint; @@ -679,9 +680,9 @@ void Database::JS_loadExtension (v8::FunctionCallbackInfo const & } sqlite3_free(error); } -#line 392 "./src/objects/database.lzz" +#line 393 "./src/objects/database.lzz" void Database::JS_close (v8::FunctionCallbackInfo const & info) -#line 392 "./src/objects/database.lzz" +#line 393 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if (db->open) { @@ -691,39 +692,77 @@ void Database::JS_close (v8::FunctionCallbackInfo const & info) db->CloseHandles(); } } -#line 402 "./src/objects/database.lzz" +#line 403 "./src/objects/database.lzz" void Database::JS_defaultSafeIntegers (v8::FunctionCallbackInfo const & info) -#line 402 "./src/objects/database.lzz" +#line 403 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if (info.Length() == 0) db->safe_ints = true; else { if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsBoolean ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a boolean" ) ; db -> safe_ints = ( info [ 0 ] . As < v8 :: Boolean > ( ) ) -> Value ( ) ; } } -#line 408 "./src/objects/database.lzz" +#line 409 "./src/objects/database.lzz" void Database::JS_unsafeMode (v8::FunctionCallbackInfo const & info) -#line 408 "./src/objects/database.lzz" +#line 409 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); if (info.Length() == 0) db->unsafe_mode = true; else { if ( info . Length ( ) <= ( 0 ) || ! 
info [ 0 ] -> IsBoolean ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a boolean" ) ; db -> unsafe_mode = ( info [ 0 ] . As < v8 :: Boolean > ( ) ) -> Value ( ) ; } sqlite3_db_config(db->db_handle, SQLITE_DBCONFIG_DEFENSIVE, static_cast(!db->unsafe_mode), NULL); } -#line 415 "./src/objects/database.lzz" +#line 416 "./src/objects/database.lzz" +void Database::JS_createFTS5Tokenizer (v8::FunctionCallbackInfo const & info) +#line 416 "./src/objects/database.lzz" + { + Addon * addon = static_cast < Addon * > ( info . Data ( ) . As < v8 :: External > ( ) -> Value ( ) ) ; + v8 :: Isolate * isolate = info . GetIsolate ( ) ; + + Database* db = node :: ObjectWrap :: Unwrap (info.This()); + if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsString ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a string" ) ; v8 :: Local < v8 :: String > name = ( info [ 0 ] . As < v8 :: String > ( ) ) ; + if ( info . Length ( ) <= ( 1 ) || ! info [ 1 ] -> IsFunction ( ) ) return ThrowTypeError ( "Expected " "second" " argument to be " "a function" ) ; v8 :: Local < v8 :: Function > create_instance_fn = ( info [ 1 ] . 
As < v8 :: Function > ( ) ) ; + + + int rc; + sqlite3_stmt *pStmt = nullptr; + + rc = sqlite3_prepare(db->db_handle, "SELECT fts5(?1)", -1, &pStmt, 0); + if (rc != SQLITE_OK) { + ThrowSqliteError(addon, db->db_handle); + return; + } + + fts5_api *fts5 = nullptr; + sqlite3_bind_pointer(pStmt, 1, (void*)&fts5, "fts5_api_ptr", nullptr); + sqlite3_step(pStmt); + rc = sqlite3_finalize(pStmt); + if (rc != SQLITE_OK) { + ThrowSqliteError(addon, db->db_handle); + return; + } + + assert(fts5 != nullptr); + + TokenizerModule* t = new TokenizerModule(isolate, create_instance_fn); + + v8::String::Utf8Value utf8(isolate, name); + fts5->xCreateTokenizer(fts5, *utf8, t, t->get_api_object(), + &TokenizerModule::xDestroy); +} +#line 452 "./src/objects/database.lzz" void Database::JS_open (v8::Local _, v8::PropertyCallbackInfo const & info) -#line 415 "./src/objects/database.lzz" +#line 452 "./src/objects/database.lzz" { info.GetReturnValue().Set( node :: ObjectWrap :: Unwrap (info.This())->open); } -#line 419 "./src/objects/database.lzz" +#line 456 "./src/objects/database.lzz" void Database::JS_inTransaction (v8::Local _, v8::PropertyCallbackInfo const & info) -#line 419 "./src/objects/database.lzz" +#line 456 "./src/objects/database.lzz" { Database* db = node :: ObjectWrap :: Unwrap (info.This()); info.GetReturnValue().Set(db->open && !static_cast(sqlite3_get_autocommit(db->db_handle))); } -#line 424 "./src/objects/database.lzz" +#line 461 "./src/objects/database.lzz" bool Database::Deserialize (v8::Local buffer, Addon * addon, sqlite3 * db_handle, bool readonly) -#line 424 "./src/objects/database.lzz" +#line 461 "./src/objects/database.lzz" { size_t length = node::Buffer::Length(buffer); unsigned char* data = (unsigned char*)sqlite3_malloc64(length); @@ -748,15 +787,15 @@ bool Database::Deserialize (v8::Local buffer, Addon * addon, sqlite return true; } -#line 449 "./src/objects/database.lzz" +#line 486 "./src/objects/database.lzz" void Database::FreeSerialization (char * data, 
void * _) -#line 449 "./src/objects/database.lzz" +#line 486 "./src/objects/database.lzz" { sqlite3_free(data); } -#line 453 "./src/objects/database.lzz" +#line 490 "./src/objects/database.lzz" int const Database::MAX_BUFFER_SIZE; -#line 454 "./src/objects/database.lzz" +#line 491 "./src/objects/database.lzz" int const Database::MAX_STRING_SIZE; #line 4 "./src/objects/statement.lzz" v8::Local Statement::Init (v8::Isolate * isolate, v8::Local data) @@ -1332,6 +1371,143 @@ void Backup::JS_close (v8::FunctionCallbackInfo const & info) backup->CloseHandles(); info.GetReturnValue().Set(info.This()); } +#line 3 "./src/objects/tokenizer.lzz" +Tokenizer::Tokenizer (v8::Isolate * isolate, v8::Local run_fn) +#line 6 "./src/objects/tokenizer.lzz" + : isolate (isolate), run_fn (isolate, run_fn) +#line 7 "./src/objects/tokenizer.lzz" + {} +#line 10 "./src/objects/tokenizer.lzz" +Tokenizer::~ Tokenizer () +#line 10 "./src/objects/tokenizer.lzz" + {} +#line 12 "./src/objects/tokenizer.lzz" +int Tokenizer::Run (void * pCtx, char const * pText, int nText, int (* xToken) (void *, int, char const *, int, int, int)) +#line 19 "./src/objects/tokenizer.lzz" + { + v8::HandleScope scope(isolate); + v8 :: Local < v8 :: Context > ctx = isolate -> GetCurrentContext ( ) ; + + v8::Local arg[] = { + StringFromUtf8(isolate, pText, nText) + }; + v8::Local result = run_fn.Get(isolate)->Call( + ctx, + v8::Undefined(isolate), + 1, + arg).ToLocalChecked(); + if (!result->IsArray()) { + ThrowTypeError("Expected array return value of tokenizer"); + return SQLITE_MISUSE; + } + v8::Local indices = result.As(); + int len = indices->Length(); + if (len % 3 != 0) { + return SQLITE_MISUSE; + } + for (int i = 0; i < len; i += 3) { + int64_t segment_start = + indices->Get(ctx, i).ToLocalChecked()->IntegerValue(ctx).ToChecked(); + int64_t segment_end = + indices->Get(ctx, i + 1).ToLocalChecked()->IntegerValue(ctx).ToChecked(); + v8::Local maybe_normalized = + indices->Get(ctx, i + 2).ToLocalChecked(); + if 
(segment_start < 0 || static_cast(segment_start) > nText) { + return SQLITE_MISUSE; + } + if (segment_end < 0 || static_cast(segment_end) > nText) { + return SQLITE_MISUSE; + } + if (segment_start > segment_end) { + return SQLITE_MISUSE; + } + + int rc; + if (maybe_normalized->IsString()) { + v8::String::Utf8Value normalized( + isolate, indices->Get(ctx, i + 2).ToLocalChecked()); + rc = xToken( + pCtx, 0, *normalized, normalized.length(), + segment_start, segment_end); + } else { + + + rc = xToken( + pCtx, 0, &pText[segment_start], segment_end - segment_start, + segment_start, segment_end); + } + + if (rc != SQLITE_OK) { + return rc; + } + } + return SQLITE_OK; +} +#line 86 "./src/objects/tokenizer.lzz" +TokenizerModule::TokenizerModule (v8::Isolate * isolate, v8::Local create_instance_fn) +#line 89 "./src/objects/tokenizer.lzz" + : isolate (isolate), create_instance_fn (isolate, create_instance_fn) +#line 89 "./src/objects/tokenizer.lzz" + {} +#line 92 "./src/objects/tokenizer.lzz" +void TokenizerModule::xDestroy (void * pCtx) +#line 92 "./src/objects/tokenizer.lzz" + { + TokenizerModule* m = static_cast(pCtx); + delete m; +} +#line 102 "./src/objects/tokenizer.lzz" +Tokenizer * TokenizerModule::CreateInstance (char const * * azArg, int nArg) +#line 102 "./src/objects/tokenizer.lzz" + { + v8::HandleScope scope(isolate); + v8 :: Local < v8 :: Context > ctx = isolate -> GetCurrentContext ( ) ; + + v8::Local params = v8::Array::New(isolate, nArg); + for (int i = 0; i < nArg; i++) { + params->Set(ctx, i, StringFromUtf8(isolate, azArg[i], -1)).ToChecked(); + } + + v8::Local arg[] = { + params, + }; + v8::Local run_fn = create_instance_fn.Get(isolate)->Call( + ctx, + v8::Undefined(isolate), + 1, + arg).ToLocalChecked().As(); + + return new Tokenizer(isolate, run_fn); +} +#line 123 "./src/objects/tokenizer.lzz" +int TokenizerModule::xCreate (void * pCtx, char const * * azArg, int nArg, Fts5Tokenizer * * ppOut) +#line 124 "./src/objects/tokenizer.lzz" + { + 
TokenizerModule* m = static_cast(pCtx); + *ppOut = reinterpret_cast(m->CreateInstance(azArg, nArg)); + return SQLITE_OK; +} +#line 130 "./src/objects/tokenizer.lzz" +void TokenizerModule::xDelete (Fts5Tokenizer * tokenizer) +#line 130 "./src/objects/tokenizer.lzz" + { + Tokenizer* t = reinterpret_cast(tokenizer); + delete t; +} +#line 135 "./src/objects/tokenizer.lzz" +int TokenizerModule::xTokenize (Fts5Tokenizer * tokenizer, void * pCtx, int flags, char const * pText, int nText, int (* xToken) (void *, int, char const *, int, int, int)) +#line 144 "./src/objects/tokenizer.lzz" + { + Tokenizer* t = reinterpret_cast(tokenizer); + + return t->Run(pCtx, pText, nText, xToken); +} +#line 150 "./src/objects/tokenizer.lzz" +fts5_tokenizer TokenizerModule::api_object = { + .xCreate = &xCreate, + .xDelete = &xDelete, + .xTokenize = &xTokenize, + }; #line 4 "./src/util/data-converter.lzz" void DataConverter::ThrowDataConversionError (sqlite3_context * invocation, bool isBigInt) #line 4 "./src/util/data-converter.lzz" @@ -2108,26 +2284,26 @@ Binder::Result Binder::BindArgs (v8::FunctionCallbackInfo const & return { count, bound_object }; } -#line 35 "./src/better_sqlite3.lzz" +#line 37 "./src/better_sqlite3.lzz" void Addon::JS_setErrorConstructor (v8::FunctionCallbackInfo const & info) -#line 35 "./src/better_sqlite3.lzz" +#line 37 "./src/better_sqlite3.lzz" { if ( info . Length ( ) <= ( 0 ) || ! info [ 0 ] -> IsFunction ( ) ) return ThrowTypeError ( "Expected " "first" " argument to be " "a function" ) ; v8 :: Local < v8 :: Function > SqliteError = ( info [ 0 ] . As < v8 :: Function > ( ) ) ; static_cast < Addon * > ( info . Data ( ) . As < v8 :: External > ( ) -> Value ( ) ) ->SqliteError.Reset( info . 
GetIsolate ( ) , SqliteError); } -#line 40 "./src/better_sqlite3.lzz" +#line 42 "./src/better_sqlite3.lzz" void Addon::Cleanup (void * ptr) -#line 40 "./src/better_sqlite3.lzz" +#line 42 "./src/better_sqlite3.lzz" { Addon* addon = static_cast(ptr); for (Database* db : addon->dbs) db->CloseHandles(); addon->dbs.clear(); delete addon; } -#line 47 "./src/better_sqlite3.lzz" +#line 49 "./src/better_sqlite3.lzz" Addon::Addon (v8::Isolate * isolate) -#line 47 "./src/better_sqlite3.lzz" +#line 49 "./src/better_sqlite3.lzz" : privileged_info (NULL), next_id (0), cs (isolate) -#line 50 "./src/better_sqlite3.lzz" +#line 52 "./src/better_sqlite3.lzz" {} #undef LZZ_INLINE diff --git a/src/better_sqlite3.hpp b/src/better_sqlite3.hpp index a3dcdc96..49d332b8 100644 --- a/src/better_sqlite3.hpp +++ b/src/better_sqlite3.hpp @@ -162,6 +162,8 @@ struct Addon; #line 21 "./src/better_sqlite3.lzz" class Statement; #line 22 "./src/better_sqlite3.lzz" +class TokenizerModule; +#line 23 "./src/better_sqlite3.lzz" class Backup; #line 1 "./src/objects/database.lzz" class Database : public node::ObjectWrap @@ -170,139 +172,141 @@ class Database : public node::ObjectWrap public: #line 4 "./src/objects/database.lzz" static v8::Local Init (v8::Isolate * isolate, v8::Local data); -#line 23 "./src/objects/database.lzz" +#line 24 "./src/objects/database.lzz" class CompareDatabase { -#line 23 "./src/objects/database.lzz" - public: #line 24 "./src/objects/database.lzz" + public: +#line 25 "./src/objects/database.lzz" bool operator () (Database const * const a, Database const * const b) const; }; -#line 28 "./src/objects/database.lzz" +#line 29 "./src/objects/database.lzz" class CompareStatement { -#line 28 "./src/objects/database.lzz" - public: #line 29 "./src/objects/database.lzz" + public: +#line 30 "./src/objects/database.lzz" bool operator () (Statement const * const a, Statement const * const b) const; }; -#line 33 "./src/objects/database.lzz" +#line 34 "./src/objects/database.lzz" class 
CompareBackup { -#line 33 "./src/objects/database.lzz" - public: #line 34 "./src/objects/database.lzz" + public: +#line 35 "./src/objects/database.lzz" bool operator () (Backup const * const a, Backup const * const b) const; }; -#line 40 "./src/objects/database.lzz" +#line 41 "./src/objects/database.lzz" void ThrowDatabaseError (); -#line 44 "./src/objects/database.lzz" +#line 45 "./src/objects/database.lzz" static void ThrowSqliteError (Addon * addon, sqlite3 * db_handle); -#line 48 "./src/objects/database.lzz" +#line 49 "./src/objects/database.lzz" static void ThrowSqliteError (Addon * addon, char const * message, int code); -#line 64 "./src/objects/database.lzz" +#line 65 "./src/objects/database.lzz" bool Log (v8::Isolate * isolate, sqlite3_stmt * handle); -#line 77 "./src/objects/database.lzz" - void AddStatement (Statement * stmt); #line 78 "./src/objects/database.lzz" + void AddStatement (Statement * stmt); +#line 79 "./src/objects/database.lzz" void RemoveStatement (Statement * stmt); -#line 81 "./src/objects/database.lzz" - void AddBackup (Backup * backup); #line 82 "./src/objects/database.lzz" + void AddBackup (Backup * backup); +#line 83 "./src/objects/database.lzz" void RemoveBackup (Backup * backup); -#line 86 "./src/objects/database.lzz" +#line 87 "./src/objects/database.lzz" struct State { -#line 87 "./src/objects/database.lzz" - bool const open; #line 88 "./src/objects/database.lzz" - bool busy; + bool const open; #line 89 "./src/objects/database.lzz" - bool const safe_ints; + bool busy; #line 90 "./src/objects/database.lzz" - bool const unsafe_mode; + bool const safe_ints; #line 91 "./src/objects/database.lzz" - bool was_js_error; + bool const unsafe_mode; #line 92 "./src/objects/database.lzz" - bool const has_logger; + bool was_js_error; #line 93 "./src/objects/database.lzz" - unsigned short int iterators; + bool const has_logger; #line 94 "./src/objects/database.lzz" + unsigned short int iterators; +#line 95 "./src/objects/database.lzz" Addon * 
const addon; }; -#line 96 "./src/objects/database.lzz" +#line 97 "./src/objects/database.lzz" State * GetState (); -#line 99 "./src/objects/database.lzz" +#line 100 "./src/objects/database.lzz" sqlite3 * GetHandle (); -#line 102 "./src/objects/database.lzz" +#line 103 "./src/objects/database.lzz" Addon * GetAddon (); -#line 107 "./src/objects/database.lzz" +#line 108 "./src/objects/database.lzz" void CloseHandles (); -#line 119 "./src/objects/database.lzz" +#line 120 "./src/objects/database.lzz" ~ Database (); -#line 124 "./src/objects/database.lzz" +#line 125 "./src/objects/database.lzz" private: -#line 126 "./src/objects/database.lzz" +#line 127 "./src/objects/database.lzz" explicit Database (v8::Isolate * isolate, Addon * addon, sqlite3 * db_handle, v8::Local logger); -#line 149 "./src/objects/database.lzz" +#line 150 "./src/objects/database.lzz" static void JS_new (v8::FunctionCallbackInfo const & info); -#line 201 "./src/objects/database.lzz" +#line 202 "./src/objects/database.lzz" static void JS_prepare (v8::FunctionCallbackInfo const & info); -#line 217 "./src/objects/database.lzz" +#line 218 "./src/objects/database.lzz" static void JS_exec (v8::FunctionCallbackInfo const & info); -#line 257 "./src/objects/database.lzz" +#line 258 "./src/objects/database.lzz" static void JS_backup (v8::FunctionCallbackInfo const & info); -#line 275 "./src/objects/database.lzz" +#line 276 "./src/objects/database.lzz" static void JS_serialize (v8::FunctionCallbackInfo const & info); -#line 297 "./src/objects/database.lzz" +#line 298 "./src/objects/database.lzz" static void JS_function (v8::FunctionCallbackInfo const & info); -#line 321 "./src/objects/database.lzz" +#line 322 "./src/objects/database.lzz" static void JS_aggregate (v8::FunctionCallbackInfo const & info); -#line 350 "./src/objects/database.lzz" +#line 351 "./src/objects/database.lzz" static void JS_table (v8::FunctionCallbackInfo const & info); -#line 370 "./src/objects/database.lzz" +#line 371 
"./src/objects/database.lzz" static void JS_loadExtension (v8::FunctionCallbackInfo const & info); -#line 392 "./src/objects/database.lzz" +#line 393 "./src/objects/database.lzz" static void JS_close (v8::FunctionCallbackInfo const & info); -#line 402 "./src/objects/database.lzz" +#line 403 "./src/objects/database.lzz" static void JS_defaultSafeIntegers (v8::FunctionCallbackInfo const & info); -#line 408 "./src/objects/database.lzz" +#line 409 "./src/objects/database.lzz" static void JS_unsafeMode (v8::FunctionCallbackInfo const & info); -#line 415 "./src/objects/database.lzz" +#line 416 "./src/objects/database.lzz" + static void JS_createFTS5Tokenizer (v8::FunctionCallbackInfo const & info); +#line 452 "./src/objects/database.lzz" static void JS_open (v8::Local _, v8::PropertyCallbackInfo const & info); -#line 419 "./src/objects/database.lzz" +#line 456 "./src/objects/database.lzz" static void JS_inTransaction (v8::Local _, v8::PropertyCallbackInfo const & info); -#line 424 "./src/objects/database.lzz" +#line 461 "./src/objects/database.lzz" static bool Deserialize (v8::Local buffer, Addon * addon, sqlite3 * db_handle, bool readonly); -#line 449 "./src/objects/database.lzz" +#line 486 "./src/objects/database.lzz" static void FreeSerialization (char * data, void * _); -#line 453 "./src/objects/database.lzz" +#line 490 "./src/objects/database.lzz" static int const MAX_BUFFER_SIZE = node::Buffer::kMaxLength > INT_MAX ? INT_MAX : static_cast(node::Buffer::kMaxLength); -#line 454 "./src/objects/database.lzz" +#line 491 "./src/objects/database.lzz" static int const MAX_STRING_SIZE = v8::String::kMaxLength > INT_MAX ? 
INT_MAX : static_cast(v8::String::kMaxLength); -#line 456 "./src/objects/database.lzz" +#line 493 "./src/objects/database.lzz" sqlite3 * const db_handle; -#line 457 "./src/objects/database.lzz" +#line 494 "./src/objects/database.lzz" bool open; -#line 458 "./src/objects/database.lzz" +#line 495 "./src/objects/database.lzz" bool busy; -#line 459 "./src/objects/database.lzz" +#line 496 "./src/objects/database.lzz" bool safe_ints; -#line 460 "./src/objects/database.lzz" +#line 497 "./src/objects/database.lzz" bool unsafe_mode; -#line 461 "./src/objects/database.lzz" +#line 498 "./src/objects/database.lzz" bool was_js_error; -#line 462 "./src/objects/database.lzz" +#line 499 "./src/objects/database.lzz" bool const has_logger; -#line 463 "./src/objects/database.lzz" +#line 500 "./src/objects/database.lzz" unsigned short int iterators; -#line 464 "./src/objects/database.lzz" +#line 501 "./src/objects/database.lzz" Addon * const addon; -#line 465 "./src/objects/database.lzz" +#line 502 "./src/objects/database.lzz" CopyablePersistent const logger; -#line 466 "./src/objects/database.lzz" +#line 503 "./src/objects/database.lzz" std::set stmts; -#line 467 "./src/objects/database.lzz" +#line 504 "./src/objects/database.lzz" std::set backups; }; #line 1 "./src/objects/statement.lzz" @@ -469,6 +473,52 @@ class Backup : public node::ObjectWrap #line 137 "./src/objects/backup.lzz" bool unlink; }; +#line 1 "./src/objects/tokenizer.lzz" +class Tokenizer +{ +#line 2 "./src/objects/tokenizer.lzz" +public: +#line 3 "./src/objects/tokenizer.lzz" + Tokenizer (v8::Isolate * isolate, v8::Local run_fn); +#line 10 "./src/objects/tokenizer.lzz" + ~ Tokenizer (); +#line 12 "./src/objects/tokenizer.lzz" + int Run (void * pCtx, char const * pText, int nText, int (* xToken) (void *, int, char const *, int, int, int)); +#line 79 "./src/objects/tokenizer.lzz" +private: +#line 80 "./src/objects/tokenizer.lzz" + v8::Isolate * isolate; +#line 81 "./src/objects/tokenizer.lzz" + CopyablePersistent const 
run_fn; +}; +#line 84 "./src/objects/tokenizer.lzz" +class TokenizerModule +{ +#line 85 "./src/objects/tokenizer.lzz" +public: +#line 86 "./src/objects/tokenizer.lzz" + TokenizerModule (v8::Isolate * isolate, v8::Local create_instance_fn); +#line 92 "./src/objects/tokenizer.lzz" + static void xDestroy (void * pCtx); +#line 97 "./src/objects/tokenizer.lzz" + fts5_tokenizer * get_api_object (); +#line 101 "./src/objects/tokenizer.lzz" +private: +#line 102 "./src/objects/tokenizer.lzz" + Tokenizer * CreateInstance (char const * * azArg, int nArg); +#line 123 "./src/objects/tokenizer.lzz" + static int xCreate (void * pCtx, char const * * azArg, int nArg, Fts5Tokenizer * * ppOut); +#line 130 "./src/objects/tokenizer.lzz" + static void xDelete (Fts5Tokenizer * tokenizer); +#line 135 "./src/objects/tokenizer.lzz" + static int xTokenize (Fts5Tokenizer * tokenizer, void * pCtx, int flags, char const * pText, int nText, int (* xToken) (void *, int, char const *, int, int, int)); +#line 150 "./src/objects/tokenizer.lzz" + static fts5_tokenizer api_object; +#line 156 "./src/objects/tokenizer.lzz" + v8::Isolate * isolate; +#line 157 "./src/objects/tokenizer.lzz" + CopyablePersistent const create_instance_fn; +}; #line 1 "./src/util/data-converter.lzz" class DataConverter { @@ -775,32 +825,32 @@ class Binder #line 203 "./src/util/binder.lzz" bool success; }; -#line 34 "./src/better_sqlite3.lzz" +#line 36 "./src/better_sqlite3.lzz" struct Addon { -#line 35 "./src/better_sqlite3.lzz" +#line 37 "./src/better_sqlite3.lzz" static void JS_setErrorConstructor (v8::FunctionCallbackInfo const & info); -#line 40 "./src/better_sqlite3.lzz" +#line 42 "./src/better_sqlite3.lzz" static void Cleanup (void * ptr); -#line 47 "./src/better_sqlite3.lzz" +#line 49 "./src/better_sqlite3.lzz" explicit Addon (v8::Isolate * isolate); -#line 52 "./src/better_sqlite3.lzz" +#line 54 "./src/better_sqlite3.lzz" sqlite3_uint64 NextId (); -#line 56 "./src/better_sqlite3.lzz" +#line 58 
"./src/better_sqlite3.lzz" CopyablePersistent Statement; -#line 57 "./src/better_sqlite3.lzz" +#line 59 "./src/better_sqlite3.lzz" CopyablePersistent StatementIterator; -#line 58 "./src/better_sqlite3.lzz" +#line 60 "./src/better_sqlite3.lzz" CopyablePersistent Backup; -#line 59 "./src/better_sqlite3.lzz" +#line 61 "./src/better_sqlite3.lzz" CopyablePersistent SqliteError; -#line 60 "./src/better_sqlite3.lzz" +#line 62 "./src/better_sqlite3.lzz" v8::FunctionCallbackInfo const * privileged_info; -#line 61 "./src/better_sqlite3.lzz" +#line 63 "./src/better_sqlite3.lzz" sqlite3_uint64 next_id; -#line 62 "./src/better_sqlite3.lzz" +#line 64 "./src/better_sqlite3.lzz" CS cs; -#line 63 "./src/better_sqlite3.lzz" +#line 65 "./src/better_sqlite3.lzz" std::set dbs; }; #line 16 "./src/util/macros.lzz" @@ -880,41 +930,41 @@ LZZ_INLINE int BindMap::GetSize () { return length; } -#line 77 "./src/objects/database.lzz" +#line 78 "./src/objects/database.lzz" LZZ_INLINE void Database::AddStatement (Statement * stmt) -#line 77 "./src/objects/database.lzz" +#line 78 "./src/objects/database.lzz" { stmts.insert(stmts.end(), stmt); } -#line 78 "./src/objects/database.lzz" +#line 79 "./src/objects/database.lzz" LZZ_INLINE void Database::RemoveStatement (Statement * stmt) -#line 78 "./src/objects/database.lzz" +#line 79 "./src/objects/database.lzz" { stmts.erase(stmt); } -#line 81 "./src/objects/database.lzz" +#line 82 "./src/objects/database.lzz" LZZ_INLINE void Database::AddBackup (Backup * backup) -#line 81 "./src/objects/database.lzz" +#line 82 "./src/objects/database.lzz" { backups.insert(backups.end(), backup); } -#line 82 "./src/objects/database.lzz" +#line 83 "./src/objects/database.lzz" LZZ_INLINE void Database::RemoveBackup (Backup * backup) -#line 82 "./src/objects/database.lzz" +#line 83 "./src/objects/database.lzz" { backups.erase(backup); } -#line 96 "./src/objects/database.lzz" +#line 97 "./src/objects/database.lzz" LZZ_INLINE Database::State * Database::GetState () -#line 
96 "./src/objects/database.lzz" +#line 97 "./src/objects/database.lzz" { return reinterpret_cast(&open); } -#line 99 "./src/objects/database.lzz" +#line 100 "./src/objects/database.lzz" LZZ_INLINE sqlite3 * Database::GetHandle () -#line 99 "./src/objects/database.lzz" +#line 100 "./src/objects/database.lzz" { return db_handle; } -#line 102 "./src/objects/database.lzz" +#line 103 "./src/objects/database.lzz" LZZ_INLINE Addon * Database::GetAddon () -#line 102 "./src/objects/database.lzz" +#line 103 "./src/objects/database.lzz" { return addon; } @@ -945,6 +995,12 @@ LZZ_INLINE bool Backup::Compare (Backup const * const a, Backup const * const b) { return a->id < b->id; } +#line 97 "./src/objects/tokenizer.lzz" +LZZ_INLINE fts5_tokenizer * TokenizerModule::get_api_object () +#line 97 "./src/objects/tokenizer.lzz" + { + return &api_object; +} #line 39 "./src/util/custom-aggregate.lzz" LZZ_INLINE void CustomAggregate::xStepBase (sqlite3_context * invocation, int argc, sqlite3_value * * argv, CopyablePersistent const CustomAggregate::* ptrtm) #line 39 "./src/util/custom-aggregate.lzz" @@ -1022,9 +1078,9 @@ LZZ_INLINE CustomTable::VTab * CustomTable::Cursor::GetVTab () { return VTab::Upcast(base.pVtab); } -#line 52 "./src/better_sqlite3.lzz" +#line 54 "./src/better_sqlite3.lzz" LZZ_INLINE sqlite3_uint64 Addon::NextId () -#line 52 "./src/better_sqlite3.lzz" +#line 54 "./src/better_sqlite3.lzz" { return next_id++; } diff --git a/src/better_sqlite3.lzz b/src/better_sqlite3.lzz index aae5d10a..1f20ffb3 100644 --- a/src/better_sqlite3.lzz +++ b/src/better_sqlite3.lzz @@ -19,11 +19,13 @@ #insert "util/bind-map.lzz" struct Addon; class Statement; +class TokenizerModule; class Backup; #insert "objects/database.lzz" #insert "objects/statement.lzz" #insert "objects/statement-iterator.lzz" #insert "objects/backup.lzz" +#insert "objects/tokenizer.lzz" #insert "util/data-converter.lzz" #insert "util/custom-function.lzz" #insert "util/custom-aggregate.lzz" diff --git 
a/src/objects/database.lzz b/src/objects/database.lzz
index 94b1d455..395cdaa1 100644
--- a/src/objects/database.lzz
+++ b/src/objects/database.lzz
@@ -14,6 +15,7 @@ public:
 		SetPrototypeMethod(isolate, data, t, "close", JS_close);
 		SetPrototypeMethod(isolate, data, t, "defaultSafeIntegers", JS_defaultSafeIntegers);
 		SetPrototypeMethod(isolate, data, t, "unsafeMode", JS_unsafeMode);
+		SetPrototypeMethod(isolate, data, t, "createFTS5Tokenizer", JS_createFTS5Tokenizer);
 		SetPrototypeGetter(isolate, data, t, "open", JS_open);
 		SetPrototypeGetter(isolate, data, t, "inTransaction", JS_inTransaction);
 		return t->GetFunction(OnlyContext).ToLocalChecked();
@@ -412,6 +413,56 @@ private:
 		sqlite3_db_config(db->db_handle, SQLITE_DBCONFIG_DEFENSIVE, static_cast<int>(!db->unsafe_mode), NULL);
 	}
 
+	// Registers a JavaScript-backed FTS5 tokenizer under the given name.
+	// Arguments: (name: string, create_instance_fn: function). The function
+	// receives the tokenizer arguments as an array of strings and must
+	// return the function that tokenizes a single string.
+	NODE_METHOD(JS_createFTS5Tokenizer) {
+		UseAddon;
+		UseIsolate;
+
+		Database* db = Unwrap<Database>(info.This());
+		REQUIRE_ARGUMENT_STRING(first, v8::Local<v8::String> name);
+		REQUIRE_ARGUMENT_FUNCTION(second, v8::Local<v8::Function> create_instance_fn);
+
+		// Get fts5_api object
+		int rc;
+		sqlite3_stmt *pStmt = nullptr;
+
+		rc = sqlite3_prepare(db->db_handle, "SELECT fts5(?1)", -1, &pStmt, 0);
+		if (rc != SQLITE_OK) {
+			ThrowSqliteError(addon, db->db_handle);
+			return;
+		}
+
+		fts5_api *fts5 = nullptr;
+		sqlite3_bind_pointer(pStmt, 1, (void*)&fts5, "fts5_api_ptr", nullptr);
+		sqlite3_step(pStmt);
+		rc = sqlite3_finalize(pStmt);
+		if (rc != SQLITE_OK) {
+			ThrowSqliteError(addon, db->db_handle);
+			return;
+		}
+
+		// assert() is compiled out of release builds, so report a missing
+		// fts5_api pointer as an error instead of dereferencing NULL below.
+		if (fts5 == nullptr) {
+			ThrowSqliteError(addon, "FTS5 API is not available", SQLITE_ERROR);
+			return;
+		}
+
+		TokenizerModule* t = new TokenizerModule(isolate, create_instance_fn);
+
+		v8::String::Utf8Value utf8(isolate, name);
+		rc = fts5->xCreateTokenizer(fts5, *utf8, t, t->get_api_object(),
+				&TokenizerModule::xDestroy);
+		if (rc != SQLITE_OK) {
+			// NOTE(review): FTS5 does not appear to call xDestroy when
+			// registration fails, so the module is freed here - confirm.
+			delete t;
+			ThrowSqliteError(addon, "Failed to register FTS5 tokenizer", rc);
+		}
+	}
+
 	NODE_GETTER(JS_open) {
 		info.GetReturnValue().Set(Unwrap<Database>(info.This())->open);
 	}
diff --git a/src/objects/tokenizer.lzz b/src/objects/tokenizer.lzz
new file mode 100644
index 00000000..15144fcd
--- /dev/null
+++ b/src/objects/tokenizer.lzz
@@ -0,0 +1,201 @@
+class Tokenizer {
+public:
+  // Wraps the JS function that tokenizes a single string.
+  Tokenizer(
+    v8::Isolate* isolate,
+    v8::Local<v8::Function> run_fn
+  ): isolate(isolate),
+     run_fn(isolate, run_fn) {
+  }
+
+  ~Tokenizer() {}
+
+  // Tokenizes `pText` by calling the JS function and forwarding every token
+  // it reports to `xToken`.
+  //
+  // The JS function must return a flat array of [start, end, normalized]
+  // triples: `start`/`end` are byte offsets into `pText`, and `normalized` is
+  // the token text, or any non-string value to reuse the original bytes.
+  //
+  // Returns SQLITE_OK on success, SQLITE_ERROR when JS threw (the exception
+  // is left pending), SQLITE_MISUSE when the returned value is malformed, or
+  // the first non-OK code returned by `xToken`.
+  int Run(
+    void* pCtx,
+    const char *pText,
+    int nText,
+    int (*xToken)(
+      void* pCtx, int tflags, const char* pToken, int nToken,
+      int iStart, int iEnd)
+  ) {
+    v8::HandleScope scope(isolate);
+    UseContext;
+
+    v8::Local<v8::Value> arg[] = {
+      StringFromUtf8(isolate, pText, nText)
+    };
+    // ToLocalChecked() would abort the whole process if the tokenizer threw.
+    v8::Local<v8::Value> result;
+    if (!run_fn.Get(isolate)->Call(
+        ctx,
+        v8::Undefined(isolate),
+        1,
+        arg).ToLocal(&result)) {
+      return SQLITE_ERROR;
+    }
+    if (!result->IsArray()) {
+      ThrowTypeError("Expected array return value of tokenizer");
+      return SQLITE_MISUSE;
+    }
+    v8::Local<v8::Array> indices = result.As<v8::Array>();
+    int len = indices->Length();
+    if (len % 3 != 0) {
+      return SQLITE_MISUSE;
+    }
+    for (int i = 0; i < len; i += 3) {
+      v8::Local<v8::Value> start_value;
+      v8::Local<v8::Value> end_value;
+      v8::Local<v8::Value> maybe_normalized;
+      int64_t segment_start;
+      int64_t segment_end;
+      // Element access and number conversion can run user code (getters,
+      // valueOf), so each step may fail with a pending exception.
+      if (!indices->Get(ctx, i).ToLocal(&start_value) ||
+          !start_value->IntegerValue(ctx).To(&segment_start) ||
+          !indices->Get(ctx, i + 1).ToLocal(&end_value) ||
+          !end_value->IntegerValue(ctx).To(&segment_end) ||
+          !indices->Get(ctx, i + 2).ToLocal(&maybe_normalized)) {
+        return SQLITE_ERROR;
+      }
+
+      // Offsets are validated as 64-bit values before any narrowing so an
+      // oversized value cannot wrap around into the valid range.
+      if (segment_start < 0 || segment_start > nText) {
+        return SQLITE_MISUSE;
+      }
+      if (segment_end < 0 || segment_end > nText) {
+        return SQLITE_MISUSE;
+      }
+      if (segment_start > segment_end) {
+        return SQLITE_MISUSE;
+      }
+      int token_start = static_cast<int>(segment_start);
+      int token_end = static_cast<int>(segment_end);
+
+      int rc;
+      if (maybe_normalized->IsString()) {
+        v8::String::Utf8Value normalized(isolate, maybe_normalized);
+        rc = xToken(
+          pCtx, 0, *normalized, normalized.length(),
+          token_start, token_end);
+      } else {
+        // Optimization: if `maybe_normalized` is not provided - use original
+        // input string to avoid copying data.
+        rc = xToken(
+          pCtx, 0, &pText[token_start], token_end - token_start,
+          token_start, token_end);
+      }
+
+      if (rc != SQLITE_OK) {
+        return rc;
+      }
+    }
+    return SQLITE_OK;
+  }
+
+private:
+  v8::Isolate* isolate;
+  const CopyablePersistent<v8::Function> run_fn;
+}
+
+class TokenizerModule {
+public:
+  // Holds the JS factory that xCreate invokes to build a Tokenizer.
+  TokenizerModule(
+    v8::Isolate* isolate,
+    v8::Local<v8::Function> create_instance_fn
+  ): isolate(isolate), create_instance_fn(isolate, create_instance_fn) {
+  }
+
+  // Destructor callback handed to xCreateTokenizer; frees the module.
+  static void xDestroy(void* pCtx) {
+    TokenizerModule* m = static_cast<TokenizerModule*>(pCtx);
+    delete m;
+  }
+
+  inline fts5_tokenizer* get_api_object() {
+    return &api_object;
+  }
+
+private:
+  // Calls the JS factory with the tokenizer arguments and wraps the function
+  // it returns. Returns nullptr, with a JS exception pending, when the
+  // factory throws or returns something that is not a function.
+  Tokenizer* CreateInstance(const char** azArg, int nArg) {
+    v8::HandleScope scope(isolate);
+    UseContext;
+
+    v8::Local<v8::Array> params = v8::Array::New(isolate, nArg);
+    for (int i = 0; i < nArg; i++) {
+      params->Set(ctx, i, StringFromUtf8(isolate, azArg[i], -1)).ToChecked();
+    }
+
+    v8::Local<v8::Value> arg[] = {
+      params,
+    };
+    v8::Local<v8::Value> run_fn;
+    if (!create_instance_fn.Get(isolate)->Call(
+        ctx,
+        v8::Undefined(isolate),
+        1,
+        arg).ToLocal(&run_fn)) {
+      return nullptr;
+    }
+    if (!run_fn->IsFunction()) {
+      ThrowTypeError("Expected function return value of tokenizer factory");
+      return nullptr;
+    }
+
+    return new Tokenizer(isolate, run_fn.As<v8::Function>());
+  }
+
+  static int xCreate(
+    void* pCtx, const char** azArg, int nArg, Fts5Tokenizer** ppOut) {
+    TokenizerModule* m = static_cast<TokenizerModule*>(pCtx);
+    Tokenizer* t = m->CreateInstance(azArg, nArg);
+    if (t == nullptr) {
+      return SQLITE_ERROR;
+    }
+    *ppOut = reinterpret_cast<Fts5Tokenizer*>(t);
+    return SQLITE_OK;
+  }
+
+  static void xDelete(Fts5Tokenizer* tokenizer) {
+    Tokenizer* t = reinterpret_cast<Tokenizer*>(tokenizer);
+    delete t;
+  }
+
+  static int xTokenize(
+    Fts5Tokenizer* tokenizer,
+    void *pCtx,
+    int flags,
+    const char *pText,
+    int nText,
+    int (*xToken)(
+      void* pCtx, int tflags, const char* pToken, int nToken,
+      int iStart, int iEnd)
+  ) {
+    Tokenizer* t = reinterpret_cast<Tokenizer*>(tokenizer);
+
+    return t->Run(pCtx, pText, nText, xToken);
+  }
+
+  static fts5_tokenizer api_object = {
+    .xCreate = &xCreate,
+    .xDelete = &xDelete,
+    .xTokenize = &xTokenize,
+  };
+
+  v8::Isolate* isolate;
+  const CopyablePersistent<v8::Function> create_instance_fn;
+};
diff --git
a/test/38.database.tokenizer.js b/test/38.database.tokenizer.js
new file mode 100644
index 00000000..bb178fdf
--- /dev/null
+++ b/test/38.database.tokenizer.js
@@ -0,0 +1,76 @@
+'use strict';
+const Database = require('../.');
+
+// Word-granularity segmenter shared by the test tokenizer below.
+const segmenter = new Intl.Segmenter([], {
+  granularity: 'word',
+});
+
+// Combining marks in U+0300-U+036F, which NFD normalization splits off.
+const DIACRITICS = /[\u0300-\u036f]/g;
+
+function removeDiacritics(str) {
+  return str.normalize('NFD').replace(DIACRITICS, '');
+}
+
+describe('Database#createFTS5Tokenizer()', function () {
+  beforeEach(function () {
+    this.db = new Database(':memory:');
+
+    this.db.createFTS5Tokenizer('js', class Tokenizer {
+      constructor(params) {
+        expect(params).to.eql(['arg1', 'arg2']);
+      }
+
+      // Returns a flat array of (start byte, end byte, normalized text)
+      // triples; a non-string third element keeps the original text.
+      run(str) {
+        const result = [];
+        let off = 0;
+        for (const seg of segmenter.segment(str)) {
+          const len = Buffer.byteLength(seg.segment);
+          if (seg.isWordLike) {
+            const normalized = removeDiacritics(seg.segment);
+            result.push(off, off + len, normalized === seg.segment ? undefined : normalized);
+          }
+          off += len;
+        }
+        return result;
+      }
+    });
+
+    this.db.prepare("CREATE VIRTUAL TABLE fts USING fts5(content, tokenize='js arg1 arg2')").run();
+    this.insertStmt = this.db.prepare("INSERT INTO fts (content) VALUES (?)");
+    this.lookupStmt = this.db.prepare(
+      "SELECT snippet(fts, -1, '[', ']', '...', 10) " +
+      "FROM fts " +
+      "WHERE content MATCH $query").pluck();
+  });
+  afterEach(function () {
+    this.db.close();
+  });
+
+  it("should support CJK symbols at the start", function () {
+    this.insertStmt.run("知识需要时间");
+    const rows = this.lookupStmt.all({ query: "知*" });
+    expect(rows).to.eql(["[知识]需要时间"]);
+  });
+
+  it("should support CJK symbols in the middle", function () {
+    this.insertStmt.run("知识需要时间");
+    const rows = this.lookupStmt.all({ query: "需*" });
+    expect(rows).to.eql(["知识[需要]时间"]);
+  });
+
+  it("should support normalization", function () {
+    this.insertStmt.run("dïācrîtįcs");
+    const rows = this.lookupStmt.all({ query: "diacritics*" });
+    expect(rows).to.eql(["[dïācrîtįcs]"]);
+  });
+
+  it("should support punctuation", function () {
+    this.insertStmt.run("hello!world! how are you?");
+    const rows = this.lookupStmt.all({ query: "h*" });
+    expect(rows).to.eql(["[hello]!world! [how] are you?"]);
+  });
+});