Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MOD-6540: Support EMPTY indexing for TEXT fields (#4622) #4637

Merged
merged 5 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions commands.json
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@
"token": "WITHSUFFIXTRIE",
"optional": true
},
{
"name": "isempty",
"type": "pure-token",
"token": "ISEMPTY",
"optional": true
},
{
"name": "sortable",
"type": "block",
Expand Down
5 changes: 2 additions & 3 deletions src/doc_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,13 @@
return 1;
}

int DocTable_SetByteOffsets(DocTable *t, RSDocumentMetadata *dmd, RSByteOffsets *v) {
void DocTable_SetByteOffsets(RSDocumentMetadata *dmd, RSByteOffsets *v) {
if (!dmd) {
return 0;
return;

Check warning on line 205 in src/doc_table.c

View check run for this annotation

Codecov / codecov/patch

src/doc_table.c#L205

Added line #L205 was not covered by tests
}

dmd->byteOffsets = v;
dmd->flags |= Document_HasOffsetVector;
return 1;
}

/* Put a new document into the table, assign it an incremental id and store the metadata in the
Expand Down
2 changes: 1 addition & 1 deletion src/doc_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ int DocTable_SetSortingVector(DocTable *t, RSDocumentMetadata *dmd, RSSortingVec
/* Set the offset vector for a document. This contains the byte offsets of each token found in
* the document. This is used for highlighting
*/
int DocTable_SetByteOffsets(DocTable *t, RSDocumentMetadata *dmd, RSByteOffsets *offsets);
void DocTable_SetByteOffsets(RSDocumentMetadata *dmd, RSByteOffsets *offsets);

/** Get the docId of a key if it exists in the table, or 0 if it doesnt */
t_docId DocTable_GetId(const DocTable *dt, const char *s, size_t n);
Expand Down
24 changes: 9 additions & 15 deletions src/document.c
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@
}

aCtx->tokenizer = GetTokenizer(doc->language, aCtx->fwIdx->stemmer, sp->stopwords);
// aCtx->doc->docId = 0;
return aCtx;
}

Expand Down Expand Up @@ -321,10 +320,7 @@
}

void AddDocumentCtx_Free(RSAddDocumentCtx *aCtx) {
/**
* Free preprocessed data; this is the only reliable place
* to do it
*/
// Free preprocessed data; this is the only reliable place to do it.
for (size_t ii = 0; ii < aCtx->doc->numFields; ++ii) {
if (FIELD_IS_VALID(aCtx, ii)) {
if (FIELD_IS(aCtx->fspecs + ii, INDEXFLD_T_TAG) && aCtx->fdatas[ii].tags) {
Expand Down Expand Up @@ -405,6 +401,7 @@
size_t fl;
const char *c = DocumentField_GetValueCStr(field, &fl);
size_t valueCount = (field->unionType != FLD_VAR_T_ARRAY ? 1 : field->arrayLen);
bool indexesEmpty = FieldSpec_IndexesEmpty(fs);

if (FieldSpec_IsSortable(fs)) {
if (field->unionType != FLD_VAR_T_ARRAY) {
Expand Down Expand Up @@ -449,8 +446,12 @@
aCtx->tokenizer->Start(aCtx->tokenizer, (char *)c, fl, options);

Token tok = {0};
uint32_t newTokPos;
while (0 != (newTokPos = aCtx->tokenizer->Next(aCtx->tokenizer, &tok))) {
while (0 != aCtx->tokenizer->Next(aCtx->tokenizer, &tok)) {
if (!indexesEmpty && strlen(tok.tok) == 0) {
// Skip empty values if the field should not index them
// Empty tokens are returned only if the original value was empty
continue;
}
forwardIndexTokenFunc(&tokCtx, &tok);
}
uint32_t lastTokPos = aCtx->tokenizer->ctx.lastOffset;
Expand Down Expand Up @@ -544,16 +545,10 @@
return 0;
case FLD_VAR_T_ARRAY:
fdata->isMulti = 1;
// TODO: GEOMETRY - parse geometries from string
//fdata->arrGeometry = ...
return 0;
default:
return -1;
}

// TODO: GEOMETRY
// If this is a sortable geomtry value - copy the value to the sorting vector

}

FIELD_BULK_INDEXER(geometryIndexer) {
Expand Down Expand Up @@ -694,8 +689,7 @@
break;
case FLD_VAR_T_BLOB_ARRAY:
case FLD_VAR_T_NUM:
case FLD_VAR_T_GEOMETRY:
RS_LOG_ASSERT(0, "Oops");
RS_LOG_ASSERT(0, "Unsupported field type for GEO index");

Check warning on line 692 in src/document.c

View check run for this annotation

Codecov / codecov/patch

src/document.c#L692

Added line #L692 was not covered by tests
}

const char *str = NULL;
Expand Down
2 changes: 1 addition & 1 deletion src/document_basic.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,11 @@ int Document_LoadSchemaFieldHash(Document *doc, RedisSearchCtx *sctx, QueryError
RedisModule_FreeString(sctx->redisCtx, payload_rms);
}

// Load indexed fields from the document
doc->fields = rm_calloc(nitems, sizeof(*doc->fields));
for (size_t ii = 0; ii < spec->numFields; ++ii) {
FieldSpec *field = &spec->fields[ii];
RedisModuleString *v = NULL;
// Hash command is not related to other type such as JSON
RedisModule_HashGet(k, REDISMODULE_HASH_CFIELDS, field->path, &v, NULL);
if (v == NULL) {
continue;
Expand Down
3 changes: 1 addition & 2 deletions src/forward_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ static void ForwardIndex_HandleToken(ForwardIndex *idx, const char *tok, size_t
// LG_DEBUG("token %.*s, hval %d\n", t.len, t.s, hval);
ForwardIndexEntry *h = NULL;
int isNew = 0;
uint32_t hash = hashKey(tok, tokLen);
uint32_t hash = hashKey(tok, tokLen); // NULL for ""
khIdxEntry *kh = makeEntry(idx, tok, tokLen, hash, &isNew);
h = &kh->ent;

Expand Down Expand Up @@ -295,7 +295,6 @@ ForwardIndexIterator ForwardIndex_Iterate(ForwardIndex *i) {
iter.hits = i->hits;
iter.curBucketIdx = 0;
iter.curEnt = NULL;
// khTable_Dump(iter.hits);
return iter;
}

Expand Down
12 changes: 6 additions & 6 deletions src/forward_index.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/

#ifndef __FORWARD_INDEX_H__
#define __FORWARD_INDEX_H__
#include "redisearch.h"
Expand Down
15 changes: 8 additions & 7 deletions src/fragmenter.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/

#include "fragmenter.h"
#include "toksep.h"
#include "tokenize.h"
Expand Down Expand Up @@ -42,7 +42,8 @@ static int Fragment_HasTerm(const Fragment *frag, uint32_t termId) {
// If this is the first time the term appears in the fragment, increment the
// fragment's score by the term's score. Otherwise, increment it by half
// the fragment's score. This allows for better 'blended' results.
for (size_t ii = 0; ii < Fragment_GetNumTerms(frag); ii++) {
size_t n_terms = Fragment_GetNumTerms(frag);
for (size_t ii = 0; ii < n_terms; ii++) {
if (locs[ii].termId == termId) {
return 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/highlight_processor.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ static int hlpNext(ResultProcessor *rbase, SearchResult *r) {
// The current result should not contain an index result
const RSIndexResult *ir = r->indexResult ? r->indexResult : getIndexResult(rbase, r->docId);

// we can't work withot the inex result, just return QUEUED
// we can't work without the index result, just return QUEUED
if (!ir) {
return RS_RESULT_OK;
}
Expand Down
4 changes: 2 additions & 2 deletions src/index.c
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ void AddIntersectIterator(IndexIterator *parentIter, IndexIterator *childIter) {
ii->its[ii->num - 1] = childIter;
}

IndexIterator *NewIntersecIterator(IndexIterator **its_, size_t num, DocTable *dt,
IndexIterator *NewIntersectIterator(IndexIterator **its_, size_t num, DocTable *dt,
t_fieldMask fieldMask, int maxSlop, int inOrder, double weight) {
// printf("Creating new intersection iterator with fieldMask=%llx\n", fieldMask);
IntersectIterator *ctx = rm_calloc(1, sizeof(*ctx));
Expand Down Expand Up @@ -1302,7 +1302,7 @@ IndexIterator *NewOptionalIterator(IndexIterator *it, t_docId maxDocId, double w
return ret;
}

/* Wildcard iterator, matchin ALL documents in the index. This is used for one thing only -
/* Wildcard iterator, matching ALL documents in the index. This is used for one thing only -
* purely negative queries. If the root of the query is a negative expression, we cannot process
* it
* without a positive expression. So we create a wildcard iterator that basically just iterates
Expand Down
2 changes: 1 addition & 1 deletion src/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void UI_Foreach(IndexIterator *it, void (*callback)(IndexReader *it));
* negative number, we will allow at most maxSlop intervening positions between the terms. If
* maxSlop is set and inOrder is 1, we assert that the terms are in
* order. I.e anexact match has maxSlop of 0 and inOrder 1. */
IndexIterator *NewIntersecIterator(IndexIterator **its, size_t num, DocTable *t,
IndexIterator *NewIntersectIterator(IndexIterator **its, size_t num, DocTable *t,
t_fieldMask fieldMask, int maxSlop, int inOrder, double weight);

/* Add an iterator to an intersect iterator */
Expand Down
14 changes: 8 additions & 6 deletions src/indexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ static void writeCurEntries(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx, Re
RedisModuleKey *idxKey = NULL;
bool isNew;
InvertedIndex *invidx = Redis_OpenInvertedIndexEx(ctx, entry->term, entry->len, 1, &isNew, &idxKey);
if (isNew) {
if (isNew && strlen(entry->term) != 0) {
IndexSpec_AddTerm(spec, entry->term, entry->len);
}
if (invidx) {
Expand All @@ -108,9 +108,11 @@ static void writeCurEntries(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx, Re
}
}

if (spec->suffixMask & entry->fieldMask && entry->term[0] != STEM_PREFIX
&& entry->term[0] != PHONETIC_PREFIX
&& entry->term[0] != SYNONYM_PREFIX_CHAR) {
if (spec->suffixMask & entry->fieldMask
&& entry->term[0] != STEM_PREFIX
&& entry->term[0] != PHONETIC_PREFIX
&& entry->term[0] != SYNONYM_PREFIX_CHAR
&& strlen(entry->term) != 0) {
addSuffixTrie(spec->suffix, entry->term, entry->len);
}

Expand Down Expand Up @@ -198,7 +200,7 @@ static void doAssignIds(RSAddDocumentCtx *cur, RedisSearchCtx *ctx) {

if (cur->byteOffsets) {
ByteOffsetWriter_Move(&cur->offsetsWriter, cur->byteOffsets);
DocTable_SetByteOffsets(&spec->docs, md, cur->byteOffsets);
DocTable_SetByteOffsets(md, cur->byteOffsets);
cur->byteOffsets = NULL;
}
DMD_Return(md);
Expand Down Expand Up @@ -278,7 +280,7 @@ static void Indexer_Process(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx) {
Document *doc = aCtx->doc;

/**
* Document ID assignment:
* Document ID & sorting-vector assignment:
* In order to hold the GIL for as short a time as possible, we assign
* document IDs in bulk. We begin using the first document ID that is assumed
* to be zero.
Expand Down
7 changes: 3 additions & 4 deletions src/inverted_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -328,13 +328,13 @@
printf("InvertedIndex {\n");
++indent;
PRINT_INDENT(indent);
printf("numDocs %u, lastId %ld, size %u\n", idx->numDocs, idx->lastId, idx->size);
printf("numDocs %u, lastId %llu, size %u\n", idx->numDocs, idx->lastId, idx->size);

Check warning on line 331 in src/inverted_index.c

View check run for this annotation

Codecov / codecov/patch

src/inverted_index.c#L331

Added line #L331 was not covered by tests

RSIndexResult *res = NULL;
IndexReader *ir = NewNumericReader(NULL, idx, NULL ,0, 0, false);
while (INDEXREAD_OK == IR_Read(ir, &res)) {
PRINT_INDENT(indent);
printf("value %f, docId %lu\n", res->num.value, res->docId);
printf("value %f, docId %llu\n", res->num.value, res->docId);

Check warning on line 337 in src/inverted_index.c

View check run for this annotation

Codecov / codecov/patch

src/inverted_index.c#L337

Added line #L337 was not covered by tests
}
IR_Free(ir);
--indent;
Expand All @@ -348,7 +348,7 @@
printf("IndexBlock {\n");
++indent;
PRINT_INDENT(indent);
printf("numEntries %u, firstId %lu, lastId %lu, \n", b->numEntries, b->firstId, b->lastId);
printf("numEntries %u, firstId %llu, lastId %llu, \n", b->numEntries, b->firstId, b->lastId);

Check warning on line 351 in src/inverted_index.c

View check run for this annotation

Codecov / codecov/patch

src/inverted_index.c#L351

Added line #L351 was not covered by tests
--indent;
PRINT_INDENT(indent);
printf("}\n");
Expand Down Expand Up @@ -440,7 +440,6 @@
// 1. Full encoding - docId, freq, flags, offset
case Index_StoreFreqs | Index_StoreTermOffsets | Index_StoreFieldFlags:
return encodeFull;

case Index_StoreFreqs | Index_StoreTermOffsets | Index_StoreFieldFlags | Index_WideSchema:
return encodeFullWide;

Expand Down
22 changes: 3 additions & 19 deletions src/json.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,11 @@
}
break;
case JSONType_Object:
if (fieldType == INDEXFLD_T_GEOMETRY || (fieldType == INDEXFLD_T_TAG)) {
if (fieldType == INDEXFLD_T_GEOMETRY) {
// A GEOSHAPE field can be represented as GEOJSON "geoshape" object
// An EMPTY TAG field can index empty objects.
rv = REDISMODULE_OK;
} else {
QueryError_SetError(status, QUERY_EPARSEARGS, "Invalid JSON type: Object type can represent only GEOMETRY and EMPTY TAG fields");
QueryError_SetError(status, QUERY_EPARSEARGS, "Invalid JSON type: Object type can represent only GEOMETRY fields");
}
break;
// null type is not supported
Expand Down Expand Up @@ -525,22 +524,7 @@
}
break;
case JSONType_Object:
if (FieldSpec_IndexesEmpty(fs)) {
size_t len;
japi->getLen(json, &len);
if (len == 0) {
// We wish to index empty JSON objects as empty values
df->strlen = 0;
df->strval = rm_strdup("");
df->unionType = FLD_VAR_T_CSTR;
} else {
// Only empty objects are indexed for TAGs (that index empty values)
rv = REDISMODULE_ERR;
}
} else {
// Only empty objects are indexed for TAGs (that index empty values)
rv = REDISMODULE_ERR;
}
rv = REDISMODULE_ERR;

Check warning on line 527 in src/json.c

View check run for this annotation

Codecov / codecov/patch

src/json.c#L527

Added line #L527 was not covered by tests
break;
case JSONType__EOF:
RS_LOG_ASSERT(0, "Should not happen");
Expand Down
32 changes: 2 additions & 30 deletions src/numeric_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
++indent;

PRINT_INDENT(indent);
printf("numEntries %lu, numRanges %lu, lastDocId %ld\n", t->numEntries, t->numRanges, t->lastDocId);
printf("numEntries %lu, numRanges %lu, lastDocId %llu\n", t->numEntries, t->numRanges, t->lastDocId);

Check warning on line 38 in src/numeric_index.c

View check run for this annotation

Codecov / codecov/patch

src/numeric_index.c#L38

Added line #L38 was not covered by tests
NumericRangeNode_Dump(t->root, indent + 1);

--indent;
Expand Down Expand Up @@ -657,35 +657,7 @@

NumericRangeTree *OpenNumericIndex(RedisSearchCtx *ctx, RedisModuleString *keyName,
RedisModuleKey **idxKey) {

NumericRangeTree *t;
if (!ctx->spec->keysDict) {
RedisModuleKey *key_s = NULL;

if (!idxKey) {
idxKey = &key_s;
}

*idxKey = RedisModule_OpenKey(ctx->redisCtx, keyName, REDISMODULE_READ | REDISMODULE_WRITE);

int type = RedisModule_KeyType(*idxKey);
if (type != REDISMODULE_KEYTYPE_EMPTY &&
RedisModule_ModuleTypeGetType(*idxKey) != NumericIndexType) {
return NULL;
}

/* Create an empty value object if the key is currently empty. */
if (type == REDISMODULE_KEYTYPE_EMPTY) {
t = NewNumericRangeTree();
ctx->spec->stats.invertedSize += t->root->range->invertedIndexSize;
RedisModule_ModuleTypeSetValue((*idxKey), NumericIndexType, t);
} else {
t = RedisModule_ModuleTypeGetValue(*idxKey);
}
} else {
t = openNumericKeysDict(ctx->spec, keyName, 1);
}
return t;
return openNumericKeysDict(ctx->spec, keyName, 1);
}

void __numericIndex_memUsageCallback(NumericRangeNode *n, void *ctx) {
Expand Down
14 changes: 8 additions & 6 deletions src/numeric_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ typedef struct {
size_t appearances;
} CardinalityValue;

/* A numeric range is a node in a numeric range tree, representing a range of values bunched
* together.
* Since we do not know the distribution of scores ahead, we use a splitting approach - we start
* with single value nodes, and when a node passes some cardinality we split it.
* We save the minimum and maximum values inside the node, and when we split we split by finding the
* median value */
/* A numeric range is a node in a numeric range tree, representing a range of
* values bunched together.
* Since we do not know the distribution of scores ahead, we use a splitting
* approach - we start with single value nodes, and when a node passes some
* cardinality we split it.
* We save the minimum and maximum values inside the node, and when we split we
* split by finding the median value.
*/
typedef struct {
double minVal;
double maxVal;
Expand Down
Loading
Loading