Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MOD-6540: Support EMPTY indexing for TEXT fields #4622

Merged
merged 11 commits into from
May 12, 2024
5 changes: 2 additions & 3 deletions src/doc_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,13 @@
return 1;
}

int DocTable_SetByteOffsets(DocTable *t, RSDocumentMetadata *dmd, RSByteOffsets *v) {
void DocTable_SetByteOffsets(RSDocumentMetadata *dmd, RSByteOffsets *v) {
if (!dmd) {
return 0;
return;

Check warning on line 205 in src/doc_table.c

View check run for this annotation

Codecov / codecov/patch

src/doc_table.c#L205

Added line #L205 was not covered by tests
}

dmd->byteOffsets = v;
dmd->flags |= Document_HasOffsetVector;
return 1;
}

/* Put a new document into the table, assign it an incremental id and store the metadata in the
Expand Down
2 changes: 1 addition & 1 deletion src/doc_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ int DocTable_SetSortingVector(DocTable *t, RSDocumentMetadata *dmd, RSSortingVec
/* Set the offset vector for a document. This contains the byte offsets of each token found in
* the document. This is used for highlighting
*/
int DocTable_SetByteOffsets(DocTable *t, RSDocumentMetadata *dmd, RSByteOffsets *offsets);
void DocTable_SetByteOffsets(RSDocumentMetadata *dmd, RSByteOffsets *offsets);

/** Get the docId of a key if it exists in the table, or 0 if it doesnt */
t_docId DocTable_GetId(const DocTable *dt, const char *s, size_t n);
Expand Down
25 changes: 10 additions & 15 deletions src/document.c
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@
}

aCtx->tokenizer = GetTokenizer(doc->language, aCtx->fwIdx->stemmer, sp->stopwords);
// aCtx->doc->docId = 0;
return aCtx;
}

Expand Down Expand Up @@ -321,10 +320,7 @@
}

void AddDocumentCtx_Free(RSAddDocumentCtx *aCtx) {
/**
* Free preprocessed data; this is the only reliable place
* to do it
*/
// Free preprocessed data; this is the only reliable place to do it.
for (size_t ii = 0; ii < aCtx->doc->numFields; ++ii) {
if (FIELD_IS_VALID(aCtx, ii)) {
if (FIELD_IS(aCtx->fspecs + ii, INDEXFLD_T_TAG) && aCtx->fdatas[ii].tags) {
Expand Down Expand Up @@ -405,6 +401,7 @@
size_t fl;
const char *c = DocumentField_GetValueCStr(field, &fl);
size_t valueCount = (field->unionType != FLD_VAR_T_ARRAY ? 1 : field->arrayLen);
bool indexesEmpty = FieldSpec_IndexesEmpty(fs);

if (FieldSpec_IsSortable(fs)) {
if (field->unionType != FLD_VAR_T_ARRAY) {
Expand Down Expand Up @@ -449,8 +446,12 @@
aCtx->tokenizer->Start(aCtx->tokenizer, (char *)c, fl, options);

Token tok = {0};
uint32_t newTokPos;
while (0 != (newTokPos = aCtx->tokenizer->Next(aCtx->tokenizer, &tok))) {
while (0 != aCtx->tokenizer->Next(aCtx->tokenizer, &tok)) {
if (strlen(tok.tok) == 0 && !indexesEmpty) {
raz-mon marked this conversation as resolved.
Show resolved Hide resolved
// Skip empty values if the field should not index them
// Empty tokens are returned only if the original value was empty
continue;
}
forwardIndexTokenFunc(&tokCtx, &tok);
}
uint32_t lastTokPos = aCtx->tokenizer->ctx.lastOffset;
Expand Down Expand Up @@ -544,16 +545,10 @@
return 0;
case FLD_VAR_T_ARRAY:
fdata->isMulti = 1;
// TODO: GEOMETRY - parse geometries from string
//fdata->arrGeometry = ...
return 0;
default:
return -1;
}

// TODO: GEOMETRY
// If this is a sortable geomtry value - copy the value to the sorting vector

}

FIELD_BULK_INDEXER(geometryIndexer) {
Expand Down Expand Up @@ -694,8 +689,8 @@
break;
case FLD_VAR_T_BLOB_ARRAY:
case FLD_VAR_T_NUM:
case FLD_VAR_T_GEOMETRY:
RS_LOG_ASSERT(0, "Oops");
RS_LOG_ASSERT(0, "Unsupported field type for GEO index");

Check warning on line 692 in src/document.c

View check run for this annotation

Codecov / codecov/patch

src/document.c#L692

Added line #L692 was not covered by tests
RS_LOG_ASSERT(0, "Unsupported field type GEOMETRIY for GEO index");
raz-mon marked this conversation as resolved.
Show resolved Hide resolved
}

const char *str = NULL;
Expand Down
2 changes: 1 addition & 1 deletion src/document_basic.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,11 @@ int Document_LoadSchemaFieldHash(Document *doc, RedisSearchCtx *sctx, QueryError
RedisModule_FreeString(sctx->redisCtx, payload_rms);
}

// Load indexed fields from the document
doc->fields = rm_calloc(nitems, sizeof(*doc->fields));
for (size_t ii = 0; ii < spec->numFields; ++ii) {
FieldSpec *field = &spec->fields[ii];
RedisModuleString *v = NULL;
// Hash command is not related to other type such as JSON
RedisModule_HashGet(k, REDISMODULE_HASH_CFIELDS, field->path, &v, NULL);
if (v == NULL) {
continue;
Expand Down
3 changes: 1 addition & 2 deletions src/forward_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ static void ForwardIndex_HandleToken(ForwardIndex *idx, const char *tok, size_t
// LG_DEBUG("token %.*s, hval %d\n", t.len, t.s, hval);
ForwardIndexEntry *h = NULL;
int isNew = 0;
uint32_t hash = hashKey(tok, tokLen);
uint32_t hash = hashKey(tok, tokLen); // NULL for ""
khIdxEntry *kh = makeEntry(idx, tok, tokLen, hash, &isNew);
h = &kh->ent;

Expand Down Expand Up @@ -295,7 +295,6 @@ ForwardIndexIterator ForwardIndex_Iterate(ForwardIndex *i) {
iter.hits = i->hits;
iter.curBucketIdx = 0;
iter.curEnt = NULL;
// khTable_Dump(iter.hits);
return iter;
}

Expand Down
12 changes: 6 additions & 6 deletions src/forward_index.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/

#ifndef __FORWARD_INDEX_H__
#define __FORWARD_INDEX_H__
#include "redisearch.h"
Expand Down
15 changes: 8 additions & 7 deletions src/fragmenter.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/
/*
* Copyright Redis Ltd. 2016 - present
* Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
* the Server Side Public License v1 (SSPLv1).
*/

#include "fragmenter.h"
#include "toksep.h"
#include "tokenize.h"
Expand Down Expand Up @@ -42,7 +42,8 @@ static int Fragment_HasTerm(const Fragment *frag, uint32_t termId) {
// If this is the first time the term appears in the fragment, increment the
// fragment's score by the term's score. Otherwise, increment it by half
// the fragment's score. This allows for better 'blended' results.
for (size_t ii = 0; ii < Fragment_GetNumTerms(frag); ii++) {
size_t n_terms = Fragment_GetNumTerms(frag);
for (size_t ii = 0; ii < n_terms; ii++) {
if (locs[ii].termId == termId) {
return 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/highlight_processor.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ static int hlpNext(ResultProcessor *rbase, SearchResult *r) {
// The current result should not contain an index result
const RSIndexResult *ir = r->indexResult ? r->indexResult : getIndexResult(rbase, r->docId);

// we can't work withot the inex result, just return QUEUED
// we can't work without the index result, just return QUEUED
if (!ir) {
return RS_RESULT_OK;
}
Expand Down
4 changes: 2 additions & 2 deletions src/index.c
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ void AddIntersectIterator(IndexIterator *parentIter, IndexIterator *childIter) {
ii->its[ii->num - 1] = childIter;
}

IndexIterator *NewIntersecIterator(IndexIterator **its_, size_t num, DocTable *dt,
IndexIterator *NewIntersectIterator(IndexIterator **its_, size_t num, DocTable *dt,
t_fieldMask fieldMask, int maxSlop, int inOrder, double weight) {
// printf("Creating new intersection iterator with fieldMask=%llx\n", fieldMask);
IntersectIterator *ctx = rm_calloc(1, sizeof(*ctx));
Expand Down Expand Up @@ -1302,7 +1302,7 @@ IndexIterator *NewOptionalIterator(IndexIterator *it, t_docId maxDocId, double w
return ret;
}

/* Wildcard iterator, matchin ALL documents in the index. This is used for one thing only -
/* Wildcard iterator, matching ALL documents in the index. This is used for one thing only -
* purely negative queries. If the root of the query is a negative expression, we cannot process
* it
* without a positive expression. So we create a wildcard iterator that basically just iterates
Expand Down
2 changes: 1 addition & 1 deletion src/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void UI_Foreach(IndexIterator *it, void (*callback)(IndexReader *it));
* negative number, we will allow at most maxSlop intervening positions between the terms. If
* maxSlop is set and inOrder is 1, we assert that the terms are in
* order. I.e anexact match has maxSlop of 0 and inOrder 1. */
IndexIterator *NewIntersecIterator(IndexIterator **its, size_t num, DocTable *t,
IndexIterator *NewIntersectIterator(IndexIterator **its, size_t num, DocTable *t,
t_fieldMask fieldMask, int maxSlop, int inOrder, double weight);

/* Add an iterator to an intersect iterator */
Expand Down
14 changes: 8 additions & 6 deletions src/indexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ static void writeCurEntries(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx, Re
RedisModuleKey *idxKey = NULL;
bool isNew;
InvertedIndex *invidx = Redis_OpenInvertedIndexEx(ctx, entry->term, entry->len, 1, &isNew, &idxKey);
if (isNew) {
if (isNew && strlen(entry->term) != 0) {
IndexSpec_AddTerm(spec, entry->term, entry->len);
}
if (invidx) {
Expand All @@ -108,9 +108,11 @@ static void writeCurEntries(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx, Re
}
}

if (spec->suffixMask & entry->fieldMask && entry->term[0] != STEM_PREFIX
&& entry->term[0] != PHONETIC_PREFIX
&& entry->term[0] != SYNONYM_PREFIX_CHAR) {
if (spec->suffixMask & entry->fieldMask
&& entry->term[0] != STEM_PREFIX
&& entry->term[0] != PHONETIC_PREFIX
&& entry->term[0] != SYNONYM_PREFIX_CHAR
&& strlen(entry->term) != 0) {
addSuffixTrie(spec->suffix, entry->term, entry->len);
}

Expand Down Expand Up @@ -198,7 +200,7 @@ static void doAssignIds(RSAddDocumentCtx *cur, RedisSearchCtx *ctx) {

if (cur->byteOffsets) {
ByteOffsetWriter_Move(&cur->offsetsWriter, cur->byteOffsets);
DocTable_SetByteOffsets(&spec->docs, md, cur->byteOffsets);
DocTable_SetByteOffsets(md, cur->byteOffsets);
cur->byteOffsets = NULL;
}
DMD_Return(md);
Expand Down Expand Up @@ -278,7 +280,7 @@ static void Indexer_Process(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx) {
Document *doc = aCtx->doc;

/**
* Document ID assignment:
* Document ID & sorting-vector assignment:
* In order to hold the GIL for as short a time as possible, we assign
* document IDs in bulk. We begin using the first document ID that is assumed
* to be zero.
Expand Down
7 changes: 3 additions & 4 deletions src/inverted_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -336,13 +336,13 @@
printf("InvertedIndex {\n");
++indent;
PRINT_INDENT(indent);
printf("numDocs %u, lastId %ld, size %u\n", idx->numDocs, idx->lastId, idx->size);
printf("numDocs %u, lastId %llu, size %u\n", idx->numDocs, idx->lastId, idx->size);

Check warning on line 339 in src/inverted_index.c

View check run for this annotation

Codecov / codecov/patch

src/inverted_index.c#L339

Added line #L339 was not covered by tests

RSIndexResult *res = NULL;
IndexReader *ir = NewNumericReader(NULL, idx, NULL ,0, 0, false);
while (INDEXREAD_OK == IR_Read(ir, &res)) {
PRINT_INDENT(indent);
printf("value %f, docId %lu\n", res->num.value, res->docId);
printf("value %f, docId %llu\n", res->num.value, res->docId);

Check warning on line 345 in src/inverted_index.c

View check run for this annotation

Codecov / codecov/patch

src/inverted_index.c#L345

Added line #L345 was not covered by tests
}
IR_Free(ir);
--indent;
Expand All @@ -356,7 +356,7 @@
printf("IndexBlock {\n");
++indent;
PRINT_INDENT(indent);
printf("numEntries %u, firstId %lu, lastId %lu, \n", b->numEntries, b->firstId, b->lastId);
printf("numEntries %u, firstId %llu, lastId %llu, \n", b->numEntries, b->firstId, b->lastId);

Check warning on line 359 in src/inverted_index.c

View check run for this annotation

Codecov / codecov/patch

src/inverted_index.c#L359

Added line #L359 was not covered by tests
--indent;
PRINT_INDENT(indent);
printf("}\n");
Expand Down Expand Up @@ -448,7 +448,6 @@
// 1. Full encoding - docId, freq, flags, offset
case Index_StoreFreqs | Index_StoreTermOffsets | Index_StoreFieldFlags:
return encodeFull;

case Index_StoreFreqs | Index_StoreTermOffsets | Index_StoreFieldFlags | Index_WideSchema:
return encodeFullWide;

Expand Down
22 changes: 3 additions & 19 deletions src/json.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,11 @@
}
break;
case JSONType_Object:
if (fieldType == INDEXFLD_T_GEOMETRY || (fieldType == INDEXFLD_T_TAG)) {
if (fieldType == INDEXFLD_T_GEOMETRY) {
// A GEOSHAPE field can be represented as GEOJSON "geoshape" object
// An EMPTY TAG field can index empty objects.
rv = REDISMODULE_OK;
} else {
QueryError_SetError(status, QUERY_EPARSEARGS, "Invalid JSON type: Object type can represent only GEOMETRY and EMPTY TAG fields");
QueryError_SetError(status, QUERY_EPARSEARGS, "Invalid JSON type: Object type can represent only GEOMETRY fields");
}
break;
// null type is not supported
Expand Down Expand Up @@ -525,22 +524,7 @@
}
break;
case JSONType_Object:
if (FieldSpec_IndexesEmpty(fs)) {
size_t len;
japi->getLen(json, &len);
if (len == 0) {
// We wish to index empty JSON objects as empty values
df->strlen = 0;
df->strval = rm_strdup("");
df->unionType = FLD_VAR_T_CSTR;
} else {
// Only empty objects are indexed for TAGs (that index empty values)
rv = REDISMODULE_ERR;
}
} else {
// Only empty objects are indexed for TAGs (that index empty values)
rv = REDISMODULE_ERR;
}
rv = REDISMODULE_ERR;

Check warning on line 527 in src/json.c

View check run for this annotation

Codecov / codecov/patch

src/json.c#L527

Added line #L527 was not covered by tests
break;
case JSONType__EOF:
RS_LOG_ASSERT(0, "Should not happen");
Expand Down
31 changes: 2 additions & 29 deletions src/numeric_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
++indent;

PRINT_INDENT(indent);
printf("numEntries %lu, numRanges %lu, lastDocId %ld\n", t->numEntries, t->numRanges, t->lastDocId);
printf("numEntries %lu, numRanges %lu, lastDocId %llu\n", t->numEntries, t->numRanges, t->lastDocId);

Check warning on line 38 in src/numeric_index.c

View check run for this annotation

Codecov / codecov/patch

src/numeric_index.c#L38

Added line #L38 was not covered by tests
NumericRangeNode_Dump(t->root, indent + 1);

--indent;
Expand Down Expand Up @@ -655,34 +655,7 @@

NumericRangeTree *OpenNumericIndex(RedisSearchCtx *ctx, RedisModuleString *keyName,
RedisModuleKey **idxKey) {

NumericRangeTree *t;
if (!ctx->spec->keysDict) {
RedisModuleKey *key_s = NULL;

if (!idxKey) {
idxKey = &key_s;
}

*idxKey = RedisModule_OpenKey(ctx->redisCtx, keyName, REDISMODULE_READ | REDISMODULE_WRITE);

int type = RedisModule_KeyType(*idxKey);
if (type != REDISMODULE_KEYTYPE_EMPTY &&
RedisModule_ModuleTypeGetType(*idxKey) != NumericIndexType) {
return NULL;
}

/* Create an empty value object if the key is currently empty. */
if (type == REDISMODULE_KEYTYPE_EMPTY) {
t = NewNumericRangeTree();
RedisModule_ModuleTypeSetValue((*idxKey), NumericIndexType, t);
} else {
t = RedisModule_ModuleTypeGetValue(*idxKey);
}
} else {
t = openNumericKeysDict(ctx->spec, keyName, 1);
}
return t;
return openNumericKeysDict(ctx->spec, keyName, 1);
raz-mon marked this conversation as resolved.
Show resolved Hide resolved
}

void __numericIndex_memUsageCallback(NumericRangeNode *n, void *ctx) {
Expand Down
14 changes: 8 additions & 6 deletions src/numeric_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ typedef struct {
size_t appearances;
} CardinalityValue;

/* A numeric range is a node in a numeric range tree, representing a range of values bunched
* together.
* Since we do not know the distribution of scores ahead, we use a splitting approach - we start
* with single value nodes, and when a node passes some cardinality we split it.
* We save the minimum and maximum values inside the node, and when we split we split by finding the
* median value */
/* A numeric range is a node in a numeric range tree, representing a range of
* values bunched together.
* Since we do not know the distribution of scores ahead, we use a splitting
* approach - we start with single value nodes, and when a node passes some
* cardinality we split it.
* We save the minimum and maximum values inside the node, and when we split we
* split by finding the median value.
*/
typedef struct {
double minVal;
double maxVal;
Expand Down
Loading
Loading