RediSearch · raz-mon · May 22, 2024 · May 12, 2024 · May 16, 2024 · May 16, 2024
diff --git a/commands.json b/commands.json
@@ -206,6 +206,12 @@
             "token": "WITHSUFFIXTRIE",
             "optional": true
           },
+          {
+            "name": "isempty",
+            "type": "pure-token",
+            "token": "ISEMPTY",
+            "optional": true
+          },
           {
             "name": "sortable",
             "type": "block",

diff --git a/src/doc_table.c b/src/doc_table.c
@@ -200,14 +200,13 @@
   return 1;
 }
 
-int DocTable_SetByteOffsets(DocTable *t, RSDocumentMetadata *dmd, RSByteOffsets *v) {
+void DocTable_SetByteOffsets(RSDocumentMetadata *dmd, RSByteOffsets *v) {
   if (!dmd) {
-    return 0;
+    return;
   }
 
   dmd->byteOffsets = v;
   dmd->flags |= Document_HasOffsetVector;
-  return 1;
 }
 
 /* Put a new document into the table, assign it an incremental id and store the metadata in the

diff --git a/src/doc_table.h b/src/doc_table.h
@@ -126,7 +126,7 @@ int DocTable_SetSortingVector(DocTable *t, RSDocumentMetadata *dmd, RSSortingVec
 /* Set the offset vector for a document. This contains the byte offsets of each token found in
  * the document. This is used for highlighting
  */
-int DocTable_SetByteOffsets(DocTable *t, RSDocumentMetadata *dmd, RSByteOffsets *offsets);
+void DocTable_SetByteOffsets(RSDocumentMetadata *dmd, RSByteOffsets *offsets);
 
 /** Get the docId of a key if it exists in the table, or 0 if it doesnt */
 t_docId DocTable_GetId(const DocTable *dt, const char *s, size_t n);

diff --git a/src/document.c b/src/document.c
@@ -219,7 +219,6 @@
   }
 
   aCtx->tokenizer = GetTokenizer(doc->language, aCtx->fwIdx->stemmer, sp->stopwords);
-//  aCtx->doc->docId = 0;
   return aCtx;
 }
 
@@ -321,10 +320,7 @@
 }
 
 void AddDocumentCtx_Free(RSAddDocumentCtx *aCtx) {
-  /**
-   * Free preprocessed data; this is the only reliable place
-   * to do it
-   */
+  // Free preprocessed data; this is the only reliable place to do it.
   for (size_t ii = 0; ii < aCtx->doc->numFields; ++ii) {
     if (FIELD_IS_VALID(aCtx, ii)) {
       if (FIELD_IS(aCtx->fspecs + ii, INDEXFLD_T_TAG) && aCtx->fdatas[ii].tags) {
@@ -405,6 +401,7 @@
   size_t fl;
   const char *c = DocumentField_GetValueCStr(field, &fl);
   size_t valueCount = (field->unionType != FLD_VAR_T_ARRAY ? 1 : field->arrayLen);
+  bool indexesEmpty = FieldSpec_IndexesEmpty(fs);
 
   if (FieldSpec_IsSortable(fs)) {
     if (field->unionType != FLD_VAR_T_ARRAY) {
@@ -449,8 +446,12 @@
       aCtx->tokenizer->Start(aCtx->tokenizer, (char *)c, fl, options);
 
       Token tok = {0};
-      uint32_t newTokPos;
-      while (0 != (newTokPos = aCtx->tokenizer->Next(aCtx->tokenizer, &tok))) {
+      while (0 != aCtx->tokenizer->Next(aCtx->tokenizer, &tok)) {
+        if (!indexesEmpty && strlen(tok.tok) == 0) {
+          // Skip empty values if the field should not index them
+          // Empty tokens are returned only if the original value was empty
+          continue;
+        }
         forwardIndexTokenFunc(&tokCtx, &tok);
       }
       uint32_t lastTokPos = aCtx->tokenizer->ctx.lastOffset;
@@ -544,16 +545,10 @@
       return 0;
     case FLD_VAR_T_ARRAY:
       fdata->isMulti = 1;
-      // TODO: GEOMETRY - parse geometries from string
-      //fdata->arrGeometry = ...
       return 0;
     default:
       return -1;
   }
-
-  // TODO: GEOMETRY
-  // If this is a sortable geomtry value - copy the value to the sorting vector
-
 }
 
 FIELD_BULK_INDEXER(geometryIndexer) {
@@ -694,8 +689,7 @@
       break;
     case FLD_VAR_T_BLOB_ARRAY:
     case FLD_VAR_T_NUM:
-    case FLD_VAR_T_GEOMETRY:
-      RS_LOG_ASSERT(0, "Oops");
+      RS_LOG_ASSERT(0, "Unsupported field type for GEO index");
   }
 
   const char *str = NULL;

diff --git a/src/document_basic.c b/src/document_basic.c
@@ -135,11 +135,11 @@ int Document_LoadSchemaFieldHash(Document *doc, RedisSearchCtx *sctx, QueryError
     RedisModule_FreeString(sctx->redisCtx, payload_rms);
   }
 
+  // Load indexed fields from the document
   doc->fields = rm_calloc(nitems, sizeof(*doc->fields));
   for (size_t ii = 0; ii < spec->numFields; ++ii) {
     FieldSpec *field = &spec->fields[ii];
     RedisModuleString *v = NULL;
-    // Hash command is not related to other type such as JSON
     RedisModule_HashGet(k, REDISMODULE_HASH_CFIELDS, field->path, &v, NULL);
     if (v == NULL) {
       continue;

diff --git a/src/forward_index.c b/src/forward_index.c
@@ -182,7 +182,7 @@ static void ForwardIndex_HandleToken(ForwardIndex *idx, const char *tok, size_t
   // LG_DEBUG("token %.*s, hval %d\n", t.len, t.s, hval);
   ForwardIndexEntry *h = NULL;
   int isNew = 0;
-  uint32_t hash = hashKey(tok, tokLen);
+  uint32_t hash = hashKey(tok, tokLen); // NULL for ""
   khIdxEntry *kh = makeEntry(idx, tok, tokLen, hash, &isNew);
   h = &kh->ent;
 
@@ -295,7 +295,6 @@ ForwardIndexIterator ForwardIndex_Iterate(ForwardIndex *i) {
   iter.hits = i->hits;
   iter.curBucketIdx = 0;
   iter.curEnt = NULL;
-  // khTable_Dump(iter.hits);
   return iter;
 }
 

diff --git a/src/forward_index.h b/src/forward_index.h
@@ -1,9 +1,9 @@
-/*
- * Copyright Redis Ltd. 2016 - present
- * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
- * the Server Side Public License v1 (SSPLv1).
- */
-
+/*
+ * Copyright Redis Ltd. 2016 - present
+ * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ * the Server Side Public License v1 (SSPLv1).
+ */
+
 #ifndef __FORWARD_INDEX_H__
 #define __FORWARD_INDEX_H__
 #include "redisearch.h"

diff --git a/src/fragmenter.c b/src/fragmenter.c
@@ -1,9 +1,9 @@
-/*
- * Copyright Redis Ltd. 2016 - present
- * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
- * the Server Side Public License v1 (SSPLv1).
- */
-
+/*
+ * Copyright Redis Ltd. 2016 - present
+ * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ * the Server Side Public License v1 (SSPLv1).
+ */
+
 #include "fragmenter.h"
 #include "toksep.h"
 #include "tokenize.h"
@@ -42,7 +42,8 @@ static int Fragment_HasTerm(const Fragment *frag, uint32_t termId) {
   // If this is the first time the term appears in the fragment, increment the
   // fragment's score by the term's score. Otherwise, increment it by half
   // the fragment's score. This allows for better 'blended' results.
-  for (size_t ii = 0; ii < Fragment_GetNumTerms(frag); ii++) {
+  size_t n_terms = Fragment_GetNumTerms(frag);
+  for (size_t ii = 0; ii < n_terms; ii++) {
     if (locs[ii].termId == termId) {
       return 1;
     }

diff --git a/src/highlight_processor.c b/src/highlight_processor.c
@@ -296,7 +296,7 @@ static int hlpNext(ResultProcessor *rbase, SearchResult *r) {
   // The current result should not contain an index result
   const RSIndexResult *ir = r->indexResult ? r->indexResult : getIndexResult(rbase, r->docId);
 
-  // we can't work withot the inex result, just return QUEUED
+  // we can't work without the index result, just return QUEUED
   if (!ir) {
     return RS_RESULT_OK;
   }

diff --git a/src/index.c b/src/index.c
@@ -709,7 +709,7 @@ void AddIntersectIterator(IndexIterator *parentIter, IndexIterator *childIter) {
   ii->its[ii->num - 1] = childIter;
 }
 
-IndexIterator *NewIntersecIterator(IndexIterator **its_, size_t num, DocTable *dt,
+IndexIterator *NewIntersectIterator(IndexIterator **its_, size_t num, DocTable *dt,
                                    t_fieldMask fieldMask, int maxSlop, int inOrder, double weight) {
   // printf("Creating new intersection iterator with fieldMask=%llx\n", fieldMask);
   IntersectIterator *ctx = rm_calloc(1, sizeof(*ctx));
@@ -1302,7 +1302,7 @@ IndexIterator *NewOptionalIterator(IndexIterator *it, t_docId maxDocId, double w
   return ret;
 }
 
-/* Wildcard iterator, matchin ALL documents in the index. This is used for one thing only -
+/* Wildcard iterator, matching ALL documents in the index. This is used for one thing only -
  * purely negative queries. If the root of the query is a negative expression, we cannot process
  * it
  * without a positive expression. So we create a wildcard iterator that basically just iterates

diff --git a/src/index.h b/src/index.h
@@ -56,7 +56,7 @@ void UI_Foreach(IndexIterator *it, void (*callback)(IndexReader *it));
  * negative number, we will allow at most maxSlop intervening positions between the terms. If
  * maxSlop is set and inOrder is 1, we assert that the terms are in
  * order. I.e anexact match has maxSlop of 0 and inOrder 1.  */
-IndexIterator *NewIntersecIterator(IndexIterator **its, size_t num, DocTable *t,
+IndexIterator *NewIntersectIterator(IndexIterator **its, size_t num, DocTable *t,
                                    t_fieldMask fieldMask, int maxSlop, int inOrder, double weight);
 
 /* Add an iterator to an intersect iterator */

diff --git a/src/indexer.c b/src/indexer.c
@@ -96,7 +96,7 @@ static void writeCurEntries(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx, Re
     RedisModuleKey *idxKey = NULL;
     bool isNew;
     InvertedIndex *invidx = Redis_OpenInvertedIndexEx(ctx, entry->term, entry->len, 1, &isNew, &idxKey);
-    if (isNew) {
+    if (isNew && strlen(entry->term) != 0) {
       IndexSpec_AddTerm(spec, entry->term, entry->len);
     }
     if (invidx) {
@@ -108,9 +108,11 @@ static void writeCurEntries(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx, Re
       }
     }
 
-    if (spec->suffixMask & entry->fieldMask && entry->term[0] != STEM_PREFIX
-                                            && entry->term[0] != PHONETIC_PREFIX
-                                            && entry->term[0] != SYNONYM_PREFIX_CHAR) {
+    if (spec->suffixMask & entry->fieldMask
+        && entry->term[0] != STEM_PREFIX
+        && entry->term[0] != PHONETIC_PREFIX
+        && entry->term[0] != SYNONYM_PREFIX_CHAR
+        && strlen(entry->term) != 0) {
       addSuffixTrie(spec->suffix, entry->term, entry->len);
     }
 
@@ -198,7 +200,7 @@ static void doAssignIds(RSAddDocumentCtx *cur, RedisSearchCtx *ctx) {
 
     if (cur->byteOffsets) {
       ByteOffsetWriter_Move(&cur->offsetsWriter, cur->byteOffsets);
-      DocTable_SetByteOffsets(&spec->docs, md, cur->byteOffsets);
+      DocTable_SetByteOffsets(md, cur->byteOffsets);
       cur->byteOffsets = NULL;
     }
     DMD_Return(md);
@@ -278,7 +280,7 @@ static void Indexer_Process(DocumentIndexer *indexer, RSAddDocumentCtx *aCtx) {
   Document *doc = aCtx->doc;
 
   /**
-   * Document ID assignment:
+   * Document ID & sorting-vector assignment:
    * In order to hold the GIL for as short a time as possible, we assign
    * document IDs in bulk. We begin using the first document ID that is assumed
    * to be zero.

diff --git a/src/inverted_index.c b/src/inverted_index.c
@@ -328,13 +328,13 @@
   printf("InvertedIndex {\n");
   ++indent;
   PRINT_INDENT(indent);
-  printf("numDocs %u, lastId %ld, size %u\n", idx->numDocs, idx->lastId, idx->size);
+  printf("numDocs %u, lastId %llu, size %u\n", idx->numDocs, idx->lastId, idx->size);
 
   RSIndexResult *res = NULL;
   IndexReader *ir = NewNumericReader(NULL, idx, NULL ,0, 0, false);
   while (INDEXREAD_OK == IR_Read(ir, &res)) {
     PRINT_INDENT(indent);
-    printf("value %f, docId %lu\n", res->num.value, res->docId);
+    printf("value %f, docId %llu\n", res->num.value, res->docId);
   }
   IR_Free(ir);
   --indent;
@@ -348,7 +348,7 @@
   printf("IndexBlock {\n");
   ++indent;
   PRINT_INDENT(indent);
-  printf("numEntries %u, firstId %lu, lastId %lu, \n", b->numEntries, b->firstId, b->lastId);
+  printf("numEntries %u, firstId %llu, lastId %llu, \n", b->numEntries, b->firstId, b->lastId);
   --indent;
   PRINT_INDENT(indent);
   printf("}\n");
@@ -440,7 +440,6 @@
     // 1. Full encoding - docId, freq, flags, offset
     case Index_StoreFreqs | Index_StoreTermOffsets | Index_StoreFieldFlags:
       return encodeFull;
-
     case Index_StoreFreqs | Index_StoreTermOffsets | Index_StoreFieldFlags | Index_WideSchema:
       return encodeFullWide;
 

diff --git a/src/json.c b/src/json.c
@@ -133,12 +133,11 @@
     }
     break;
   case JSONType_Object:
-    if (fieldType == INDEXFLD_T_GEOMETRY || (fieldType == INDEXFLD_T_TAG)) {
+    if (fieldType == INDEXFLD_T_GEOMETRY) {
       // A GEOSHAPE field can be represented as GEOJSON "geoshape" object
-      // An EMPTY TAG field can index empty objects.
       rv = REDISMODULE_OK;
     } else {
-      QueryError_SetError(status, QUERY_EPARSEARGS, "Invalid JSON type: Object type can represent only GEOMETRY and EMPTY TAG fields");
+      QueryError_SetError(status, QUERY_EPARSEARGS, "Invalid JSON type: Object type can represent only GEOMETRY fields");
     }
     break;
   // null type is not supported
@@ -525,22 +524,7 @@
       }
       break;
     case JSONType_Object:
-      if (FieldSpec_IndexesEmpty(fs)) {
-        size_t len;
-        japi->getLen(json, &len);
-        if (len == 0) {
-          // We wish to index empty JSON objects as empty values
-          df->strlen = 0;
-          df->strval = rm_strdup("");
-          df->unionType = FLD_VAR_T_CSTR;
-        } else {
-          // Only empty objects are indexed for TAGs (that index empty values)
-          rv = REDISMODULE_ERR;
-        }
-      } else {
-        // Only empty objects are indexed for TAGs (that index empty values)
-        rv = REDISMODULE_ERR;
-      }
+      rv = REDISMODULE_ERR;
       break;
     case JSONType__EOF:
       RS_LOG_ASSERT(0, "Should not happen");

diff --git a/src/numeric_index.c b/src/numeric_index.c
@@ -35,7 +35,7 @@
   ++indent;
 
   PRINT_INDENT(indent);
-  printf("numEntries %lu,  numRanges %lu, lastDocId %ld\n", t->numEntries, t->numRanges, t->lastDocId);
+  printf("numEntries %lu,  numRanges %lu, lastDocId %llu\n", t->numEntries, t->numRanges, t->lastDocId);
   NumericRangeNode_Dump(t->root, indent + 1);
 
   --indent;
@@ -657,35 +657,7 @@
 
 NumericRangeTree *OpenNumericIndex(RedisSearchCtx *ctx, RedisModuleString *keyName,
                                    RedisModuleKey **idxKey) {
-
-  NumericRangeTree *t;
-  if (!ctx->spec->keysDict) {
-    RedisModuleKey *key_s = NULL;
-
-    if (!idxKey) {
-      idxKey = &key_s;
-    }
-
-    *idxKey = RedisModule_OpenKey(ctx->redisCtx, keyName, REDISMODULE_READ | REDISMODULE_WRITE);
-
-    int type = RedisModule_KeyType(*idxKey);
-    if (type != REDISMODULE_KEYTYPE_EMPTY &&
-        RedisModule_ModuleTypeGetType(*idxKey) != NumericIndexType) {
-      return NULL;
-    }
-
-    /* Create an empty value object if the key is currently empty. */
-    if (type == REDISMODULE_KEYTYPE_EMPTY) {
-      t = NewNumericRangeTree();
-      ctx->spec->stats.invertedSize += t->root->range->invertedIndexSize;
-      RedisModule_ModuleTypeSetValue((*idxKey), NumericIndexType, t);
-    } else {
-      t = RedisModule_ModuleTypeGetValue(*idxKey);
-    }
-  } else {
-    t = openNumericKeysDict(ctx->spec, keyName, 1);
-  }
-  return t;
+  return openNumericKeysDict(ctx->spec, keyName, 1);
 }
 
 void __numericIndex_memUsageCallback(NumericRangeNode *n, void *ctx) {

diff --git a/src/numeric_index.h b/src/numeric_index.h
@@ -29,12 +29,14 @@ typedef struct {
   size_t appearances;
 } CardinalityValue;
 
-/* A numeric range is a node in a numeric range tree, representing a range of values bunched
- * together.
- * Since we do not know the distribution of scores ahead, we use a splitting approach - we start
- * with single value nodes, and when a node passes some cardinality we split it.
- * We save the minimum and maximum values inside the node, and when we split we split by finding the
- * median value */
+/* A numeric range is a node in a numeric range tree, representing a range of
+ * values bunched together.
+ * Since we do not know the distribution of scores ahead, we use a splitting
+ * approach - we start with single value nodes, and when a node passes some
+ * cardinality we split it.
+ * We save the minimum and maximum values inside the node, and when we split we
+ * split by finding the median value.
+ */
 typedef struct {
   double minVal;
   double maxVal;