fixed union filtering with field masks

RediSearch · Dec 11, 2016 · bc6a18d · bc6a18d
1 parent 2bf78eb
commit bc6a18d
Show file tree

Hide file tree

Showing 10 changed files with 89 additions and 65 deletions.
diff --git a/src/index.c b/src/index.c
@@ -94,15 +94,17 @@ int IR_Read(void *ctx, IndexHit *e) {
 
   // add tf-idf score of the entry to the hit
   if (rc == INDEXREAD_OK) {
-    // LG_DEBUG("docId %d Flags 0x%x, field mask 0x%x, intersection: %x",
-    // e->docId, e->flags,
-    // ir->fieldMask, e->flags & ir->fieldMask);
+    // printf("docId %d Flags 0x%x, field mask 0x%x, intersection: %x\n",
+    //  e->docId, e->flags,
+    //  ir->fieldMask, e->flags & ir->fieldMask);
     if (!(e->flags & ir->fieldMask)) {
-      // LG_DEBUG("Skipping %d", e->docId);
+      // printf("Skipping %d\n", e->docId);
       return INDEXREAD_NOTFOUND;
     }
 
     e->totalFreq = tfidf(freq, ir->header.numDocs);
+    ++ir->len;
+    // printf("hit %d\n", ir->len);
   }
   e->type = H_RAW;
 
@@ -187,9 +189,17 @@ int IR_SkipTo(void *ctx, u_int32_t docId, IndexHit *hit) {
   return INDEXREAD_EOF;
 }
 
-size_t IR_NumDocs(void *ir) { 
-  //printf("num docs: %d\n", ((IndexReader *)ir)->header.numDocs);
-  return (size_t)((IndexReader *)ir)->header.numDocs; 
+size_t IR_NumDocs(void *ctx) {
+  IndexReader *ir = ctx;
+
+  // in single word optimized mode we only know the size of the record from the
+  // header.
+  if (ir->singleWordMode) {
+    return ir->header.numDocs;
+  }
+
+  // otherwise we use our counter
+  return ir->len;
 }
 
 IndexReader *NewIndexReader(void *data, size_t datalen, SkipIndex *si,
@@ -210,6 +220,7 @@ IndexReader *NewIndexReaderBuf(Buffer *buf, SkipIndex *si, DocTable *dt,
   ret->skipIdxPos = 0;
   ret->skipIdx = NULL;
   ret->docTable = dt;
+  ret->len = 0;
   ret->singleWordMode = singleWordMode;
   // only use score index on single words, no field filter and large entries
   ret->useScoreIndex = sci != NULL && singleWordMode && fieldMask == 0xff &&
@@ -416,45 +427,54 @@ int UI_Read(void *ctx, IndexHit *hit) {
     return 0;
   }
 
-  int minIdx = -1;
+
+  int numActive = 0;
   do {
     // find the minimal iterator
     t_docId minDocId = __UINT32_MAX__;
-    minIdx = -1;
+    int minIdx = -1;
+    numActive = 0;
+    int rc = INDEXREAD_EOF;
     for (int i = 0; i < ui->num; i++) {
       IndexIterator *it = ui->its[i];
 
       if (it == NULL)
         continue;
 
-      // if (it->HasNext(it->ctx)) {
-      // if this hit is behind the min id - read the next entry
-      if (ui->currentHits[i].docId <= ui->minDocId || ui->minDocId == 0) {
-        if (it->Read(it->ctx, &ui->currentHits[i]) != INDEXREAD_OK) {
-          continue;
+      rc = INDEXREAD_OK;
+      //if (it->HasNext(it->ctx)) {
+        // if this hit is behind the min id - read the next entry
+        if (ui->currentHits[i].docId <= ui->minDocId || ui->minDocId == 0) {
+          rc = INDEXREAD_NOTFOUND;
+          // read while we're not at the end and perhaps the flags do not match
+          while (rc == INDEXREAD_NOTFOUND) {
+            rc = it->Read(it->ctx, &ui->currentHits[i]);
+          }              
         }
-      }
-      if (ui->currentHits[i].docId < minDocId) {
-        minDocId = ui->currentHits[i].docId;
-        minIdx = i;
-      }
-      //}
+
+        if (rc != INDEXREAD_EOF) {
+          numActive++;
+        } 
+
+        if (rc == INDEXREAD_OK && ui->currentHits[i].docId < minDocId) {
+          minDocId = ui->currentHits[i].docId;
+          minIdx = i;
+        }
+//      }
+
     }
+
+    // take the minimum entry and yield it
+    if (minIdx != -1) {
 
-    // not found a new minimal docId
-    if (minIdx == -1) {
-      return INDEXREAD_EOF;
+      *hit = ui->currentHits[minIdx];
+      hit->type = H_UNION;
+      ui->minDocId = ui->currentHits[minIdx].docId;
+      ui->len++;
+      return INDEXREAD_OK;
     }
 
-    *hit = ui->currentHits[minIdx];
-    hit->type = H_UNION;
-    ui->minDocId = ui->currentHits[minIdx].docId;
-
-    ui->len++;
-
-    return INDEXREAD_OK;
-
-  } while (minIdx >= 0);
+  } while (numActive > 0);
 
   return INDEXREAD_EOF;
 }
@@ -540,9 +560,7 @@ void UnionIterator_Free(IndexIterator *it) {
   free(it);
 }
 
-size_t UI_Len(void *ctx) {
-  return ((UnionContext *)ctx)->len;
-}
+size_t UI_Len(void *ctx) { return ((UnionContext *)ctx)->len; }
 
 void ReadIterator_Free(IndexIterator *it) {
   if (it == NULL) {
@@ -752,7 +770,4 @@ int II_HasNext(void *ctx) {
 
 t_docId II_LastDocId(void *ctx) { return ((IntersectContext *)ctx)->lastDocId; }
 
-
-size_t II_Len(void *ctx) {
-  return ((IntersectContext *)ctx)->len;
-}
+size_t II_Len(void *ctx) { return ((IntersectContext *)ctx)->len; }
diff --git a/src/index.h b/src/index.h
@@ -83,6 +83,8 @@ typedef struct indexReader {
     ScoreIndex *scoreIndex;
     int useScoreIndex;
     u_char fieldMask;
+
+    size_t len;
 } IndexReader;
 
 /* An IndexWriter writes forward index entries to an index buffer */

diff --git a/src/numeric_index.c b/src/numeric_index.c
@@ -19,7 +19,7 @@ int numericFilter_Match(NumericFilter *f, double score) {
 #define NUMERIC_INDEX_KEY_FMT "num:%s/%s"
 
 RedisModuleString *fmtNumericIndexKey(RedisSearchCtx *ctx, const char *field) {
-    return RMUtil_CreateFormattedString(ctx->redisCtx, NUMERIC_INDEX_KEY_FMT, ctx->spec->name,
+    return RedisModule_CreateStringPrintf(ctx->redisCtx, NUMERIC_INDEX_KEY_FMT, ctx->spec->name,
                                         field);
 }
 
@@ -46,7 +46,7 @@ int NumerIndex_Add(NumericIndex *idx, t_docId docId, double score) {
     if (idx->key == NULL) return REDISMODULE_ERR;
 
     return RedisModule_ZsetAdd(idx->key, score,
-                               RMUtil_CreateFormattedString(idx->ctx->redisCtx, "%u", docId), NULL);
+                               RedisModule_CreateStringPrintf(idx->ctx->redisCtx, "%u", docId), NULL);
 }
 
 int NumericFilter_Read(void *ctx, IndexHit *e) {

diff --git a/src/query.c b/src/query.c
@@ -71,10 +71,11 @@ IndexIterator *query_EvalLoadStage(Query *q, QueryStage *stage) {
     // if there's only one word in the query and no special field filtering,
     // and we are not paging beyond MAX_SCOREINDEX_SIZE 
     // we can just use the optimized score index
-
+    
     int isSingleWord = q->numTokens == 1 && q->root->nchildren == 1 &&
             q->fieldMask == 0xff &&
             q->offset + q->limit <= MAX_SCOREINDEX_SIZE;
+    //printf("singleword? %d, numTokens: %d, fields %x\n", isSingleWord, q->numTokens, q->fieldMask);
 
     IndexReader *ir = Redis_OpenReader(q->ctx, stage->value, strlen(stage->value), q->docTable,
                                        isSingleWord, q->fieldMask);

diff --git a/src/redis_index.c b/src/redis_index.c
@@ -11,19 +11,19 @@
 */
 RedisModuleString *fmtRedisTermKey(RedisSearchCtx *ctx, const char *term,
                                    size_t len) {
-  return RMUtil_CreateFormattedString(ctx->redisCtx, TERM_KEY_FORMAT,
+  return RedisModule_CreateStringPrintf(ctx->redisCtx, TERM_KEY_FORMAT,
                                       ctx->spec->name, len, term);
 }
 
 RedisModuleString *fmtRedisSkipIndexKey(RedisSearchCtx *ctx, const char *term,
                                         size_t len) {
-  return RMUtil_CreateFormattedString(ctx->redisCtx, SKIPINDEX_KEY_FORMAT,
+  return RedisModule_CreateStringPrintf(ctx->redisCtx, SKIPINDEX_KEY_FORMAT,
                                       ctx->spec->name, len, term);
 }
 
 RedisModuleString *fmtRedisScoreIndexKey(RedisSearchCtx *ctx, const char *term,
                                          size_t len) {
-  return RMUtil_CreateFormattedString(ctx->redisCtx, SCOREINDEX_KEY_FORMAT,
+  return RedisModule_CreateStringPrintf(ctx->redisCtx, SCOREINDEX_KEY_FORMAT,
                                       ctx->spec->name, len, term);
 }
 /**
@@ -470,7 +470,7 @@ int Redis_DropIndex(RedisSearchCtx *ctx, int deleteDocuments) {
       RedisModule_CloseKey(k);
     }
 
-    RedisModuleString *dmd = RMUtil_CreateFormattedString(
+    RedisModuleString *dmd = RedisModule_CreateStringPrintf(
         ctx->redisCtx, DOCTABLE_KEY_FMT, ctx->spec->name);
     RedisModule_Call(ctx->redisCtx, "DEL", "cccs", REDISINDEX_DOCKEY_MAP,
                      REDISINDEX_DOCIDS_MAP, REDISINDEX_DOCIDCOUNTER, dmd);

diff --git a/src/rmutil/strings.c b/src/rmutil/strings.c
@@ -6,18 +6,18 @@
 
 #include "sds.h"
 
-RedisModuleString *RMUtil_CreateFormattedString(RedisModuleCtx *ctx, const char *fmt, ...) {
-    sds s = sdsempty();
+// RedisModuleString *RMUtil_CreateFormattedString(RedisModuleCtx *ctx, const char *fmt, ...) {
+//     sds s = sdsempty();
 
-    va_list ap;
-    va_start(ap, fmt);
-    s = sdscatvprintf(s, fmt, ap);
-    va_end(ap);
+//     va_list ap;
+//     va_start(ap, fmt);
+//     s = sdscatvprintf(s, fmt, ap);
+//     va_end(ap);
 
-    RedisModuleString *ret = RedisModule_CreateString(ctx, (const char *)s, sdslen(s));
-    sdsfree(s);
-    return ret;
-}
+//     RedisModuleString *ret = RedisModule_CreateString(ctx, (const char *)s, sdslen(s));
+//     sdsfree(s);
+//     return ret;
+// }
 
 int RMUtil_StringEquals(RedisModuleString *s1, RedisModuleString *s2) {
 
@@ -26,8 +26,9 @@ int RMUtil_StringEquals(RedisModuleString *s1, RedisModuleString *s2) {
     size_t l1, l2;
     c1 = RedisModule_StringPtrLen(s1, &l1);
     c2 = RedisModule_StringPtrLen(s2, &l2);
-
-    return strncasecmp(c1, c2, MIN(l1,l2)) == 0;
+    if (l1 != l2) return 0;
+
+    return strncmp(c1, c2, l1) == 0;
 }
 
 int RMUtil_StringEqualsC(RedisModuleString *s1, const char *s2) {
@@ -36,9 +37,9 @@ int RMUtil_StringEqualsC(RedisModuleString *s1, const char *s2) {
     const char *c1;
     size_t l1, l2 = strlen(s2);
     c1 = RedisModule_StringPtrLen(s1, &l1);
+    if (l1 != l2) return 0;
 
-
-    return strncasecmp(c1, s2, MIN(l1,l2)) == 0;
+    return strncmp(c1, s2, l1) == 0;
 }
 
 void RMUtil_StringToLower(RedisModuleString *s) {

diff --git a/src/rmutil/strings.h b/src/rmutil/strings.h
@@ -7,7 +7,10 @@
 * Create a new RedisModuleString object from a printf-style format and arguments.
 * Note that RedisModuleString objects CANNOT be used as formatting arguments.
 */
-RedisModuleString *RMUtil_CreateFormattedString(RedisModuleCtx *ctx, const char *fmt, ...);
+// DEPRECATED since it was added to the RedisModule API. Replaced with a macro below
+//RedisModuleString *RMUtil_CreateFormattedString(RedisModuleCtx *ctx, const char *fmt, ...);
+#define RMUtil_CreateFormattedString RedisModule_CreateStringPrintf
+
 
 /* Return 1 if the two strings are equal. Case *sensitive* */
 int RMUtil_StringEquals(RedisModuleString *s1, RedisModuleString *s2);

diff --git a/src/spec.c b/src/spec.c
@@ -91,7 +91,7 @@ void IndexSpec_Free(IndexSpec *spec) {
 /* Saves the spec as a LIST, containing basically the arguments needed to recreate the spec */
 int IndexSpec_Save(RedisModuleCtx *ctx, IndexSpec *sp) {
     RedisModuleKey *k =
-        RedisModule_OpenKey(ctx, RMUtil_CreateFormattedString(ctx, "idx:%s", sp->name),
+        RedisModule_OpenKey(ctx, RedisModule_CreateStringPrintf(ctx, "idx:%s", sp->name),
                             REDISMODULE_READ | REDISMODULE_WRITE);
     if (k == NULL) {
         return REDISMODULE_ERR;
@@ -107,7 +107,7 @@ int IndexSpec_Save(RedisModuleCtx *ctx, IndexSpec *sp) {
             RedisModule_CreateString(ctx, sp->fields[i].name, strlen(sp->fields[i].name)));
         if (sp->fields[i].type == F_FULLTEXT) {
             RedisModule_ListPush(k, REDISMODULE_LIST_TAIL,
-                                 RMUtil_CreateFormattedString(ctx, "%f", sp->fields[i].weight));
+                                 RedisModule_CreateStringPrintf(ctx, "%f", sp->fields[i].weight));
         } else {
             RedisModule_ListPush(k, REDISMODULE_LIST_TAIL,
                                  RedisModule_CreateString(ctx, NUMERIC_STR, strlen(NUMERIC_STR)));
@@ -123,7 +123,7 @@ int IndexSpec_Load(RedisModuleCtx *ctx, IndexSpec *sp, const char *name) {
     sp->name = name;
 
     RedisModuleCallReply *resp = RedisModule_Call(
-        ctx, "LRANGE", "scc", RMUtil_CreateFormattedString(ctx, "idx:%s", sp->name), "0", "-1");
+        ctx, "LRANGE", "scc", RedisModule_CreateStringPrintf(ctx, "idx:%s", sp->name), "0", "-1");
     if (resp == NULL || RedisModule_CallReplyType(resp) != REDISMODULE_REPLY_ARRAY) {
         return REDISMODULE_ERR;
     }

diff --git a/src/tests/test_index.c b/src/tests/test_index.c
@@ -247,8 +247,9 @@ int testUnion() {
                     15, 16, 18, 20, 21, 24, 27, 30};
   int i = 0;
   while (ui->Read(ui->ctx, &h) != INDEXREAD_EOF) {
+     printf("%d <=> %d\n", h.docId, expected[i]);
     ASSERT(h.docId == expected[i++]);
-    // printf("%d, ", h.docId);
+     //printf("%d, ", h.docId);
   }
   IW_Free(w);
   IW_Free(w2);

diff --git a/src/tokenize.c b/src/tokenize.c
@@ -68,6 +68,7 @@ int _tokenize(TokenizerCtx *ctx) {
                 t.s = strndup(stem, sl);
                 t.type = DT_STEM;
                 t.len = sl;
+                t.fieldId = ctx->fieldId;
                 t.stringFreeable = 1;
                 if (ctx->tokenFunc(ctx->tokenFuncCtx, t) != 0) {
                     break;