Skip to content

Commit

Permalink
Be more efficient when resetting stemmers
Browse files Browse the repository at this point in the history
  • Loading branch information
mnunberg committed Jun 25, 2018
1 parent 902a435 commit ec42b4d
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 8 deletions.
13 changes: 10 additions & 3 deletions src/forward_index.c
Expand Up @@ -68,10 +68,15 @@ static void ForwardIndex_InitCommon(ForwardIndex *idx, Document *doc, uint32_t i
idx->idxFlags = idxFlags;
idx->maxFreq = 0;
idx->totalFreq = 0;
if (idx->stemmer) {

if (idx->stemmer && !ResetStemmer(idx->stemmer, SnowballStemmer, doc->language)) {
idx->stemmer->Free(idx->stemmer);
idx->stemmer = NULL;
}

if (!idx->stemmer) {
idx->stemmer = NewStemmer(SnowballStemmer, doc->language);
}
idx->stemmer = NewStemmer(SnowballStemmer, doc->language);
}

ForwardIndex *NewForwardIndex(Document *doc, uint32_t idxFlags) {
Expand All @@ -81,7 +86,9 @@ ForwardIndex *NewForwardIndex(Document *doc, uint32_t idxFlags) {
BlkAlloc_Init(&idx->entries);

static const KHTableProcs procs = {
.Alloc = allocBucketEntry, .Compare = khtCompare, .Hash = khtHash,
.Alloc = allocBucketEntry,
.Compare = khtCompare,
.Hash = khtHash,
};

size_t termCount = estimtateTermCount(doc);
Expand Down
36 changes: 31 additions & 5 deletions src/stemmer.c
Expand Up @@ -2,6 +2,7 @@
#include <string.h>
#include <stdio.h>
#include <sys/param.h>
#include <assert.h>
#include "dep/snowball/include/libstemmer.h"

const char *__supportedLanguages[] = {"arabic", "danish", "dutch", "english", "finnish",
Expand Down Expand Up @@ -61,6 +62,13 @@ void __sbstemmer_Free(Stemmer *s) {
free(s);
}

/* Reset hook installed on Snowball stemmers.
 *
 * Nothing is re-initialized here; the function only reports whether the
 * existing stemmer already matches the request.
 *
 * Returns 1 when `type` equals stemmer->type and `language` compares equal
 * (via strcmp) to stemmer->language, and 0 otherwise. The language strings are
 * only compared when the types already match. */
static int sbstemmer_Reset(Stemmer *stemmer, StemmerType type, const char *language) {
  return stemmer->type == type && strcmp(stemmer->language, language) == 0;
}

Stemmer *__newSnowballStemmer(const char *language) {
struct sb_stemmer *sb = sb_stemmer_new(language, NULL);
// No stemmer available for this language
Expand All @@ -78,16 +86,34 @@ Stemmer *__newSnowballStemmer(const char *language) {
ret->ctx = ctx;
ret->Stem = __sbstemmer_Stem;
ret->Free = __sbstemmer_Free;
ret->Reset = sbstemmer_Reset;
return ret;
}

/* Creates a stemmer of the requested type for `language`.
 *
 * SnowballStemmer is the only type handled; any other type prints
 * "Invalid stemmer type" to stderr and yields NULL. NULL is also returned when
 * __newSnowballStemmer() does not produce a stemmer for the language.
 *
 * On success the returned stemmer has its `language` and `type` fields filled
 * in. */
Stemmer *NewStemmer(StemmerType type, const char *language) {
switch (type) {
case SnowballStemmer:
Stemmer *ret = NULL;
if (type == SnowballStemmer) {
ret = __newSnowballStemmer(language);
if (!ret) {
return NULL;
}
} else {
fprintf(stderr, "Invalid stemmer type");
return NULL;
}

return __newSnowballStemmer(language);
// Store the language as a pointer to the matching __supportedLanguages entry
// (matched with strcmp) instead of keeping the caller's string pointer.
for (const char **s = __supportedLanguages; *s; s++) {
if (!strcmp(language, *s)) {
ret->language = *s;
break;
}
}

fprintf(stderr, "Invalid stemmer type");
return NULL;
// NOTE(review): presumes every language the Snowball constructor accepts is
// also listed in __supportedLanguages, and that ret->language starts out NULL
// -- confirm against __newSnowballStemmer.
assert(ret->language);
ret->type = type;
return ret;
}

/* Asks `stemmer` whether it can be reused for the given type and language.
 *
 * Delegates to the stemmer's own Reset callback. Returns 1 when the callback
 * is set and reports a non-zero result; returns 0 when the callback is missing
 * or reports 0. */
int ResetStemmer(Stemmer *stemmer, StemmerType type, const char *language) {
  if (!stemmer->Reset) {
    // No Reset hook installed: this stemmer cannot be reused.
    return 0;
  }
  return stemmer->Reset(stemmer, type, language) != 0;
}
9 changes: 9 additions & 0 deletions src/stemmer.h
Expand Up @@ -14,10 +14,19 @@ typedef struct stemmer {
void *ctx;
const char *(*Stem)(void *ctx, const char *word, size_t len, size_t *outlen);
void (*Free)(struct stemmer *);

// Attempts to reset the stemmer using the given language and type. Returns 0
// if this stemmer cannot be reused.
int (*Reset)(struct stemmer *, StemmerType type, const char *language);

const char *language;
StemmerType type; // Type of stemmer
} Stemmer;

Stemmer *NewStemmer(StemmerType type, const char *language);

int ResetStemmer(Stemmer *stemmer, StemmerType type, const char *language);

/* check if a language is supported by our stemmers */
int IsSupportedLanguage(const char *language, size_t len);

Expand Down

0 comments on commit ec42b4d

Please sign in to comment.