Merge pull request #10 from RedisLabsModules/unicode_trie

Unicode trie
RediSearch · Dec 6, 2016 · e2e82f3 · e2e82f3
2 parents 80989a2 + d67f82d
commit e2e82f3
Show file tree

Hide file tree

Showing 66 changed files with 14,700 additions and 155 deletions.
diff --git a/src/Makefile b/src/Makefile
@@ -21,6 +21,7 @@ REDIS=redis_buffer.o module.o redis_index.o query.o spec.o
 UTILOBJS=util/heap.o util/logging.o util/fnv.o
 RMUTILOBJS=rmutil/librmutil.a
 LIBTRIE=trie/libtrie.a
+LIBNU=dep/libnu/libnu.a
 TESTS=test.o
 
 SRCDIR := $(shell pwd)
@@ -60,18 +61,21 @@ tests:
 snowball:
 	$(MAKE) -C dep/snowball libstemmer.o
 
+libnu:
+	$(MAKE) -C dep/libnu
+
 .c.xo:
 	$(CC) -I. $(CFLAGS) $(SHOBJ_CFLAGS) -fPIC -c $< -o $@
 
 module.so: $(MODULE)
-	$(LD) -o $@ $(VARINT) $(INDEX) $(TEXT) $(REDIS) $(UTILOBJS) $(RMUTILOBJS) $(LIBTRIE) $(SHOBJ_LDFLAGS) $(LIBS) -lc -lm -Bsymbolic
+	$(LD) -o $@ $(VARINT) $(INDEX) $(TEXT) $(REDIS) $(UTILOBJS) $(RMUTILOBJS) $(LIBTRIE) $(SHOBJ_LDFLAGS) $(LIBS) $(LIBNU) -lc -lm -Bsymbolic
 
 
 release: CFLAGS += $(RELEASEFLAGS)
-release: util rmutil snowball trie | module.so
+release: util rmutil snowball libnu trie | module.so
 
 debug: CFLAGS += $(DEBUGFLAGS)
-debug: util rmutil snowball trie | module.so
+debug: util rmutil snowball libnu trie | module.so
 
 
 clean:

diff --git a/src/dep/libnu/LICENSE b/src/dep/libnu/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2013 Aleksey Tulinov <aleksey.tulinov@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/src/dep/libnu/Makefile b/src/dep/libnu/Makefile
@@ -0,0 +1,30 @@
+# Builds libnu.a: a static archive of every C source in this directory.
+# Object files are produced by make's built-in .c -> .o rule, which
+# passes both $(CFLAGS) and $(CPPFLAGS) to $(CC).
+
+# Remove a half-written target if its recipe fails.
+.DELETE_ON_ERROR:
+
+# find the OS
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+
+# Compile flags for linux / osx
+# (indented with spaces on purpose: a leading tab would turn these
+# assignments into recipe text if a rule were ever placed above them)
+ifeq ($(uname_S),Linux)
+  CFLAGS ?= -W -Wall -fno-common -g -ggdb -fPIC -std=c99 -O2
+  CPPFLAGS ?= -W -Wall -fno-common -g -ggdb
+else
+  CFLAGS ?= -W -Wall -dynamic -fno-common -g -fPIC -ggdb -std=c99 -O2
+  CPPFLAGS ?= -W -Wall -dynamic -fno-common -g -ggdb -O2
+endif
+
+# Simple (:=) assignment so the wildcard is expanded exactly once.
+SOURCEDIR := .
+CC_SOURCES := $(wildcard $(SOURCEDIR)/*.c)
+CC_OBJECTS := $(patsubst $(SOURCEDIR)/%.c,$(SOURCEDIR)/%.o,$(CC_SOURCES))
+
+.SUFFIXES: .c .cc .o
+
+# all and clean are commands, not files.
+.PHONY: all clean
+
+all: libnu.a
+
+# $(SOURCEDIR)/%.o: $(SOURCEDIR)/%.c
+# 	$(CC) -I. $(SHOBJ_CFLAGS) -fPIC -fpermissive -c $< -o $@
+
+# test1.xo: ../redismodule.h
+
+# Archive all objects; $(AR) defaults to ar and can be overridden
+# (e.g. for cross-compilation).
+libnu.a: $(CC_OBJECTS)
+	$(AR) rcs $@ $^
+
+# Remove build products only; $(RM) is "rm -f".
+clean:
+	$(RM) *.xo *.so *.o *.a
diff --git a/src/dep/libnu/README.md b/src/dep/libnu/README.md
@@ -0,0 +1,5 @@
+# Libnu
+
+The files in this folder are taken from the (excellent) **nunicode** library by Aleksey Tulinov.
+
+See [https://bitbucket.org/alekseyt/nunicode](https://bitbucket.org/alekseyt/nunicode)
diff --git a/src/dep/libnu/casemap.h b/src/dep/libnu/casemap.h
@@ -0,0 +1,135 @@
+#ifndef NU_TOUPPER_H
+#define NU_TOUPPER_H
+
+#include <stdint.h>
+
+#include "config.h"
+#include "defines.h"
+#include "strings.h"
+#include "udb.h"
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+/** Synonym for nu_casemap_read. It is recommended to use
+ * nu_casemap_read instead.
+ */
+#define NU_CASEMAP_DECODING_FUNCTION NU_UDB_DECODING_FUNCTION
+/** Read (decoding) function for use with transformation results of
+ * casemapping functions. E.g. nu_casemap_read(nu_tolower(0x0041));
+ * will read first codepoint of 'A' transformed to lower case.
+ */
+#define nu_casemap_read (nu_udb_read)
+
+/** Casemap codepoint
+ *
+ * @ingroup transformations
+ */
+typedef nu_transformation_t nu_casemapping_t;
+
+#ifdef NU_WITH_TOUPPER
+
+/** Return uppercase value of codepoint. Unconditional casemapping.
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return uppercase codepoint or 0 if mapping doesn't exist
+ */
+NU_EXPORT
+const char* nu_toupper(uint32_t codepoint);
+
+/** Return uppercase value of codepoint. Context-sensitivity is not
+ * implemented internally, returned result is equal to calling nu_toupper()
+ * on corresponding codepoint.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of codepoint transformed into uppercase or 0
+ * if mapping doesn't exist. Can't be NULL, supposed to be decoded with
+ * nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_toupper(const char *encoded, const char *limit, nu_read_iterator_t read,
+	uint32_t *u, const char **transform,
+	void *context);
+
+#endif /* NU_WITH_TOUPPER */
+
+#ifdef NU_WITH_TOLOWER
+
+/** Return lowercase value of codepoint. Unconditional casemapping.
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return lowercase codepoint or 0 if mapping doesn't exist
+ */
+NU_EXPORT
+const char* nu_tolower(uint32_t codepoint);
+
+/** Return lowercase value of codepoint. Will transform uppercase
+ * Sigma ('Σ') into final sigma ('ς') if it occurs at string boundary or
+ * followed by U+0000. Might require single read-ahead when
+ * encountering Sigma.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of codepoint transformed into lowercase or 0
+ * if mapping doesn't exist. Can't be NULL, supposed to be decoded with
+ * nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_tolower(const char *encoded, const char *limit, nu_read_iterator_t read,
+	uint32_t *u, const char **transform,
+	void *context);
+
+#endif /* NU_WITH_TOLOWER */
+
+#ifdef NU_WITH_TOFOLD
+
+/** Return value of codepoint with case differences eliminated
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return casefolded codepoint or 0 if mapping doesn't exist
+ */
+NU_EXPORT
+const char* nu_tofold(uint32_t codepoint);
+
+/** Return value of codepoint with case differences eliminated.
+ * Context-sensitivity is not implemented internally, returned result is equal
+ * to calling nu_tofold() on corresponding codepoint.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of casefolded codepoint or 0
+ * if mapping doesn't exist. Can't be NULL, supposed to be decoded with
+ * nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_tofold(const char *encoded, const char *limit, nu_read_iterator_t read,
+	uint32_t *u, const char **transform,
+	void *context);
+
+#endif /* NU_WITH_TOFOLD */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_TOUPPER_H */
diff --git a/src/dep/libnu/casemap_internal.h b/src/dep/libnu/casemap_internal.h
@@ -0,0 +1,21 @@
+#ifndef NU_CASEMAP_INTERNAL_H
+#define NU_CASEMAP_INTERNAL_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "udb.h"
+
+/** Casemap codepoint
+ *
+ * Thin wrapper: passes the codepoint and all six table arguments, in the
+ * same order and unchanged, to nu_udb_lookup() and returns its result.
+ *
+ * NOTE(review): G, G_SIZE, VALUES_C, VALUES_I and COMBINED are presumably
+ * the generated lookup tables of one particular mapping (upper, lower or
+ * fold); their exact layout is defined by nu_udb_lookup() in udb.h and is
+ * not visible here -- confirm there.
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint to look up
+ * @return whatever nu_udb_lookup() returns for these arguments
+ */
+static inline
+const char* _nu_to_something(uint32_t codepoint,
+	const int16_t *G, size_t G_SIZE,
+	const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) {
+
+	return nu_udb_lookup(codepoint, G, G_SIZE, VALUES_C, VALUES_I, COMBINED);
+}
+
+#endif /* NU_CASEMAP_INTERNAL_H */
diff --git a/src/dep/libnu/cesu8.c b/src/dep/libnu/cesu8.c
@@ -0,0 +1,52 @@
+#include "cesu8.h"
+
+#ifdef NU_WITH_CESU8_READER
+#ifdef NU_WITH_VALIDATION
+
+/* Validate the sequence starting at `encoded`, of which at most
+ * `max_len` bytes may be read.
+ *
+ * Returns 6 for a well-placed CESU-8 surrogate pair, 0 for a lead
+ * surrogate that is truncated or not followed by a trail surrogate,
+ * and otherwise whatever utf8_validread_basic() returns for the same
+ * arguments.
+ *
+ * As in the original code, the first byte is read unconditionally, so
+ * callers are expected to pass max_len >= 1. */
+int nu_cesu8_validread(const char *encoded, size_t max_len) {
+	const unsigned char *up = (const unsigned char *)(encoded);
+
+	/* i guess there is no way to detect a misplaced CESU-8
+	 * trail surrogate alone, it will produce valid UTF-8 sequence
+	 * greater than U+10000 */
+
+	/* 6-bytes sequence
+	 *
+	 * 11101101 followed by 1010xxxx should be
+	 * then followed by xxxxxxxx 11101101 1011xxxx xxxxxxxx
+	 *
+	 * the second byte is only inspected when max_len says it is there:
+	 * a buffer ending on a lone 0xED must not be read past its end, it
+	 * is handed to the generic UTF-8 validation below instead */
+	if (max_len >= 2 && *(up) == 0xED && (*(up + 1) & 0xF0) == 0xA0) {
+		/* lead surrogate found, the whole pair has to fit */
+		if (max_len < 6) {
+			return 0;
+		}
+
+		/* bytes 4 and 5 have to start a trail surrogate */
+		if (*(up + 3) != 0xED || (*(up + 4) & 0xF0) != 0xB0) {
+			return 0;
+		}
+
+		return 6;
+	}
+
+	return utf8_validread_basic(encoded, max_len);
+}
+
+#endif /* NU_WITH_VALIDATION */
+#endif /* NU_WITH_CESU8_READER */
+
+#ifdef NU_WITH_CESU8_WRITER
+
+/* Encode `unicode` into `cesu8` and return the position just past the
+ * written bytes.
+ *
+ * When `cesu8` is 0 nothing is written; the return value is then the
+ * null pointer advanced by the encoded length. */
+char* nu_cesu8_write(uint32_t unicode, char *cesu8) {
+	unsigned len = cesu8_codepoint_length(unicode);
+
+	if (cesu8 != 0) {
+		if (len == 1) {
+			*cesu8 = (char)(unicode);
+		}
+		else if (len == 2) {
+			b2_utf8(unicode, cesu8);
+		}
+		else if (len == 3) {
+			b3_utf8(unicode, cesu8);
+		}
+		else {
+			/* every other length is a 6-byte surrogate pair */
+			b6_cesu8(unicode, cesu8);
+		}
+	}
+
+	return cesu8 + len;
+}
+
+#endif /* NU_WITH_CESU8_WRITER */