Skip to content

Commit

Permalink
Merge pull request #10 from RedisLabsModules/unicode_trie
Browse files Browse the repository at this point in the history
Unicode trie
  • Loading branch information
dvirsky committed Dec 6, 2016
2 parents 80989a2 + d67f82d commit e2e82f3
Show file tree
Hide file tree
Showing 66 changed files with 14,700 additions and 155 deletions.
10 changes: 7 additions & 3 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ REDIS=redis_buffer.o module.o redis_index.o query.o spec.o
UTILOBJS=util/heap.o util/logging.o util/fnv.o
RMUTILOBJS=rmutil/librmutil.a
LIBTRIE=trie/libtrie.a
LIBNU=dep/libnu/libnu.a
TESTS=test.o

SRCDIR := $(shell pwd)
Expand Down Expand Up @@ -60,18 +61,21 @@ tests:
snowball:
$(MAKE) -C dep/snowball libstemmer.o

libnu:
$(MAKE) -C dep/libnu

.c.xo:
$(CC) -I. $(CFLAGS) $(SHOBJ_CFLAGS) -fPIC -c $< -o $@

module.so: $(MODULE)
$(LD) -o $@ $(VARINT) $(INDEX) $(TEXT) $(REDIS) $(UTILOBJS) $(RMUTILOBJS) $(LIBTRIE) $(SHOBJ_LDFLAGS) $(LIBS) -lc -lm -Bsymbolic
$(LD) -o $@ $(VARINT) $(INDEX) $(TEXT) $(REDIS) $(UTILOBJS) $(RMUTILOBJS) $(LIBTRIE) $(SHOBJ_LDFLAGS) $(LIBS) $(LIBNU) -lc -lm -Bsymbolic


release: CFLAGS += $(RELEASEFLAGS)
release: util rmutil snowball trie | module.so
release: util rmutil snowball libnu trie | module.so

debug: CFLAGS += $(DEBUGFLAGS)
debug: util rmutil snowball trie | module.so
debug: util rmutil snowball libnu trie | module.so


clean:
Expand Down
19 changes: 19 additions & 0 deletions src/dep/libnu/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright (c) 2013 Aleksey Tulinov <aleksey.tulinov@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
30 changes: 30 additions & 0 deletions src/dep/libnu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Builds libnu.a, a static archive of every C source file in this directory.

# find the OS ("not" is used as a placeholder when uname cannot be run)
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')

# Compile flags for linux / osx
# `?=` assigns only when the variable is not already set, so values coming
# from the environment or the make command line take precedence.
ifeq ($(uname_S),Linux)
CFLAGS ?= -W -Wall -fno-common -g -ggdb -fPIC -std=c99 -O2
CPPFLAGS ?= -W -Wall -fno-common -g -ggdb
else
# Any non-Linux host: same flags plus -dynamic
CFLAGS ?= -W -Wall -dynamic -fno-common -g -fPIC -ggdb -std=c99 -O2
CPPFLAGS ?= -W -Wall -dynamic -fno-common -g -ggdb -O2
endif

# Sources are discovered with a wildcard; each foo.c maps to an object foo.o
SOURCEDIR = .
CC_SOURCES = $(wildcard $(SOURCEDIR)/*.c)
CC_OBJECTS = $(patsubst $(SOURCEDIR)/%.c, $(SOURCEDIR)/%.o, $(CC_SOURCES))

.SUFFIXES: .c .cc .o

# Default target
all: libnu.a

# $(SOURCEDIR)/%.o: $(SOURCEDIR)/%.c
# $(CC) -I. $(SHOBJ_CFLAGS) -fPIC -fpermissive -c $< -o $@

# test1.xo: ../redismodule.h

# Archive all objects into the static library. With the explicit pattern rule
# above commented out, the objects are presumably produced by make's built-in
# .c -> .o rule using CFLAGS/CPPFLAGS (NOTE(review): confirm).
libnu.a: $(CC_OBJECTS)
ar rcs $@ $^

# Remove all build products
clean:
rm -rf *.xo *.so *.o *.a
5 changes: 5 additions & 0 deletions src/dep/libnu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Libnu

The files in this folder are taken from the (excellent) **nunicode** library by Aleksey Tulinov.

See [https://bitbucket.org/alekseyt/nunicode](https://bitbucket.org/alekseyt/nunicode)
135 changes: 135 additions & 0 deletions src/dep/libnu/casemap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#ifndef NU_TOUPPER_H
#define NU_TOUPPER_H

#include <stdint.h>

#include "config.h"
#include "defines.h"
#include "strings.h"
#include "udb.h"

#if defined (__cplusplus) || defined (c_plusplus)
extern "C" {
#endif

/** Synonym to nu_casemap_read. It is recommended to use
* nu_casemap_read instead.
*/
#define NU_CASEMAP_DECODING_FUNCTION NU_UDB_DECODING_FUNCTION
/** Read (decoding) function for use with transformation results of
* casemapping functions. E.g. nu_casemap_read(nu_tolower(0x0041));
* will read first codepoint of 'A' transformed to lower case.
*/
#define nu_casemap_read (nu_udb_read)

/** Casemap codepoint
*
* @ingroup transformations
*/
typedef nu_transformation_t nu_casemapping_t;

#ifdef NU_WITH_TOUPPER

/** Return uppercase value of codepoint. Unconditional casemapping.
*
* @ingroup transformations
* @param codepoint unicode codepoint
* @return pointer to the encoded uppercase mapping (to be decoded with
* nu_casemap_read) or 0 if mapping doesn't exist
*/
NU_EXPORT
const char* nu_toupper(uint32_t codepoint);

/** Return uppercase value of codepoint. Context-sensitivity is not
* implemented internally, returned result is equal to calling nu_toupper()
* on corresponding codepoint.
*
* @ingroup transformations_internal
* @param encoded pointer to encoded string
* @param limit memory limit of encoded string or NU_UNLIMITED
* @param read read (decoding) function
* @param u (optional) codepoint which was (or wasn't) transformed
* @param transform output value of codepoint transformed into uppercase or 0
* if mapping doesn't exist. Can't be NULL, supposed to be decoded with
* nu_casemap_read
* @param context not used
* @return pointer to the next codepoint in string
*/
NU_EXPORT
const char* _nu_toupper(const char *encoded, const char *limit, nu_read_iterator_t read,
uint32_t *u, const char **transform,
void *context);

#endif /* NU_WITH_TOUPPER */

#ifdef NU_WITH_TOLOWER

/** Return lowercase value of codepoint. Unconditional casemapping.
*
* @ingroup transformations
* @param codepoint unicode codepoint
* @return pointer to the encoded lowercase mapping (to be decoded with
* nu_casemap_read) or 0 if mapping doesn't exist
*/
NU_EXPORT
const char* nu_tolower(uint32_t codepoint);

/** Return lowercase value of codepoint. Will transform uppercase
* Sigma ('Σ') into final sigma ('ς') if it occurs at string boundary or
* followed by U+0000. Might require single read-ahead when
* encountering Sigma.
*
* @ingroup transformations_internal
* @param encoded pointer to encoded string
* @param limit memory limit of encoded string or NU_UNLIMITED
* @param read read (decoding) function
* @param u (optional) codepoint which was (or wasn't) transformed
* @param transform output value of codepoint transformed into lowercase or 0
* if mapping doesn't exist. Can't be NULL, supposed to be decoded with
* nu_casemap_read
* @param context not used
* @return pointer to the next codepoint in string
*/
NU_EXPORT
const char* _nu_tolower(const char *encoded, const char *limit, nu_read_iterator_t read,
uint32_t *u, const char **transform,
void *context);

#endif /* NU_WITH_TOLOWER */

#ifdef NU_WITH_TOFOLD

/** Return value of codepoint with case differences eliminated
*
* @ingroup transformations
* @param codepoint unicode codepoint
* @return pointer to the encoded casefolded mapping (to be decoded with
* nu_casemap_read) or 0 if mapping doesn't exist
*/
NU_EXPORT
const char* nu_tofold(uint32_t codepoint);

/** Return value of codepoint with case differences eliminated.
* Context-sensitivity is not implemented internally, returned result is equal
* to calling nu_tofold() on corresponding codepoint.
*
* @ingroup transformations_internal
* @param encoded pointer to encoded string
* @param limit memory limit of encoded string or NU_UNLIMITED
* @param read read (decoding) function
* @param u (optional) codepoint which was (or wasn't) transformed
* @param transform output value of casefolded codepoint or 0
* if mapping doesn't exist. Can't be NULL, supposed to be decoded with
* nu_casemap_read
* @param context not used
* @return pointer to the next codepoint in string
*/
NU_EXPORT
const char* _nu_tofold(const char *encoded, const char *limit, nu_read_iterator_t read,
uint32_t *u, const char **transform,
void *context);

#endif /* NU_WITH_TOFOLD */

#if defined (__cplusplus) || defined (c_plusplus)
}
#endif

#endif /* NU_TOUPPER_H */
21 changes: 21 additions & 0 deletions src/dep/libnu/casemap_internal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#ifndef NU_CASEMAP_INTERNAL_H
#define NU_CASEMAP_INTERNAL_H

#include <stdint.h>
#include <sys/types.h>

#include "udb.h"

/** Shared backend of the casemapping functions: resolves a codepoint
 * against one set of lookup tables by forwarding to nu_udb_lookup().
 *
 * @ingroup transformations
 * @param codepoint unicode codepoint to map
 * @param G first lookup table, handed to nu_udb_lookup() unchanged
 * @param G_SIZE size of G, handed to nu_udb_lookup() unchanged
 * @param VALUES_C lookup table handed to nu_udb_lookup() unchanged
 * @param VALUES_I lookup table handed to nu_udb_lookup() unchanged
 * @param COMBINED lookup table handed to nu_udb_lookup() unchanged
 * @return whatever nu_udb_lookup() returns for these arguments
 */
static inline
const char* _nu_to_something(uint32_t codepoint,
	const int16_t *G, size_t G_SIZE,
	const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) {

	const char *mapping = nu_udb_lookup(codepoint, G, G_SIZE,
		VALUES_C, VALUES_I, COMBINED);

	return mapping;
}

#endif /* NU_CASEMAP_INTERNAL_H */
52 changes: 52 additions & 0 deletions src/dep/libnu/cesu8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "cesu8.h"

#ifdef NU_WITH_CESU8_READER
#ifdef NU_WITH_VALIDATION

/** Validate the CESU-8 sequence starting at `encoded`.
 *
 * A lead byte 0xED followed by 1010xxxx opens a high surrogate. In CESU-8
 * it has to be followed immediately by a low surrogate, 6 bytes in total:
 *
 *   11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
 *
 * Every other sequence is handed over to utf8_validread_basic().
 *
 * @param encoded pointer to the encoded sequence
 * @param max_len number of bytes that may be read at `encoded`
 * @return 6 for a well-formed surrogate pair, 0 for a truncated or
 * malformed surrogate pair, otherwise the result of utf8_validread_basic()
 */
int nu_cesu8_validread(const char *encoded, size_t max_len) {
	const unsigned char *up = (const unsigned char *)(encoded);

	/* i guess there is no way to detect a misplaced CESU-8
	 * trail surrogate alone, it will produce valid UTF-8 sequence
	 * greater than U+10000 */

	/* Do not look at the second byte unless max_len says it is there.
	 * With fewer than 2 bytes available this cannot be a surrogate pair,
	 * so the decision is left to utf8_validread_basic().
	 * NOTE(review): assumes utf8_validread_basic() rejects a 3-byte lead
	 * when max_len is too short - confirm. */
	if (max_len >= 2 && up[0] == 0xED && (up[1] & 0xF0) == 0xA0) {
		if (max_len < 6) {
			return 0; /* truncated surrogate pair */
		}

		/* bytes 2 and 5 must be continuation bytes (10xxxxxx),
		 * bytes 3-4 must open the low surrogate (ED 1011xxxx) */
		if ((up[2] & 0xC0) != 0x80
			|| up[3] != 0xED
			|| (up[4] & 0xF0) != 0xB0
			|| (up[5] & 0xC0) != 0x80) {
			return 0;
		}

		return 6;
	}

	return utf8_validread_basic(encoded, max_len);
}

#endif /* NU_WITH_VALIDATION */
#endif /* NU_WITH_CESU8_READER */

#ifdef NU_WITH_CESU8_WRITER

/** Encode one codepoint as CESU-8.
 *
 * The encoded length comes from cesu8_codepoint_length(). When `cesu8` is 0
 * nothing is written; the returned pointer is still advanced by that length.
 *
 * @param unicode codepoint to encode
 * @param cesu8 destination buffer, or 0 to skip writing
 * @return `cesu8` advanced by the encoded length of `unicode`
 */
char* nu_cesu8_write(uint32_t unicode, char *cesu8) {
	const unsigned n_bytes = cesu8_codepoint_length(unicode);

	if (cesu8 != 0) {
		if (n_bytes == 1) {
			*cesu8 = (char)(unicode);
		}
		else if (n_bytes == 2) {
			b2_utf8(unicode, cesu8);
		}
		else if (n_bytes == 3) {
			b3_utf8(unicode, cesu8);
		}
		else {
			/* every remaining length is the 6-byte surrogate pair form */
			b6_cesu8(unicode, cesu8);
		}
	}

	return cesu8 + n_bytes;
}

#endif /* NU_WITH_CESU8_WRITER */

0 comments on commit e2e82f3

Please sign in to comment.