-
Notifications
You must be signed in to change notification settings - Fork 513
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from RedisLabsModules/unicode_trie
Unicode trie
- Loading branch information
Showing
66 changed files
with
14,700 additions
and
155 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Copyright (c) 2013 Aleksey Tulinov <aleksey.tulinov@gmail.com> | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Builds libnu.a, a static archive of every C source in this directory.

# find the OS
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')

# Compile flags for linux / osx
# (?= keeps any CFLAGS/CPPFLAGS already supplied by the caller or environment)
ifeq ($(uname_S),Linux)
  CFLAGS ?= -W -Wall -fno-common -g -ggdb -fPIC -std=c99 -O2
  CPPFLAGS ?= -W -Wall -fno-common -g -ggdb
else
  CFLAGS ?= -W -Wall -dynamic -fno-common -g -fPIC -ggdb -std=c99 -O2
  CPPFLAGS ?= -W -Wall -dynamic -fno-common -g -ggdb -O2
endif

SOURCEDIR = .
CC_SOURCES = $(wildcard $(SOURCEDIR)/*.c)
CC_OBJECTS = $(patsubst $(SOURCEDIR)/%.c, $(SOURCEDIR)/%.o, $(CC_SOURCES))

.SUFFIXES: .c .cc .o

# 'all' and 'clean' do not produce files of those names; declaring them phony
# keeps them working even if a file called 'all' or 'clean' appears here.
.PHONY: all clean

all: libnu.a

# $(SOURCEDIR)/%.o: $(SOURCEDIR)/%.c
# 	$(CC) -I. $(SHOBJ_CFLAGS) -fPIC -fpermissive -c $< -o $@

# test1.xo: ../redismodule.h

# Objects are compiled by make's built-in .c -> .o rule using CFLAGS/CPPFLAGS.
libnu.a: $(CC_OBJECTS)
	ar rcs $@ $^

clean:
	rm -rf *.xo *.so *.o *.a
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Libnu | ||
|
||
The files in this folder are taken from the (excellent) **nunicode** library by Aleksey Tulinov. | ||
|
||
See [https://bitbucket.org/alekseyt/nunicode](https://bitbucket.org/alekseyt/nunicode) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#ifndef NU_TOUPPER_H | ||
#define NU_TOUPPER_H | ||
|
||
#include <stdint.h> | ||
|
||
#include "config.h" | ||
#include "defines.h" | ||
#include "strings.h" | ||
#include "udb.h" | ||
|
||
#if defined (__cplusplus) || defined (c_plusplus) | ||
extern "C" { | ||
#endif | ||
|
||
/** Synonym for nu_casemap_read. It is recommended to use | ||
* nu_casemap_read instead. | ||
*/ | ||
#define NU_CASEMAP_DECODING_FUNCTION NU_UDB_DECODING_FUNCTION | ||
/** Read (decoding) function for use with transformation results of | ||
* casemapping functions. E.g. nu_casemap_read(nu_tolower(0x0041)); | ||
* will read first codepoint of 'A' transformed to lower case. | ||
*/ | ||
#define nu_casemap_read (nu_udb_read) | ||
|
||
/** Casemap codepoint | ||
* | ||
* @ingroup transformations | ||
*/ | ||
typedef nu_transformation_t nu_casemapping_t; | ||
|
||
#ifdef NU_WITH_TOUPPER | ||
|
||
/** Return uppercase value of codepoint. Unconditional casemapping. | ||
* | ||
* @ingroup transformations | ||
* @param codepoint unicode codepoint | ||
* @return uppercase codepoint or 0 if mapping doesn't exist | ||
*/ | ||
NU_EXPORT | ||
const char* nu_toupper(uint32_t codepoint); | ||
|
||
/** Return uppercase value of codepoint. Context-sensitivity is not | ||
* implemented internally, returned result is equal to calling nu_toupper() | ||
* on corresponding codepoint. | ||
* | ||
* @ingroup transformations_internal | ||
* @param encoded pointer to encoded string | ||
* @param limit memory limit of encoded string or NU_UNLIMITED | ||
* @param read read (decoding) function | ||
* @param u (optional) codepoint which was (or wasn't) transformed | ||
* @param transform output value of codepoint transformed into uppercase or 0 | ||
* if mapping doesn't exist. Can't be NULL, supposed to be decoded with | ||
* nu_casemap_read | ||
* @param context not used | ||
* @return pointer to the next codepoint in string | ||
*/ | ||
NU_EXPORT | ||
const char* _nu_toupper(const char *encoded, const char *limit, nu_read_iterator_t read, | ||
uint32_t *u, const char **transform, | ||
void *context); | ||
|
||
#endif /* NU_WITH_TOUPPER */ | ||
|
||
#ifdef NU_WITH_TOLOWER | ||
|
||
/** Return lowercase value of codepoint. Unconditional casemapping. | ||
* | ||
* @ingroup transformations | ||
* @param codepoint unicode codepoint | ||
* @return lowercase codepoint or 0 if mapping doesn't exist | ||
*/ | ||
NU_EXPORT | ||
const char* nu_tolower(uint32_t codepoint); | ||
|
||
/** Return lowercase value of codepoint. Will transform uppercase | ||
* Sigma ('Σ') into final sigma ('ς') if it occurs at string boundary or | ||
* followed by U+0000. Might require single read-ahead when | ||
* encountering Sigma. | ||
* | ||
* @ingroup transformations_internal | ||
* @param encoded pointer to encoded string | ||
* @param limit memory limit of encoded string or NU_UNLIMITED | ||
* @param read read (decoding) function | ||
* @param u (optional) codepoint which was (or wasn't) transformed | ||
* @param transform output value of codepoint transformed into lowercase or 0 | ||
* if mapping doesn't exist. Can't be NULL, supposed to be decoded with | ||
* nu_casemap_read | ||
* @param context not used | ||
* @return pointer to the next codepoint in string | ||
*/ | ||
NU_EXPORT | ||
const char* _nu_tolower(const char *encoded, const char *limit, nu_read_iterator_t read, | ||
uint32_t *u, const char **transform, | ||
void *context); | ||
|
||
#endif /* NU_WITH_TOLOWER */ | ||
|
||
#ifdef NU_WITH_TOFOLD | ||
|
||
/** Return value of codepoint with case differences eliminated | ||
* | ||
* @ingroup transformations | ||
* @param codepoint unicode codepoint | ||
* @return casefolded codepoint or 0 if mapping doesn't exist | ||
*/ | ||
NU_EXPORT | ||
const char* nu_tofold(uint32_t codepoint); | ||
|
||
/** Return value of codepoint with case differences eliminated. | ||
* Context-sensitivity is not implemented internally, returned result is equal | ||
* to calling nu_tofold() on corresponding codepoint. | ||
* | ||
* @ingroup transformations_internal | ||
* @param encoded pointer to encoded string | ||
* @param limit memory limit of encoded string or NU_UNLIMITED | ||
* @param read read (decoding) function | ||
* @param u (optional) codepoint which was (or wasn't) transformed | ||
* @param transform output value of casefolded codepoint or 0 | ||
* if mapping doesn't exist. Can't be NULL, supposed to be decoded with | ||
* nu_casemap_read | ||
* @param context not used | ||
* @return pointer to the next codepoint in string | ||
*/ | ||
NU_EXPORT | ||
const char* _nu_tofold(const char *encoded, const char *limit, nu_read_iterator_t read, | ||
uint32_t *u, const char **transform, | ||
void *context); | ||
|
||
#endif /* NU_WITH_TOFOLD */ | ||
|
||
#if defined (__cplusplus) || defined (c_plusplus) | ||
} | ||
#endif | ||
|
||
#endif /* NU_TOUPPER_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#ifndef NU_CASEMAP_INTERNAL_H | ||
#define NU_CASEMAP_INTERNAL_H | ||
|
||
#include <stdint.h> | ||
#include <sys/types.h> | ||
|
||
#include "udb.h" | ||
|
||
/** Look up the casemapping of a codepoint in the supplied tables.
 *
 * Thin wrapper: every argument is forwarded unchanged to nu_udb_lookup()
 * and its result is returned as is.
 *
 * @ingroup transformations
 * @param codepoint unicode codepoint to look up
 * @param G lookup table handed to nu_udb_lookup()
 * @param G_SIZE size handed to nu_udb_lookup() together with G
 * (presumably the number of entries in G -- confirm against udb.h)
 * @param VALUES_C lookup table handed to nu_udb_lookup()
 * @param VALUES_I lookup table handed to nu_udb_lookup()
 * @param COMBINED lookup table handed to nu_udb_lookup()
 * @return whatever nu_udb_lookup() returns for these arguments
 */
static inline
const char* _nu_to_something(uint32_t codepoint,
	const int16_t *G, size_t G_SIZE,
	const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) {

	const char *mapped = nu_udb_lookup(codepoint, G, G_SIZE,
		VALUES_C, VALUES_I, COMBINED);

	return mapped;
}
|
||
#endif /* NU_CASEMAP_INTERNAL_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#include "cesu8.h" | ||
|
||
#ifdef NU_WITH_CESU8_READER | ||
#ifdef NU_WITH_VALIDATION | ||
|
||
/** Validate one CESU-8 encoded codepoint without reading past max_len.
 *
 * Only the 6-byte surrogate-pair form is checked here; every other input is
 * handed to utf8_validread_basic().
 *
 * @param encoded pointer to the encoded sequence
 * @param max_len number of bytes available at encoded
 * @return 6 for an accepted surrogate pair, 0 if a lead surrogate is not
 * followed by a trail surrogate within max_len, otherwise the result of
 * utf8_validread_basic()
 */
int nu_cesu8_validread(const char *encoded, size_t max_len) {
	const unsigned char *up = (const unsigned char *)(encoded);

	/* i guess there is no way to detect misplaced CESU-8
	 * trail surrogate alone, it will produce valid UTF-8 sequence
	 * greater than U+10000 */

	/* 6-bytes sequence
	 *
	 * 11101101 followed by 1010xxxx should be
	 * then followed by xxxxxxxx 11101101 1011xxxx xxxxxxxx
	 *
	 * The second byte is inspected only when max_len says it exists;
	 * shorter inputs fall through to the generic check below. */
	if (max_len >= 2 && up[0] == 0xED && (up[1] & 0xF0) == 0xA0) {
		/* lead surrogate found, but the buffer can't hold the whole pair */
		if (max_len < 6) {
			return 0;
		}

		/* bytes 3 and 4 must open a trail surrogate (ED Bx); bytes 2 and 5
		 * are not examined */
		if (up[3] != 0xED || (up[4] & 0xF0) != 0xB0) {
			return 0;
		}

		return 6;
	}

	return utf8_validread_basic(encoded, max_len);
}
|
||
#endif /* NU_WITH_VALIDATION */ | ||
#endif /* NU_WITH_CESU8_READER */ | ||
|
||
#ifdef NU_WITH_CESU8_WRITER | ||
|
||
/** Write one codepoint in CESU-8 form.
 *
 * The encoded length is obtained from cesu8_codepoint_length(). When cesu8
 * is a null pointer nothing is written and only the advanced pointer is
 * computed.
 *
 * @param unicode codepoint to encode
 * @param cesu8 output position, or 0 to skip writing
 * @return cesu8 advanced by the encoded length
 */
char* nu_cesu8_write(uint32_t unicode, char *cesu8) {
	const unsigned width = cesu8_codepoint_length(unicode);

	/* no destination: skip encoding entirely */
	if (cesu8 == 0) {
		return cesu8 + width;
	}

	if (width == 1) {
		*cesu8 = (char)(unicode);
	}
	else if (width == 2) {
		b2_utf8(unicode, cesu8);
	}
	else if (width == 3) {
		b3_utf8(unicode, cesu8);
	}
	else {
		/* any other length is treated as the 6-byte form */
		b6_cesu8(unicode, cesu8);
	}

	return cesu8 + width;
}
|
||
#endif /* NU_WITH_CESU8_WRITER */ |
Oops, something went wrong.