diff --git a/src/debug_commads.c b/src/debug_commads.c index 8cefb4bccf..44b5c95c86 100644 --- a/src/debug_commads.c +++ b/src/debug_commads.c @@ -140,7 +140,7 @@ static void DumpPhoneticHash(RedisModuleCtx *ctx, RedisModuleString *term) { char *primary = NULL; char *secondary = NULL; - PhoneticManager_ExpandPhonerics(NULL, term_c, len, &primary, &secondary); + PhoneticManager_ExpandPhonetics(NULL, term_c, len, &primary, &secondary); RedisModule_ReplyWithArray(ctx, 2); RedisModule_ReplyWithStringBuffer(ctx, primary, strlen(primary)); diff --git a/src/dep/phonetics/CMakeLists.txt b/src/dep/phonetics/CMakeLists.txt index 39c7abe21d..e87789d017 100644 --- a/src/dep/phonetics/CMakeLists.txt +++ b/src/dep/phonetics/CMakeLists.txt @@ -1 +1 @@ -ADD_LIBRARY(metaphone OBJECT double_metaphone.cpp) +ADD_LIBRARY(metaphone OBJECT double_metaphone.c) diff --git a/src/dep/phonetics/double_metaphone.cpp b/src/dep/phonetics/double_metaphone.c similarity index 60% rename from src/dep/phonetics/double_metaphone.cpp rename to src/dep/phonetics/double_metaphone.c index a1237abeed..67a95d1a13 100644 --- a/src/dep/phonetics/double_metaphone.cpp +++ b/src/dep/phonetics/double_metaphone.c @@ -1,90 +1,147 @@ -#include "double_metaphone.h" +/* COPYRIGHT NOTICE + * + * This code was pulled directly from the Text-DoubleMetaphone perl package, + * version 0.07 + * + * The README mentions that the copyright is: + * + * Copyright 2000, Maurice Aubrey . + * All rights reserved. + + * This code is based heavily on the C++ implementation by + * Lawrence Philips and incorporates several bug fixes courtesy + * of Kevin Atkinson . + * + * This module is free software; you may redistribute it and/or + * modify it under the same terms as Perl itself. + */ -#include -#include -#include #include #include #include #include #include #include +#include "double_metaphone.h" + +/* + * * If META_USE_PERL_MALLOC is defined we use Perl's memory routines. 
+ * */ +#ifdef META_USE_PERL_MALLOC + +#include "EXTERN.h" +#include "perl.h" +#define META_MALLOC(v, n, t) New(1, v, n, t) +#define META_REALLOC(v, n, t) Renew(v, n, t) +#define META_FREE(x) Safefree((x)) + +#else + +#define META_MALLOC(v, n, t) (v = (t *)malloc(((n) * sizeof(t)))) +#define META_REALLOC(v, n, t) (v = (t *)realloc((v), ((n) * sizeof(t)))) +#define META_FREE(x) free((x)) -#include "double_metaphone_capi.h" +#endif /* META_USE_PERL_MALLOC */ -// TODO: Change references from unsigned int to int, or remove redundant comparisons -#pragma GCC diagnostic ignored "-Wtautological-compare" +static metastring *NewMetaString(const char *init_str) { + metastring *s; + char empty_string[] = ""; -const unsigned int max_length = 32; + META_MALLOC(s, 1, metastring); + assert(s != NULL); -void MakeUpper(string &s) { - for (unsigned int i = 0; i < s.length(); i++) { - s[i] = toupper(s[i]); + if (init_str == NULL) init_str = empty_string; + s->length = strlen(init_str); + /* preallocate a bit more for potential growth */ + s->bufsize = s->length + 7; + + META_MALLOC(s->str, s->bufsize, char); + assert(s->str != NULL); + + strncpy(s->str, init_str, s->length + 1); + s->free_string_on_destroy = 1; + + return s; +} + +static void DestroyMetaString(metastring *s) { + if (s == NULL) return; + + if (s->free_string_on_destroy && (s->str != NULL)) META_FREE(s->str); + + META_FREE(s); +} + +static void IncreaseBuffer(metastring *s, int chars_needed) { + META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char); + assert(s->str != NULL); + s->bufsize = s->bufsize + chars_needed + 10; +} + +static void MakeUpper(metastring *s) { + char *i; + + for (i = s->str; *i; i++) { + *i = toupper(*i); } } -int IsVowel(string &s, unsigned int pos) { +static int IsVowel(metastring *s, int pos) { char c; - if ((pos < 0) || (pos >= s.length())) return 0; + if ((pos < 0) || (pos >= s->length)) return 0; - c = s[pos]; - if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') || (c == 'U') || 
(c == 'Y')) { - return 1; - } + c = *(s->str + pos); + if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') || (c == 'U') || (c == 'Y')) return 1; return 0; } -int SlavoGermanic(string &s) { - if ((char *)strstr(s.c_str(), "W")) +static int SlavoGermanic(metastring *s) { + if ((char *)strstr(s->str, "W")) return 1; - else if ((char *)strstr(s.c_str(), "K")) + else if ((char *)strstr(s->str, "K")) return 1; - else if ((char *)strstr(s.c_str(), "CZ")) + else if ((char *)strstr(s->str, "CZ")) return 1; - else if ((char *)strstr(s.c_str(), "WITZ")) + else if ((char *)strstr(s->str, "WITZ")) return 1; else return 0; } -char GetAt(string &s, unsigned int pos) { - if ((pos < 0) || (pos >= s.length())) { - return '\0'; - } +static int GetLength(metastring *s) { + return s->length; +} - return s[pos]; +static char GetAt(metastring *s, int pos) { + if ((pos < 0) || (pos >= s->length)) return '\0'; + + return ((char)*(s->str + pos)); } -void SetAt(string &s, unsigned int pos, char c) { - if ((pos < 0) || (pos >= s.length())) { - return; - } +static void SetAt(metastring *s, int pos, char c) { + if ((pos < 0) || (pos >= s->length)) return; - s[pos] = c; + *(s->str + pos) = c; } /* - Caveats: the START value is 0 based + Caveats: the START value is 0 based */ -int StringAt(string &s, unsigned int start, unsigned int length, ...) { +static int StringAt(metastring *s, int start, int length, ...) { char *test; - const char *pos; + char *pos; va_list ap; - if ((start < 0) || (start >= s.length())) { - return 0; - } + if ((start < 0) || (start >= s->length)) return 0; - pos = (s.c_str() + start); + pos = (s->str + start); va_start(ap, length); do { test = va_arg(ap, char *); - if (*test && (strncmp(pos, test, length) == 0)) { - return 1; - } + if (*test && (strncmp(pos, test, length) == 0)) return 1; } while (strcmp(test, "")); va_end(ap); @@ -92,44 +149,56 @@ int StringAt(string &s, unsigned int start, unsigned int length, ...) 
{ return 0; } -void DoubleMetaphone(const string &str, vector *codes) { +static void MetaphAdd(metastring *s, const char *new_str) { + int add_length; + + if (new_str == NULL) return; + + add_length = strlen(new_str); + if ((s->length + add_length) > (s->bufsize - 1)) { + IncreaseBuffer(s, add_length); + } + + strcat(s->str, new_str); + s->length += add_length; +} + +void DoubleMetaphone(const char *str, char **primary_pp, char **secondary_pp) { int length; - string original; - string primary; - string secondary; + metastring *original; + metastring *primary; + metastring *secondary; int current; int last; current = 0; /* we need the real length and last prior to padding */ - length = str.length(); + length = strlen(str); last = length - 1; - original = str; // make a copy + original = NewMetaString(str); /* Pad original so we can index beyond end */ - original += " "; + MetaphAdd(original, " "); - primary = ""; - secondary = ""; + primary = NewMetaString(""); + secondary = NewMetaString(""); + primary->free_string_on_destroy = 0; + secondary->free_string_on_destroy = 0; MakeUpper(original); /* skip these when at start of word */ - if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", "")) { - current += 1; - } + if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", "")) current += 1; /* Initial 'X' is pronounced 'Z' e.g. 
'Xavier' */ if (GetAt(original, 0) == 'X') { - primary += "S"; /* 'Z' maps to 'S' */ - secondary += "S"; + MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */ + MetaphAdd(secondary, "S"); current += 1; } /* main loop */ - while ((primary.length() < max_length) || (secondary.length() < max_length)) { - if (current >= length) { - break; - } + while ((primary->length < 4) || (secondary->length < 4)) { + if (current >= length) break; switch (GetAt(original, current)) { case 'A': @@ -140,16 +209,17 @@ void DoubleMetaphone(const string &str, vector *codes) { case 'Y': if (current == 0) { /* all init vowels now map to 'A' */ - primary += "A"; - secondary += "A"; + MetaphAdd(primary, "A"); + MetaphAdd(secondary, "A"); } current += 1; break; case 'B': + /* "-mb", e.g", "dumb", already skipped over... */ - primary += "P"; - secondary += "P"; + MetaphAdd(primary, "P"); + MetaphAdd(secondary, "P"); if (GetAt(original, current + 1) == 'B') current += 2; @@ -157,14 +227,14 @@ void DoubleMetaphone(const string &str, vector *codes) { current += 1; break; -#if 0 -// This is broken because this is a multibyte on UTF8 +#if 0 // Disabled: 'Ç' is a single char only in Latin1; in UTF-8 source it is multibyte, so this case is broken case 'Ç': - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); current += 1; break; #endif + case 'C': /* various germanic */ if ((current > 1) && !IsVowel(original, current - 2) && @@ -172,24 +242,24 @@ void DoubleMetaphone(const string &str, vector *codes) { ((GetAt(original, current + 2) != 'I') && ((GetAt(original, current + 2) != 'E') || StringAt(original, (current - 2), 6, "BACHER", "MACHER", "")))) { - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); current += 2; break; } /* special case 'caesar' */ if ((current == 0) && StringAt(original, current, 6, "CAESAR", "")) { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); current += 2; break; } /* italian 'chianti' */ if (StringAt(original, 
current, 4, "CHIA", "")) { - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); current += 2; break; } @@ -197,8 +267,8 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 2, "CH", "")) { /* find 'michael' */ if ((current > 0) && StringAt(original, current, 4, "CHAE", "")) { - primary += "K"; - secondary += "X"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "X"); current += 2; break; } @@ -208,37 +278,36 @@ void DoubleMetaphone(const string &str, vector *codes) { (StringAt(original, (current + 1), 5, "HARAC", "HARIS", "") || StringAt(original, (current + 1), 3, "HOR", "HYM", "HIA", "HEM", "")) && !StringAt(original, 0, 5, "CHORE", "")) { - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); current += 2; break; } /* germanic, greek, or otherwise 'ch' for 'kh' sound */ - if ((StringAt(original, 0, 4, "VAN ", "VON ", "") || - StringAt(original, 0, 3, "SCH", "")) || + if ((StringAt(original, 0, 4, "VAN ", "VON ", "") || StringAt(original, 0, 3, "SCH", "")) /* 'architect but not 'arch', 'orchestra', 'orchid' */ - StringAt(original, (current - 2), 6, "ORCHES", "ARCHIT", "ORCHID", "") || + || StringAt(original, (current - 2), 6, "ORCHES", "ARCHIT", "ORCHID", "") || StringAt(original, (current + 2), 1, "T", "S", "") || - ((StringAt(original, (current - 1), 1, "A", "O", "U", "E", "") || (current == 0)) && + ((StringAt(original, (current - 1), 1, "A", "O", "U", "E", "") || (current == 0)) /* e.g., 'wachtler', 'wechsler', but not 'tichner' */ - StringAt(original, (current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", - " ", ""))) { - primary += "K"; - secondary += "K"; + && StringAt(original, (current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", + " ", ""))) { + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); } else { if (current > 0) { if (StringAt(original, 0, 2, "MC", "")) { /* e.g., "McHugh" */ - primary += "K"; - secondary 
+= "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); } else { - primary += "X"; - secondary += "K"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "K"); } } else { - primary += "X"; - secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); } } current += 2; @@ -247,16 +316,16 @@ void DoubleMetaphone(const string &str, vector *codes) { /* e.g, 'czerny' */ if (StringAt(original, current, 2, "CZ", "") && !StringAt(original, (current - 2), 4, "WICZ", "")) { - primary += "S"; - secondary += "X"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "X"); current += 2; break; } /* e.g., 'focaccia' */ if (StringAt(original, (current + 1), 3, "CIA", "")) { - primary += "X"; - secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); current += 3; break; } @@ -270,26 +339,26 @@ void DoubleMetaphone(const string &str, vector *codes) { /* 'accident', 'accede' 'succeed' */ if (((current == 1) && (GetAt(original, current - 1) == 'A')) || StringAt(original, (current - 1), 5, "UCCEE", "UCCES", "")) { - primary += "KS"; - secondary += "KS"; + MetaphAdd(primary, "KS"); + MetaphAdd(secondary, "KS"); /* 'bacci', 'bertucci', other italian */ } else { - primary += "X"; - secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); } current += 3; break; } else { /* Pierce's rule */ - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); current += 2; break; } } if (StringAt(original, current, 2, "CK", "CG", "CQ", "")) { - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); current += 2; break; } @@ -297,19 +366,19 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 2, "CI", "CE", "CY", "")) { /* italian vs. 
english */ if (StringAt(original, current, 3, "CIO", "CIE", "CIA", "")) { - primary += "S"; - secondary += "X"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "X"); } else { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); } current += 2; break; } /* else */ - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); /* name sent in 'mac caffrey', 'mac gregor */ if (StringAt(original, (current + 1), 2, " C", " Q", " G", "")) @@ -325,29 +394,29 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 2, "DG", "")) { if (StringAt(original, (current + 2), 1, "I", "E", "Y", "")) { /* e.g. 'edge' */ - primary += "J"; - secondary += "J"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "J"); current += 3; break; } else { /* e.g. 'edgar' */ - primary += "TK"; - secondary += "TK"; + MetaphAdd(primary, "TK"); + MetaphAdd(secondary, "TK"); current += 2; break; } } if (StringAt(original, current, 2, "DT", "DD", "")) { - primary += "T"; - secondary += "T"; + MetaphAdd(primary, "T"); + MetaphAdd(secondary, "T"); current += 2; break; } /* else */ - primary += "T"; - secondary += "T"; + MetaphAdd(primary, "T"); + MetaphAdd(secondary, "T"); current += 1; break; @@ -356,15 +425,15 @@ void DoubleMetaphone(const string &str, vector *codes) { current += 2; else current += 1; - primary += "F"; - secondary += "F"; + MetaphAdd(primary, "F"); + MetaphAdd(secondary, "F"); break; case 'G': if (GetAt(original, current + 1) == 'H') { if ((current > 0) && !IsVowel(original, current - 1)) { - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); current += 2; break; } @@ -373,33 +442,34 @@ void DoubleMetaphone(const string &str, vector *codes) { /* 'ghislane', ghiradelli */ if (current == 0) { if (GetAt(original, current + 2) == 'I') { - primary += "J"; - secondary += "J"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "J"); } else { - 
primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); } current += 2; break; } } /* Parker's rule (with some further refinements) - e.g., 'hugh' */ - if (((current > 1) && StringAt(original, (current - 2), 1, "B", "H", "D", "")) || + if (((current > 1) && StringAt(original, (current - 2), 1, "B", "H", "D", "")) /* e.g., 'bough' */ - ((current > 2) && StringAt(original, (current - 3), 1, "B", "H", "D", "")) || + || ((current > 2) && StringAt(original, (current - 3), 1, "B", "H", "D", "")) /* e.g., 'broughton' */ - ((current > 3) && StringAt(original, (current - 4), 1, "B", "H", ""))) { + || ((current > 3) && StringAt(original, (current - 4), 1, "B", "H", ""))) { current += 2; break; } else { /* e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' */ if ((current > 2) && (GetAt(original, current - 1) == 'U') && StringAt(original, (current - 3), 1, "C", "G", "L", "R", "T", "")) { - primary += "F"; - secondary += "F"; + MetaphAdd(primary, "F"); + MetaphAdd(secondary, "F"); } else if ((current > 0) && GetAt(original, current - 1) != 'I') { - primary += "K"; - secondary += "K"; + + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); } current += 2; @@ -409,17 +479,17 @@ void DoubleMetaphone(const string &str, vector *codes) { if (GetAt(original, current + 1) == 'N') { if ((current == 1) && IsVowel(original, 0) && !SlavoGermanic(original)) { - primary += "KN"; - secondary += "N"; + MetaphAdd(primary, "KN"); + MetaphAdd(secondary, "N"); } else /* not e.g. 
'cagney' */ if (!StringAt(original, (current + 2), 2, "EY", "") && (GetAt(original, current + 1) != 'Y') && !SlavoGermanic(original)) { - primary += "N"; - secondary += "KN"; + MetaphAdd(primary, "N"); + MetaphAdd(secondary, "KN"); } else { - primary += "KN"; - secondary += "KN"; + MetaphAdd(primary, "KN"); + MetaphAdd(secondary, "KN"); } current += 2; break; @@ -427,8 +497,8 @@ void DoubleMetaphone(const string &str, vector *codes) { /* 'tagliaro' */ if (StringAt(original, (current + 1), 2, "LI", "") && !SlavoGermanic(original)) { - primary += "KL"; - secondary += "L"; + MetaphAdd(primary, "KL"); + MetaphAdd(secondary, "L"); current += 2; break; } @@ -437,8 +507,8 @@ void DoubleMetaphone(const string &str, vector *codes) { if ((current == 0) && ((GetAt(original, current + 1) == 'Y') || StringAt(original, (current + 1), 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER", ""))) { - primary += "K"; - secondary += "J"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "J"); current += 2; break; } @@ -449,8 +519,8 @@ void DoubleMetaphone(const string &str, vector *codes) { !StringAt(original, 0, 6, "DANGER", "RANGER", "MANGER", "") && !StringAt(original, (current - 1), 1, "E", "I", "") && !StringAt(original, (current - 1), 3, "RGY", "OGY", "")) { - primary += "K"; - secondary += "J"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "J"); current += 2; break; } @@ -462,16 +532,16 @@ void DoubleMetaphone(const string &str, vector *codes) { if ((StringAt(original, 0, 4, "VAN ", "VON ", "") || StringAt(original, 0, 3, "SCH", "")) || StringAt(original, (current + 1), 2, "ET", "")) { - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); } else { /* always soft if french ending */ if (StringAt(original, (current + 1), 4, "IER ", "")) { - primary += "J"; - secondary += "J"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "J"); } else { - primary += "J"; - secondary += "K"; + MetaphAdd(primary, "J"); + 
MetaphAdd(secondary, "K"); } } current += 2; @@ -482,15 +552,15 @@ void DoubleMetaphone(const string &str, vector *codes) { current += 2; else current += 1; - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); break; case 'H': /* only keep if first & before vowel or btw. 2 vowels */ if (((current == 0) || IsVowel(original, current - 1)) && IsVowel(original, current + 1)) { - primary += "H"; - secondary += "H"; + MetaphAdd(primary, "H"); + MetaphAdd(secondary, "H"); current += 2; } else /* also takes care of 'HH' */ current += 1; @@ -501,35 +571,35 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 4, "JOSE", "") || StringAt(original, 0, 4, "SAN ", "")) { if (((current == 0) && (GetAt(original, current + 4) == ' ')) || StringAt(original, 0, 4, "SAN ", "")) { - primary += "H"; - secondary += "H"; + MetaphAdd(primary, "H"); + MetaphAdd(secondary, "H"); } else { - primary += "J"; - secondary += "H"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "H"); } current += 1; break; } if ((current == 0) && !StringAt(original, current, 4, "JOSE", "")) { - primary += "J"; /* Yankelovich/Jankelowicz */ - secondary += "A"; + MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */ + MetaphAdd(secondary, "A"); } else { /* spanish pron. of e.g. 
'bajador' */ if (IsVowel(original, current - 1) && !SlavoGermanic(original) && ((GetAt(original, current + 1) == 'A') || (GetAt(original, current + 1) == 'O'))) { - primary += "J"; - secondary += "H"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "H"); } else { if (current == last) { - primary += "J"; - secondary += ""; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, ""); } else { if (!StringAt(original, (current + 1), 1, "L", "T", "K", "S", "N", "M", "B", "Z", "") && !StringAt(original, (current - 1), 1, "S", "K", "L", "")) { - primary += "J"; - secondary += "J"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "J"); } } } @@ -546,8 +616,8 @@ void DoubleMetaphone(const string &str, vector *codes) { current += 2; else current += 1; - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); break; case 'L': @@ -558,53 +628,51 @@ void DoubleMetaphone(const string &str, vector *codes) { ((StringAt(original, (last - 1), 2, "AS", "OS", "") || StringAt(original, last, 1, "A", "O", "")) && StringAt(original, (current - 1), 4, "ALLE", ""))) { - primary += "L"; - secondary += ""; + MetaphAdd(primary, "L"); + MetaphAdd(secondary, ""); current += 2; break; } current += 2; } else current += 1; - primary += "L"; - secondary += "L"; + MetaphAdd(primary, "L"); + MetaphAdd(secondary, "L"); break; case 'M': if ((StringAt(original, (current - 1), 3, "UMB", "") && - (((current + 1) == last) || StringAt(original, (current + 2), 2, "ER", ""))) || + (((current + 1) == last) || StringAt(original, (current + 2), 2, "ER", ""))) /* 'dumb','thumb' */ - (GetAt(original, current + 1) == 'M')) { + || (GetAt(original, current + 1) == 'M')) current += 2; - } else { + else current += 1; - } - primary += "M"; - secondary += "M"; + MetaphAdd(primary, "M"); + MetaphAdd(secondary, "M"); break; case 'N': - if (GetAt(original, current + 1) == 'N') { + if (GetAt(original, current + 1) == 'N') current += 2; - } else { + else current += 1; - } - primary += "N"; - 
secondary += "N"; + MetaphAdd(primary, "N"); + MetaphAdd(secondary, "N"); break; -#if 0 -// Multibyte UTF8! +#if 0 // Disabled: 'Ñ' is a single char only in Latin1; in UTF-8 source it is multibyte case 'Ñ': current += 1; - primary += "N"; - secondary += "N"; + MetaphAdd(primary, "N"); + MetaphAdd(secondary, "N"); break; #endif + case 'P': if (GetAt(original, current + 1) == 'H') { - primary += "F"; - secondary += "F"; + MetaphAdd(primary, "F"); + MetaphAdd(secondary, "F"); current += 2; break; } @@ -614,8 +682,8 @@ void DoubleMetaphone(const string &str, vector *codes) { current += 2; else current += 1; - primary += "P"; - secondary += "P"; + MetaphAdd(primary, "P"); + MetaphAdd(secondary, "P"); break; case 'Q': @@ -623,8 +691,8 @@ void DoubleMetaphone(const string &str, vector *codes) { current += 2; else current += 1; - primary += "K"; - secondary += "K"; + MetaphAdd(primary, "K"); + MetaphAdd(secondary, "K"); break; case 'R': @@ -632,11 +700,11 @@ void DoubleMetaphone(const string &str, vector *codes) { if ((current == last) && !SlavoGermanic(original) && StringAt(original, (current - 2), 2, "IE", "") && !StringAt(original, (current - 4), 2, "ME", "MA", "")) { - primary += ""; - secondary += "R"; + MetaphAdd(primary, ""); + MetaphAdd(secondary, "R"); } else { - primary += "R"; - secondary += "R"; + MetaphAdd(primary, "R"); + MetaphAdd(secondary, "R"); } if (GetAt(original, current + 1) == 'R') @@ -654,8 +722,8 @@ void DoubleMetaphone(const string &str, vector *codes) { /* special case 'sugar-' */ if ((current == 0) && StringAt(original, current, 5, "SUGAR", "")) { - primary += "X"; - secondary += "S"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "S"); current += 1; break; } @@ -663,11 +731,11 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 2, "SH", "")) { /* germanic */ if (StringAt(original, (current + 1), 4, "HEIM", "HOEK", "HOLM", "HOLZ", "")) { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); } else { - primary += "X"; - 
secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); } current += 2; break; @@ -677,11 +745,11 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 3, "SIO", "SIA", "") || StringAt(original, current, 4, "SIAN", "")) { if (!SlavoGermanic(original)) { - primary += "S"; - secondary += "X"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "X"); } else { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); } current += 3; break; @@ -691,8 +759,8 @@ void DoubleMetaphone(const string &str, vector *codes) { also, -sz- in slavic language altho in hungarian it is pronounced 's' */ if (((current == 0) && StringAt(original, (current + 1), 1, "M", "N", "L", "W", "")) || StringAt(original, (current + 1), 1, "Z", "")) { - primary += "S"; - secondary += "X"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "X"); if (StringAt(original, (current + 1), 1, "Z", "")) current += 2; else @@ -702,26 +770,25 @@ void DoubleMetaphone(const string &str, vector *codes) { if (StringAt(original, current, 2, "SC", "")) { /* Schlesinger's rule */ - if (GetAt(original, current + 2) == 'H') { - /* dutch origin, e.g. 'school', 'schooner' */ + if (GetAt(original, current + 2) == 'H') /* dutch origin, e.g. 
'school', 'schooner' */ { if (StringAt(original, (current + 3), 2, "OO", "ER", "EN", "UY", "ED", "EM", "")) { /* 'schermerhorn', 'schenker' */ if (StringAt(original, (current + 3), 2, "ER", "EN", "")) { - primary += "X"; - secondary += "SK"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "SK"); } else { - primary += "SK"; - secondary += "SK"; + MetaphAdd(primary, "SK"); + MetaphAdd(secondary, "SK"); } current += 3; break; } else { if ((current == 0) && !IsVowel(original, 3) && (GetAt(original, 3) != 'W')) { - primary += "X"; - secondary += "S"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "S"); } else { - primary += "X"; - secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); } current += 3; break; @@ -729,25 +796,25 @@ void DoubleMetaphone(const string &str, vector *codes) { } if (StringAt(original, (current + 2), 1, "I", "E", "Y", "")) { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); current += 3; break; } /* else */ - primary += "SK"; - secondary += "SK"; + MetaphAdd(primary, "SK"); + MetaphAdd(secondary, "SK"); current += 3; break; } /* french e.g. 
'resnais', 'artois' */ if ((current == last) && StringAt(original, (current - 2), 2, "AI", "OI", "")) { - primary += ""; - secondary += "S"; + MetaphAdd(primary, ""); + MetaphAdd(secondary, "S"); } else { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); } if (StringAt(original, (current + 1), 1, "S", "Z", "")) @@ -758,15 +825,15 @@ void DoubleMetaphone(const string &str, vector *codes) { case 'T': if (StringAt(original, current, 4, "TION", "")) { - primary += "X"; - secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); current += 3; break; } if (StringAt(original, current, 3, "TIA", "TCH", "")) { - primary += "X"; - secondary += "X"; + MetaphAdd(primary, "X"); + MetaphAdd(secondary, "X"); current += 3; break; } @@ -775,40 +842,38 @@ void DoubleMetaphone(const string &str, vector *codes) { /* special case 'thomas', 'thames' or germanic */ if (StringAt(original, (current + 2), 2, "OM", "AM", "") || StringAt(original, 0, 4, "VAN ", "VON ", "") || StringAt(original, 0, 3, "SCH", "")) { - primary += "T"; - secondary += "T"; + MetaphAdd(primary, "T"); + MetaphAdd(secondary, "T"); } else { - primary += "0"; /* yes, zero */ - secondary += "T"; + MetaphAdd(primary, "0"); /* yes, zero */ + MetaphAdd(secondary, "T"); } current += 2; break; } - if (StringAt(original, (current + 1), 1, "T", "D", "")) { + if (StringAt(original, (current + 1), 1, "T", "D", "")) current += 2; - } else { + else current += 1; - } - primary += "T"; - secondary += "T"; + MetaphAdd(primary, "T"); + MetaphAdd(secondary, "T"); break; case 'V': - if (GetAt(original, current + 1) == 'V') { + if (GetAt(original, current + 1) == 'V') current += 2; - } else { + else current += 1; - } - primary += "F"; - secondary += "F"; + MetaphAdd(primary, "F"); + MetaphAdd(secondary, "F"); break; case 'W': /* can also be in middle of word */ if (StringAt(original, current, 2, "WR", "")) { - primary += "R"; - secondary += "R"; + MetaphAdd(primary, "R"); + 
MetaphAdd(secondary, "R"); current += 2; break; } @@ -817,12 +882,12 @@ void DoubleMetaphone(const string &str, vector *codes) { (IsVowel(original, current + 1) || StringAt(original, current, 2, "WH", ""))) { /* Wasserman should match Vasserman */ if (IsVowel(original, current + 1)) { - primary += "A"; - secondary += "F"; + MetaphAdd(primary, "A"); + MetaphAdd(secondary, "F"); } else { /* need Uomo to match Womo */ - primary += "A"; - secondary += "A"; + MetaphAdd(primary, "A"); + MetaphAdd(secondary, "A"); } } @@ -830,16 +895,16 @@ void DoubleMetaphone(const string &str, vector *codes) { if (((current == last) && IsVowel(original, current - 1)) || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "") || StringAt(original, 0, 3, "SCH", "")) { - primary += ""; - secondary += "F"; + MetaphAdd(primary, ""); + MetaphAdd(secondary, "F"); current += 1; break; } /* polish e.g. 'filipowicz' */ if (StringAt(original, current, 4, "WICZ", "WITZ", "")) { - primary += "TS"; - secondary += "FX"; + MetaphAdd(primary, "TS"); + MetaphAdd(secondary, "FX"); current += 4; break; } @@ -852,8 +917,8 @@ void DoubleMetaphone(const string &str, vector *codes) { /* french e.g. breaux */ if (!((current == last) && (StringAt(original, (current - 3), 3, "IAU", "EAU", "") || StringAt(original, (current - 2), 2, "AU", "OU", "")))) { - primary += "KS"; - secondary += "KS"; + MetaphAdd(primary, "KS"); + MetaphAdd(secondary, "KS"); } if (StringAt(original, (current + 1), 1, "C", "X", "")) @@ -865,18 +930,18 @@ void DoubleMetaphone(const string &str, vector *codes) { case 'Z': /* chinese pinyin e.g. 
'zhao' */ if (GetAt(original, current + 1) == 'H') { - primary += "J"; - secondary += "J"; + MetaphAdd(primary, "J"); + MetaphAdd(secondary, "J"); current += 2; break; } else if (StringAt(original, (current + 1), 2, "ZO", "ZI", "ZA", "") || (SlavoGermanic(original) && ((current > 0) && GetAt(original, current - 1) != 'T'))) { - primary += "S"; - secondary += "TS"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "TS"); } else { - primary += "S"; - secondary += "S"; + MetaphAdd(primary, "S"); + MetaphAdd(secondary, "S"); } if (GetAt(original, current + 1) == 'Z') @@ -888,28 +953,21 @@ void DoubleMetaphone(const string &str, vector *codes) { default: current += 1; } - /* printf("PRIMARY: %s\n", primary.str); - printf("SECONDARY: %s\n", secondary.str); */ + /* printf("PRIMARY: %s\n", primary->str); + printf("SECONDARY: %s\n", secondary->str); */ } - if (primary.length() > max_length) SetAt(primary, max_length, '\0'); + if (primary->length > 4) SetAt(primary, 4, '\0'); - if (secondary.length() > max_length) SetAt(secondary, max_length, '\0'); - - codes->push_back(primary); - codes->push_back(secondary); -} - -extern "C" { -void DoubleMetaphone_c(const char *str, size_t len, char **primary, char **secondary) { - vector codes; - string s = string(str, len); - DoubleMetaphone(s, &codes); - if (primary != NULL) { - *primary = strdup(codes[0].c_str()); + if (secondary->length > 4) SetAt(secondary, 4, '\0'); + if (primary_pp) { + *primary_pp = primary->str; } - if (secondary != NULL) { - *secondary = strdup(codes[1].c_str()); + if (secondary_pp) { + *secondary_pp = secondary->str; } -} + + DestroyMetaString(original); + DestroyMetaString(primary); + DestroyMetaString(secondary); } diff --git a/src/dep/phonetics/double_metaphone.h b/src/dep/phonetics/double_metaphone.h index 13f9f16224..f1f22d102a 100644 --- a/src/dep/phonetics/double_metaphone.h +++ b/src/dep/phonetics/double_metaphone.h @@ -1,11 +1,38 @@ +/* COPYRIGHT NOTICE + * + * This code was pulled directly from the 
Text-DoubleMetaphone perl package, + * version 0.07 + * + * The README mentions that the copyright is: + * + * Copyright 2000, Maurice Aubrey . + * All rights reserved. + + * This code is based heavily on the C++ implementation by + * Lawrence Philips and incorporates several bug fixes courtesy + * of Kevin Atkinson . + * + * This module is free software; you may redistribute it and/or + * modify it under the same terms as Perl itself. + */ + #ifndef DOUBLE_METAPHONE__H #define DOUBLE_METAPHONE__H -#include -#include +#ifdef __cplusplus +extern "C" { +#endif -using namespace std; +typedef struct { + char *str; + int length; + int bufsize; + int free_string_on_destroy; +} metastring; -void DoubleMetaphone(const string &str, vector *codes); +void DoubleMetaphone(const char *str, char **primary_pp, char **secondary_pp); -#endif /* DOUBLE_METAPHONE__H */ +#ifdef __cplusplus +} +#endif +#endif /* DOUBLE_METAPHONE__H */ \ No newline at end of file diff --git a/src/dep/phonetics/double_metaphone_capi.h b/src/dep/phonetics/double_metaphone_capi.h deleted file mode 100644 index 7b4c1431cd..0000000000 --- a/src/dep/phonetics/double_metaphone_capi.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * double_metaphone_capi.h - * - * Created on: Jul 19, 2018 - * Author: meir - */ - -#ifndef SRC_DEP_PHONETICS_DOUBLE_METAPHONE_CAPI_H_ -#define SRC_DEP_PHONETICS_DOUBLE_METAPHONE_CAPI_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -void DoubleMetaphone_c(const char* str, size_t len, char** primary, char** secondary); - -#ifdef __cplusplus -} -#endif - -#endif /* SRC_DEP_PHONETICS_DOUBLE_METAPHONE_CAPI_H_ */ diff --git a/src/ext/default.c b/src/ext/default.c index 4368381df3..92d39993cf 100644 --- a/src/ext/default.c +++ b/src/ext/default.c @@ -323,11 +323,11 @@ void StemmerExpanderFree(void *p) { * ******************************************************************************************/ void PhoneticExpand(RSQueryExpanderCtx *ctx, RSToken *token) { - char* primary = NULL; + char *primary = 
NULL; - PhoneticManager_ExpandPhonerics(NULL, token->str, token->len, &primary, NULL); + PhoneticManager_ExpandPhonetics(NULL, token->str, token->len, &primary, NULL); - if(primary){ + if (primary) { ctx->ExpandToken(ctx, primary, strlen(primary), 0x0); } } @@ -372,7 +372,7 @@ void DefaultExpander(RSQueryExpanderCtx *ctx, RSToken *token) { int phonetic = (*(ctx->currentNode))->opts.phonetic; SynonymExpand(ctx, token); // todo: if phonetic default check if the field spec has phonetics - if(phonetic == PHONETIC_DEFAULT || phonetic == PHONETIC_ENABLED){ + if (phonetic == PHONETIC_DEFAULT || phonetic == PHONETIC_ENABLED) { PhoneticExpand(ctx, token); } diff --git a/src/phonetic_manager.c b/src/phonetic_manager.c index 0a59513d60..4175d296ee 100644 --- a/src/phonetic_manager.c +++ b/src/phonetic_manager.c @@ -1,5 +1,5 @@ #include "phonetic_manager.h" -#include "dep/phonetics/double_metaphone_capi.h" +#include "dep/phonetics/double_metaphone.h" #include #include @@ -13,11 +13,14 @@ static void PhoneticManager_AddPrefix(char** phoneticTerm) { *phoneticTerm[0] = PHONETIC_PREFIX; } -void PhoneticManager_ExpandPhonerics(PhoneticManagerCtx* ctx, const char* term, size_t len, +void PhoneticManager_ExpandPhonetics(PhoneticManagerCtx* ctx, const char* term, size_t len, char** primary, char** secondary) { // currently ctx is irrelevant we support only one universal algorithm for all 4 languages // this phonetic manager was built for future thinking and easily add more algorithms - DoubleMetaphone_c(term, len, primary, secondary); + char bufTmp[len + 1]; + bufTmp[len] = 0; + memcpy(bufTmp, term, len); + DoubleMetaphone(bufTmp, primary, secondary); PhoneticManager_AddPrefix(primary); PhoneticManager_AddPrefix(secondary); } diff --git a/src/phonetic_manager.h b/src/phonetic_manager.h index a775aecd20..2e47efd928 100644 --- a/src/phonetic_manager.h +++ b/src/phonetic_manager.h @@ -17,7 +17,7 @@ typedef struct { char* language; } PhoneticManagerCtx; -void 
PhoneticManager_ExpandPhonerics(PhoneticManagerCtx* ctx, const char* term, size_t len, +void PhoneticManager_ExpandPhonetics(PhoneticManagerCtx* ctx, const char* term, size_t len, char** primary, char** secondary); #endif /* SRC_PHONETIC_MANAGER_H_ */ diff --git a/src/tokenize.c b/src/tokenize.c index fe8ec29ab6..bb6c5c96cd 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -123,7 +123,9 @@ uint32_t simpleTokenizer_Next(RSTokenizer *base, Token *t) { } if ((ctx->options & TOKENIZE_PHONETICS) && normLen >= RSGlobalConfig.minPhoneticTermLen) { - PhoneticManager_ExpandPhonerics(NULL, tok, normLen, &t->phoneticsPrimary, NULL); + // VLA: eww + char phonOrig[normLen + 1]; + PhoneticManager_ExpandPhonetics(NULL, tok, normLen, &t->phoneticsPrimary, NULL); } return ctx->lastOffset;