Skip to content

Commit

Permalink
utf8.c: Split a static function
Browse files Browse the repository at this point in the history
This adds a new function, to_case_cp_list(), for changing the case of an
input code point.  It differs from the existing function, _to_utf8_case(),
in how it hands back the result: the new one returns the first code point
of the changed case and gives access to any remaining code points through
a pointer to an array (plus a count), whereas the existing one returns a
combination of the first code point and the UTF-8 of the whole thing, a
somewhat awkward API that made more sense when we used swashes.  The
existing function is retained, for now at least, but most of the work is
now done in the new function.
  • Loading branch information
khwilliamson committed Aug 16, 2021
1 parent 9d72285 commit d90a171
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 54 deletions.
8 changes: 8 additions & 0 deletions embed.fnc
Expand Up @@ -2499,6 +2499,14 @@ S |void |warn_on_first_deprecated_use \
|NN const char * const file \
|const unsigned line
#endif
S |UV |to_case_cp_list|const UV original \
|NN const U32 ** const remaining_list \
|NN Size_t * remaining_count \
|NN SV *invlist \
|NN const I32 * const invmap \
|NULLOK const U32 * const * const aux_tables \
|NULLOK const U8 * const aux_table_lengths \
|NN const char * const normal
S |UV |_to_utf8_case |const UV original \
|NULLOK const U8 *p \
|NN U8* ustrp \
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -1990,6 +1990,7 @@
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
#define is_utf8_overlong S_is_utf8_overlong
#define new_msg_hv(a,b,c) S_new_msg_hv(aTHX_ a,b,c)
#define to_case_cp_list(a,b,c,d,e,f,g,h) S_to_case_cp_list(aTHX_ a,b,c,d,e,f,g,h)
#define to_lower_latin1 S_to_lower_latin1
#define turkic_fc(a,b,c,d) S_turkic_fc(aTHX_ a,b,c,d)
#define turkic_lc(a,b,c,d) S_turkic_lc(aTHX_ a,b,c,d)
Expand Down
3 changes: 3 additions & 0 deletions proto.h
Expand Up @@ -6668,6 +6668,9 @@ STATIC HV * S_new_msg_hv(pTHX_ const char * const message, U32 categories, U32 f
#define PERL_ARGS_ASSERT_NEW_MSG_HV \
assert(message)

STATIC UV S_to_case_cp_list(pTHX_ const UV original, const U32 ** const remaining_list, Size_t * remaining_count, SV *invlist, const I32 * const invmap, const U32 * const * const aux_tables, const U8 * const aux_table_lengths, const char * const normal);
#define PERL_ARGS_ASSERT_TO_CASE_CP_LIST \
assert(remaining_list); assert(remaining_count); assert(invlist); assert(invmap); assert(normal)
STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp, const char dummy)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_TO_LOWER_LATIN1
Expand Down
143 changes: 89 additions & 54 deletions utf8.c
Expand Up @@ -3166,26 +3166,34 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
}

STATIC UV
S__to_utf8_case(pTHX_ const UV original, const U8 *p,
U8* ustrp, STRLEN *lenp,
S_to_case_cp_list(pTHX_ const UV original,
const U32 ** const remaining_list,
Size_t * remaining_count,
SV *invlist, const I32 * const invmap,
const U32 * const * const aux_tables,
const U8 * const aux_table_lengths,
const char * const normal)
{
STRLEN len = 0;

/* Change the case of code point 'original' whose UTF-8 representation (assumed
* by this routine to be valid) begins at 'p'. 'normal' is a string to use
* to name the new case in any generated messages, as a fallback if the
* operation being used is not available. The new case is given by the
* data structures in the remaining arguments.
SSize_t index;
I32 base;

/* Return the changed case of code point 'original'. The first code point of
* the changed case is returned; *remaining_count will be set to how many
* other code points are in the changed case. If it is non-zero,
* *remaining_list will point to a non-modifiable array containing them;
* if zero, *remaining_list is undefined.
*
* On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
* entire changed case string, and the return value is the first code point
* in that string */
* 'normal' is a string to use to name the new case in any generated
* messages, as a fallback if the operation being used is not available.
*
* The casing to use is given by the data structures in the remaining
* arguments.
*/

PERL_ARGS_ASSERT__TO_UTF8_CASE;
PERL_ARGS_ASSERT_TO_CASE_CP_LIST;

/* Almost all results will be a single value */
*remaining_count = 0;

/* For code points that don't change case, we already know that the output
* of this function is the unchanged input, so we can skip doing look-ups
Expand All @@ -3204,7 +3212,7 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
* Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
* Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
if (original < 0x10A0) {
goto cases_to_self;
return original;
}

/* The following largish code point ranges also don't have case
Expand All @@ -3231,7 +3239,7 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
* that the test suite will start having failures to alert you
* should that happen) */
if (original < 0xA640) {
goto cases_to_self;
return original;
}

if (original >= 0xAC00) {
Expand All @@ -3242,13 +3250,13 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
"Operation \"%s\" returns its argument for"
" UTF-16 surrogate U+%04" UVXf, desc, original);
}
goto cases_to_self;
return original;
}

/* AC00..FAFF Catches Hangul syllables and private use, plus
* some others */
if (original < 0xFB00) {
goto cases_to_self;
return original;
}

if (UNLIKELY(UNICODE_IS_SUPER(original))) {
Expand All @@ -3261,12 +3269,13 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
"Operation \"%s\" returns its argument for"
" non-Unicode code point 0x%04" UVXf, desc, original);
}
goto cases_to_self;
return original;
}

#ifdef HIGHEST_CASE_CHANGING_CP
if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {

goto cases_to_self;
if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {
return original;
}
#endif
}
Expand All @@ -3276,64 +3285,90 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
* be given. */
}

{
unsigned int i;
const U32 * cp_list;
U8 * d;

/* 'index' is guaranteed to be non-negative, as this is an inversion
* map that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(invlist, original);
I32 base = invmap[index];
index = _invlist_search(invlist, original);
base = invmap[index];

/* The data structures are set up so that if 'base' is non-negative,
* the case change is 1-to-1; and if 0, the change is to itself */
if (base >= 0) {
IV lc;

if (base == 0) {
goto cases_to_self;
if (LIKELY(base == 0)) {
return original;
}

/* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
lc = base + original - invlist_array(invlist)[index];
*lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
return lc;
if (LIKELY(base > 0)) {
return base + original - invlist_array(invlist)[index];
}


/* Here 'base' is negative. That means the mapping is 1-to-many, and
* requires an auxiliary table look up. abs(base) gives the index into
* a list of such tables which points to the proper aux table. And a
* parallel list gives the length of each corresponding aux table. */
cp_list = aux_tables[-base];
base = -base;
*remaining_list = aux_tables[base] + 1;
*remaining_count = (Size_t) (aux_table_lengths[base] - 1);

/* Create the string of UTF-8 from the mapped-to code points */
d = ustrp;
for (i = 0; i < aux_table_lengths[-base]; i++) {
d = uvchr_to_utf8(d, cp_list[i]);
}
*d = '\0';
*lenp = d - ustrp;
return (UV) aux_tables[base][0];
}

return cp_list[0];
}
STATIC UV
S__to_utf8_case(pTHX_ const UV original, const U8 *p,
U8* ustrp, STRLEN *lenp,
SV *invlist, const I32 * const invmap,
const U32 * const * const aux_tables,
const U8 * const aux_table_lengths,
const char * const normal)
{
/* Change the case of code point 'original'. If 'p' is non-NULL, it points to
* the beginning of the (assumed to be valid) UTF-8 representation of
* 'original'. 'normal' is a string to use to name the new case in any
* generated messages, as a fallback if the operation being used is not
* available. The new case is given by the data structures in the
* remaining arguments.
*
* On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
* entire changed case string, and the return value is the first code point
* in that string
*
* Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
* since the changed version may be longer than the original character. */

const U32 * remaining_list;
Size_t remaining_count;
UV first = to_case_cp_list(original,
&remaining_list, &remaining_count,
invlist, invmap,
aux_tables, aux_table_lengths,
normal);

PERL_ARGS_ASSERT__TO_UTF8_CASE;

/* If the code point maps to itself and we already have its representation,
* copy it instead of recalculating */
if (original == first && p) {
*lenp = UTF8SKIP(p);

/* Here, there was no mapping defined, which means that the code point maps
* to itself. Return the inputs */
cases_to_self:
if (p) {
len = UTF8SKIP(p);
if (p != ustrp) { /* Don't copy onto itself */
Copy(p, ustrp, len, U8);
Copy(p, ustrp, *lenp, U8);
}
*lenp = len;
}
else {
*lenp = uvchr_to_utf8(ustrp, original) - ustrp;
U8 * d = ustrp;
Size_t i;

d = uvchr_to_utf8(d, first);

for (i = 0; i < remaining_count; i++) {
d = uvchr_to_utf8(d, remaining_list[i]);
}

return original;
*d = '\0';
*lenp = d - ustrp;
}

return first;
}

Size_t
Expand Down

0 comments on commit d90a171

Please sign in to comment.