utf8.c: Split a static function

This adds a new static function for changing the case of an input code point. It differs from the existing function in that it returns the result as code points (the first one directly, and any others through a pointer to an array of them) instead of a combination of the first code point and the UTF-8 of the whole thing, a somewhat awkward API that made more sense when we used swashes. The existing function is retained for now, at least, but most of its work is now done in the new function.
Perl · Aug 16, 2021 · d90a171 · d90a171
1 parent 9d72285
commit d90a171
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 54 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -2499,6 +2499,14 @@ S	|void	|warn_on_first_deprecated_use				    \
 				|NN const char * const file		    \
 				|const unsigned line
 #endif
+S	|UV	|to_case_cp_list|const UV original				\
+				|NN const U32 ** const remaining_list		\
+				|NN Size_t * remaining_count			\
+				|NN SV *invlist					\
+				|NN const I32 * const invmap			\
+				|NULLOK const U32 * const * const aux_tables	\
+				|NULLOK const U8 * const aux_table_lengths	\
+				|NN const char * const normal
 S	|UV	|_to_utf8_case  |const UV original				\
 				|NULLOK const U8 *p				\
 				|NN U8* ustrp					\

diff --git a/embed.h b/embed.h
@@ -1990,6 +1990,7 @@
 #define is_utf8_common(a,b,c)	S_is_utf8_common(aTHX_ a,b,c)
 #define is_utf8_overlong	S_is_utf8_overlong
 #define new_msg_hv(a,b,c)	S_new_msg_hv(aTHX_ a,b,c)
+#define to_case_cp_list(a,b,c,d,e,f,g,h)	S_to_case_cp_list(aTHX_ a,b,c,d,e,f,g,h)
 #define to_lower_latin1		S_to_lower_latin1
 #define turkic_fc(a,b,c,d)	S_turkic_fc(aTHX_ a,b,c,d)
 #define turkic_lc(a,b,c,d)	S_turkic_lc(aTHX_ a,b,c,d)

diff --git a/proto.h b/proto.h
@@ -6668,6 +6668,9 @@ STATIC HV *	S_new_msg_hv(pTHX_ const char * const message, U32 categories, U32 f
 #define PERL_ARGS_ASSERT_NEW_MSG_HV	\
 	assert(message)
 
+STATIC UV	S_to_case_cp_list(pTHX_ const UV original, const U32 ** const remaining_list, Size_t * remaining_count, SV *invlist, const I32 * const invmap, const U32 * const * const aux_tables, const U8 * const aux_table_lengths, const char * const normal);
+#define PERL_ARGS_ASSERT_TO_CASE_CP_LIST	\
+	assert(remaining_list); assert(remaining_count); assert(invlist); assert(invmap); assert(normal)
 STATIC U8	S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp, const char dummy)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_TO_LOWER_LATIN1

diff --git a/utf8.c b/utf8.c
@@ -3166,26 +3166,34 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
 }
 
 STATIC UV
-S__to_utf8_case(pTHX_ const UV original, const U8 *p,
-                      U8* ustrp, STRLEN *lenp,
+S_to_case_cp_list(pTHX_ const UV original,
+                        const U32 ** const remaining_list,
+                        Size_t * remaining_count,
                       SV *invlist, const I32 * const invmap,
                       const U32 * const * const aux_tables,
                       const U8 * const aux_table_lengths,
                       const char * const normal)
 {
-    STRLEN len = 0;
-
-    /* Change the case of code point 'original' whose UTF-8 representation (assumed
-     * by this routine to be valid) begins at 'p'.  'normal' is a string to use
-     * to name the new case in any generated messages, as a fallback if the
-     * operation being used is not available.  The new case is given by the
-     * data structures in the remaining arguments.
+    SSize_t index;
+    I32 base;
+
+    /* Return the changed case of code point 'original'.  The first code point of
+     * the changed case is returned; *remaining_count will be set to how many
+     * other code points are in the changed case.  If it is non-zero,
+     * *remaining_list will point to a non-modifiable array containing them;
+     * if zero, *remaining_list is undefined.
      *
-     * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
-     * entire changed case string, and the return value is the first code point
-     * in that string */
+     * 'normal' is a string to use to name the new case in any generated
+     * messages, as a fallback if the operation being used is not available.
+     *
+     * The casing to use is given by the data structures in the remaining
+     * arguments.
+     */
 
-    PERL_ARGS_ASSERT__TO_UTF8_CASE;
+    PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
+
+    /* Almost all results will be a single value */
+    *remaining_count = 0;
 
     /* For code points that don't change case, we already know that the output
      * of this function is the unchanged input, so we can skip doing look-ups
@@ -3204,7 +3212,7 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
          * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
          * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
         if (original < 0x10A0) {
-            goto cases_to_self;
+            return original;
         }
 
         /* The following largish code point ranges also don't have case
@@ -3231,7 +3239,7 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
              * that the test suite will start having failures to alert you
              * should that happen) */
             if (original < 0xA640) {
-                goto cases_to_self;
+                return original;
             }
 
             if (original >= 0xAC00) {
@@ -3242,13 +3250,13 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
                             "Operation \"%s\" returns its argument for"
                             " UTF-16 surrogate U+%04" UVXf, desc, original);
                     }
-                    goto cases_to_self;
+                    return original;
                 }
 
                 /* AC00..FAFF Catches Hangul syllables and private use, plus
                  * some others */
                 if (original < 0xFB00) {
-                    goto cases_to_self;
+                    return original;
                 }
 
                 if (UNLIKELY(UNICODE_IS_SUPER(original))) {
@@ -3261,12 +3269,13 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
                             "Operation \"%s\" returns its argument for"
                             " non-Unicode code point 0x%04" UVXf, desc, original);
                     }
-                    goto cases_to_self;
+                    return original;
                 }
+
 #ifdef HIGHEST_CASE_CHANGING_CP
-                if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {
 
-                    goto cases_to_self;
+                if (UNLIKELY(original > HIGHEST_CASE_CHANGING_CP)) {
+                    return original;
                 }
 #endif
             }
@@ -3276,64 +3285,90 @@ S__to_utf8_case(pTHX_ const UV original, const U8 *p,
          * be given. */
     }
 
-    {
-        unsigned int i;
-        const U32 * cp_list;
-        U8 * d;
 
         /* 'index' is guaranteed to be non-negative, as this is an inversion
          * map that covers all possible inputs.  See [perl #133365] */
-        SSize_t index = _invlist_search(invlist, original);
-        I32 base = invmap[index];
+        index = _invlist_search(invlist, original);
+        base = invmap[index];
 
         /* The data structures are set up so that if 'base' is non-negative,
          * the case change is 1-to-1; and if 0, the change is to itself */
-        if (base >= 0) {
-            IV lc;
-
-            if (base == 0) {
-                goto cases_to_self;
+        if (LIKELY(base == 0)) {
+            return original;
             }
 
-            /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
-            lc = base + original - invlist_array(invlist)[index];
-            *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
-            return lc;
+        if (LIKELY(base > 0)) {
+            return base + original - invlist_array(invlist)[index];
         }
 
+
         /* Here 'base' is negative.  That means the mapping is 1-to-many, and
          * requires an auxiliary table look up.  abs(base) gives the index into
          * a list of such tables which points to the proper aux table.  And a
          * parallel list gives the length of each corresponding aux table. */
-        cp_list = aux_tables[-base];
+        base = -base;
+        *remaining_list  = aux_tables[base] + 1;
+        *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
 
-        /* Create the string of UTF-8 from the mapped-to code points */
-        d = ustrp;
-        for (i = 0; i < aux_table_lengths[-base]; i++) {
-            d = uvchr_to_utf8(d, cp_list[i]);
-        }
-        *d = '\0';
-        *lenp = d - ustrp;
+        return (UV) aux_tables[base][0];
+}
 
-        return cp_list[0];
-    }
+STATIC UV
+S__to_utf8_case(pTHX_ const UV original, const U8 *p,
+                      U8* ustrp, STRLEN *lenp,
+                      SV *invlist, const I32 * const invmap,
+                      const U32 * const * const aux_tables,
+                      const U8 * const aux_table_lengths,
+                      const char * const normal)
+{
+    /* Change the case of code point 'original'.  If 'p' is non-NULL, it points to
+     * the beginning of the (assumed to be valid) UTF-8 representation of
+     * 'original'.  'normal' is a string to use to name the new case in any
+     * generated messages, as a fallback if the operation being used is not
+     * available.  The new case is given by the data structures in the
+     * remaining arguments.
+     *
+     * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
+     * entire changed case string, and the return value is the first code point
+     * in that string.
+     *
+     * Note that the 'ustrp' buffer needs to be at least UTF8_MAXBYTES_CASE+1
+     * bytes, since the changed version may be longer than the original. */
+
+    const U32 * remaining_list;
+    Size_t remaining_count;
+    UV first = to_case_cp_list(original,
+                               &remaining_list, &remaining_count,
+                               invlist, invmap,
+                               aux_tables, aux_table_lengths,
+                               normal);
+
+    PERL_ARGS_ASSERT__TO_UTF8_CASE;
+
+    /* If the code point maps to itself and we already have its representation,
+     * copy it instead of recalculating */
+    if (original == first && p) {
+        *lenp = UTF8SKIP(p);
 
-    /* Here, there was no mapping defined, which means that the code point maps
-     * to itself.  Return the inputs */
-  cases_to_self:
-    if (p) {
-        len = UTF8SKIP(p);
         if (p != ustrp) {   /* Don't copy onto itself */
-            Copy(p, ustrp, len, U8);
+            Copy(p, ustrp, *lenp, U8);
         }
-        *lenp = len;
     }
     else {
-        *lenp = uvchr_to_utf8(ustrp, original) - ustrp;
+        U8 * d = ustrp;
+        Size_t i;
+
+        d = uvchr_to_utf8(d, first);
+
+        for (i = 0; i < remaining_count; i++) {
+            d = uvchr_to_utf8(d, remaining_list[i]);
     }
 
-    return original;
+        *d = '\0';
+        *lenp = d - ustrp;
+    }
 
+    return first;
 }
 
 Size_t