Add qr/\b{gcb}/

A function implements seeing if the space between any two characters is a grapheme cluster break. After I wrote this, I realized that an array lookup might be a better implementation, but the deadline for v5.22 was too close to change it. I did see that my gcc optimized it down to an array lookup. This makes the implementation of \X go from being complicated to trivial.
Perl · Feb 20, 2015 · 64935bc · 64935bc
1 parent 0e0b935
commit 64935bc
Show file tree

Hide file tree

Showing 25 changed files with 587 additions and 260 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -2276,6 +2276,7 @@ Es	|void	|to_utf8_substr	|NN regexp * prog
 Es	|bool	|to_byte_substr	|NN regexp * prog
 ERsn	|I32	|reg_check_named_buff_matched	|NN const regexp *rex \
 						|NN const regnode *scan
+EsnR	|bool	|isGCB		|const PL_GCB_enum before|const PL_GCB_enum after
 #  ifdef DEBUGGING
 Es	|void	|dump_exec_pos	|NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\
 				|NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8

diff --git a/embed.h b/embed.h
@@ -1054,6 +1054,7 @@
 #define find_byclass(a,b,c,d,e)	S_find_byclass(aTHX_ a,b,c,d,e)
 #define isFOO_lc(a,b)		S_isFOO_lc(aTHX_ a,b)
 #define isFOO_utf8_lc(a,b)	S_isFOO_utf8_lc(aTHX_ a,b)
+#define isGCB			S_isGCB
 #define reg_check_named_buff_matched	S_reg_check_named_buff_matched
 #define regcppop(a,b)		S_regcppop(aTHX_ a,b)
 #define regcppush(a,b,c)	S_regcppush(aTHX_ a,b,c)

diff --git a/embedvar.h b/embedvar.h
@@ -53,6 +53,7 @@
 #define PL_DBtrace		(vTHX->IDBtrace)
 #define PL_Dir			(vTHX->IDir)
 #define PL_Env			(vTHX->IEnv)
+#define PL_GCB_invlist		(vTHX->IGCB_invlist)
 #define PL_HasMultiCharFold	(vTHX->IHasMultiCharFold)
 #define PL_InBitmap		(vTHX->IInBitmap)
 #define PL_LIO			(vTHX->ILIO)

diff --git a/intrpvar.h b/intrpvar.h
@@ -610,6 +610,7 @@ PERLVAR(I, utf8_charname_continue, SV *)
 PERLVARA(I, utf8_swash_ptrs, POSIX_SWASH_COUNT, SV *)
 PERLVARA(I, Posix_ptrs, POSIX_CC_COUNT, SV *)
 PERLVARA(I, XPosix_ptrs, POSIX_CC_COUNT, SV *)
+PERLVAR(I, GCB_invlist, SV *)
 
 PERLVAR(I, last_swash_hv, HV *)
 PERLVAR(I, last_swash_tmps, U8 *)

diff --git a/lib/unicore/mktables b/lib/unicore/mktables
@@ -18762,6 +18762,7 @@ sub _test_break($$) {
     my @should_match = map { eval "\"$_\"" } @should_display;
 
     # If a string can be represented in both non-utf8 and utf8, test both cases
+    my $display_upgrade = "";
     UPGRADE:
     for my $to_upgrade (0 .. 1) {
 
@@ -18771,8 +18772,54 @@ sub _test_break($$) {
             next UPGRADE if utf8::is_utf8($string);
 
             utf8::upgrade($string);
+            $display_upgrade = " (utf8-upgraded)";
+        }
+
+        # The /l modifier has C after it to indicate the locale to try
+        my @modifiers = qw(a aa d lC u i);
+        push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+        # Test for each of the regex modifiers.
+        for my $modifier (@modifiers) {
+            my $display_locale = "";
+
+            # For /l, set the locale to what it says to.
+            if ($modifier =~ / ^ l (.*) /x) {
+                my $locale = $1;
+                $display_locale = "(locale = $locale)";
+                use Config;
+                if (defined $Config{d_setlocale}) {
+                    eval { require POSIX; import POSIX 'locale_h'; };
+                    if (defined &POSIX::LC_CTYPE) {
+                        POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
+                    }
+                }
+                $modifier = 'l';
+            }
+
+            no warnings qw(locale regexp surrogate);
+            my $pattern = "(?$modifier:$break_pattern)";
+
+            # Actually do the test
+            my $matched = $string =~ qr/$pattern/;
+            print "not " unless $matched;
+
+            # Fancy display of test results
+            $matched = ($matched) ? "matched" : "failed to match";
+            print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+
+            # Repeat with the first \B{} in the pattern.  This makes sure the
+            # code in regexec.c:find_byclass() for \B gets executed
+            if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
+                my $B_pattern = "$1$2";
+                $matched = $string =~ qr/$B_pattern/;
+                print "not " unless $matched;
+                print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+            }
         }
 
+        next if $break_type ne 'gcb';
+
         # Finally, do the \X match.
         my @matches = $string =~ /(\X)/g;
 

diff --git a/perl.c b/perl.c
@@ -33,7 +33,6 @@
 #include "perl.h"
 #include "patchlevel.h"			/* for local_patches */
 #include "XSUB.h"
-#include "charclass_invlists.h"
 
 #ifdef NETWARE
 #include "nwutil.h"	
@@ -391,6 +390,7 @@ perl_construct(pTHXx)
     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist);
     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
+    PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist);
 
     ENTER;
 }
@@ -1060,6 +1060,7 @@ perl_destruct(pTHXx)
         SvREFCNT_dec(PL_XPosix_ptrs[i]);
         PL_XPosix_ptrs[i] = NULL;
     }
+    PL_GCB_invlist = NULL;
 
     if (!specialWARN(PL_compiling.cop_warnings))
 	PerlMemShared_free(PL_compiling.cop_warnings);

diff --git a/perl.h b/perl.h
@@ -2685,6 +2685,7 @@ typedef struct padname PADNAME;
 #endif
 
 #include "handy.h"
+#include "charclass_invlists.h"
 
 #if defined(USE_LARGE_FILES) && !defined(NO_64_BIT_RAWIO)
 #   if LSEEKSIZE == 8 && !defined(USE_64_BIT_RAWIO)

diff --git a/pod/perlcheat.pod b/pod/perlcheat.pod
@@ -46,7 +46,7 @@ already be overwhelming.
   , =>            /a ASCII    /aa safe  {3,7}  repeat in range
   list ops        /l locale   /d  dual  |      alternation
   not             /u Unicode            []     character class
-  and             /e evaluate /ee rpts  \b     word boundary
+  and             /e evaluate /ee rpts  \b     boundary
   or xor          /g global             \z     string end
                   /o compile pat once   ()     capture
   DEBUG                                 (?:p)  no capture

diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
@@ -573,19 +573,23 @@ will be lost.
 
  # Word Boundary Opcodes:
  BOUND           no         Match "" at any word boundary using native
-                            charset rules for non-utf8
- BOUNDL          no         Match "" at any locale word boundary
- BOUNDU          no         Match "" at any word boundary using Unicode
-                            rules
- BOUNDA          no         Match "" at any word boundary using ASCII
-                            rules
+                            charset rules for non-utf8, otherwise
+                            Unicode rules
+ BOUNDL          no         Match "" at any boundary of a given type
+                            using locale rules
+ BOUNDU          no         Match "" at any boundary of a given type
+                            using Unicode rules
+ BOUNDA          no         Match "" at any boundary of a given type
+                            using ASCII rules
  NBOUND          no         Match "" at any word non-boundary using
-                            native charset rules for non-utf8
- NBOUNDL         no         Match "" at any locale word non-boundary
- NBOUNDU         no         Match "" at any word non-boundary using
+                            native charset rules for non-utf8, otherwise
                             Unicode rules
- NBOUNDA         no         Match "" at any word non-boundary using
-                            ASCII rules
+ NBOUNDL         no         Match "" at any non-boundary of a given type
+                            using locale rules
+ NBOUNDU         no         Match "" at any non-boundary of a given type
+                            using Unicode rules
+ NBOUNDA         no         Match "" at any non-boundary of a given type
+                            using ASCII rules
 
  # [Special] alternatives:
  REG_ANY         no         Match any one character (except newline).

diff --git a/pod/perldelta.pod b/pod/perldelta.pod
@@ -25,7 +25,14 @@ XXX New core language features go here.  Summarize user-visible core language
 enhancements.  Particularly prominent performance optimisations could go
 here, but most should go in the L</Performance Enhancements> section.
 
-[ List each enhancement as a =head2 entry ]
+=head2 qr/\b{gcb}/ is now handled in regular expressions
+
+C<gcb> stands for Grapheme Cluster Boundary.  It is a Unicode property
+that finds the boundary between sequences of characters that look like a
+single character to a native speaker of a language.  Perl has long had
+the ability to deal with these through the C<\X> regular expression escape
+sequence.  Now, there is an alternative way of handling these.  See
+L<perlrebackslash/\b{}, \b, \B{}, \B> for details.
 
 =head1 Security
 

diff --git a/pod/perldiag.pod b/pod/perldiag.pod
@@ -2894,6 +2894,12 @@ with 'useperlio'.
 (F) Your machine doesn't implement the sockatmark() functionality,
 neither as a system call nor an ioctl call (SIOCATMARK).
 
+=item '%s' is an unknown bound type in regex; marked by <-- HERE in m/%s/
+
+(F) You used C<\b{...}> or C<\B{...}> and the C<...> is not known to
+Perl.  The current valid ones are given in
+L<perlrebackslash/\b{}, \b, \B{}, \B>.
+
 =item "%s" is more clearly written simply as "%s" in regex; marked by <-- HERE in m/%s/
 
 (W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
@@ -6638,6 +6644,15 @@ is deprecated.  See L<perlvar/"$[">.
 form if you wish to use an empty line as the terminator of the
 here-document.
 
+=item Use of \b{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale
+
+(W locale)  You are matching a regular expression using locale rules,
+and a Unicode boundary is being matched, but the locale is not a Unicode
+one.  This doesn't make sense.  Perl will continue, assuming a Unicode
+(UTF-8) locale, but the results could well be wrong except if the locale
+happens to be ISO-8859-1 (Latin1) where this message is spurious and can
+be ignored.
+
 =item Use of chdir('') or chdir(undef) as chdir() deprecated
 
 (D deprecated) chdir() with no arguments is documented to change to
@@ -6859,6 +6874,15 @@ a range.  For these, what should happen isn't clear at all.  In
 these circumstances, Perl discards all but the first character
 of the returned sequence, which is not likely what you want.
 
+=item Using /u for '%s' instead of /%s in regex; marked by <-- HERE in m/%s/
+
+(W regexp) You used a Unicode boundary (C<\b{...}> or C<\B{...}>) in a
+portion of a regular expression where the character set modifiers C</a>
+or C</aa> are in effect.  These two modifiers indicate an ASCII
+interpretation, and this doesn't make sense for a Unicode definition.
+The generated regular expression will compile so that the boundary uses
+all of Unicode.  No other portion of the regular expression is affected.
+
 =item Using !~ with %s doesn't make sense
 
 (F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is

diff --git a/pod/perlre.pod b/pod/perlre.pod
@@ -388,6 +388,10 @@ the pattern uses a Unicode property (C<\p{...}> or C<\P{...}>); or
 
 =item 6
 
+the pattern uses a Unicode break (C<\b{...}> or C<\B{...}>); or
+
+=item 7
+
 the pattern uses L</C<(?[ ])>>
 
 =back
@@ -770,6 +774,8 @@ X<regexp, zero-width assertion>
 X<regular expression, zero-width assertion>
 X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>
 
+    \b{} Match at Unicode boundary of specified type
+    \B{} Match where corresponding \b{} doesn't match
     \b  Match a word boundary
     \B  Match except at a word boundary
     \A  Match only at beginning of string
@@ -778,6 +784,12 @@ X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>
     \G  Match only at pos() (e.g. at the end-of-match position
         of prior m//g)
 
+A Unicode boundary (C<\b{}>), available starting in v5.22, is a spot
+between two characters, or before the first character in the string, or
+after the final character in the string where certain criteria defined
+by Unicode are met.  See L<perlrebackslash/\b{}, \b, \B{}, \B> for
+details.
+
 A word boundary (C<\b>) is a spot between two characters
 that has a C<\w> on one side of it and a C<\W> on the other side
 of it (in either order), counting the imaginary characters off the

diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod
@@ -66,8 +66,8 @@ as C<Not in [].>
  \1                Absolute backreference.  Not in [].
  \a                Alarm or bell.
  \A                Beginning of string.  Not in [].
- \b                Word/non-word boundary. (Backspace in []).
- \B                Not a word/non-word boundary.  Not in [].
+ \b{}, \b          Boundary. (\b is a backspace in []).
+ \B{}, \B          Not a boundary.
  \cX               Control-X.
  \C                Single octet, even under UTF-8.  Not in [].
                    (Deprecated)
@@ -134,7 +134,8 @@ description.  (For EBCDIC platforms, see L<perlebcdic/OPERATOR DIFFERENCES>.)
 =item [1]
 
 C<\b> is the backspace character only inside a character class. Outside a
-character class, C<\b> is a word/non-word boundary.
+character class, C<\b> alone is a word-character/non-word-character
+boundary, and C<\b{}> is some other type of boundary.
 
 =item [2]
 
@@ -525,10 +526,21 @@ or the beginning of that string if there was no previous match.
 
 Mnemonic: I<G>lobal.
 
-=item \b, \B
+=item \b{}, \b, \B{}, \B
 
-C<\b> matches at any place between a word and a non-word character; C<\B>
-matches at any place between characters where C<\b> doesn't match. C<\b>
+C<\b{...}>, available starting in v5.22, matches a boundary (between two
+characters, or before the first character of the string, or after the
+final character of the string) based on the Unicode rules for the
+boundary type specified inside the braces.  The currently known boundary
+types are given a few paragraphs below.  C<\B{...}> matches at any place
+between characters where C<\b{...}> of the same type doesn't match.
+
+C<\b> when not immediately followed by a C<"{"> matches at any place
+between a word (something matched by C<\w>) and a non-word character
+(C<\W>); C<\B> when not immediately followed by a C<"{"> matches at any
+place between characters where C<\b> doesn't match.
+
+C<\b>
 and C<\B> assume there's a non-word character before the beginning and after
 the end of the source string; so C<\b> will match at the beginning (or end)
 of the source string if the source string begins (or ends) with a word
@@ -537,13 +549,22 @@ character. Otherwise, C<\B> will match.
 Do not use something like C<\b=head\d\b> and expect it to match the
 beginning of a line.  It can't, because for there to be a boundary before
 the non-word "=", there must be a word character immediately previous.  
-All boundary determinations look for word characters alone, not for
-non-words characters nor for string ends.  It may help to understand how
+All plain C<\b> and C<\B> boundary determinations look for word
+characters alone, not for
+non-word characters nor for string ends.  It may help to understand how
 C<\b> and C<\B> work by equating them as follows:
 
     \b	really means	(?:(?<=\w)(?!\w)|(?<!\w)(?=\w))
     \B	really means	(?:(?<=\w)(?=\w)|(?<!\w)(?!\w))
 
+In contrast, C<\b{...}> always matches at the beginning and end of the
+line (and C<\B{...}> never does).  The only boundary type currently
+available is "Grapheme Cluster Boundary".  (Actually Perl always uses
+the improved "extended" grapheme cluster.)  These are explained below
+under C<\X>.
+In fact, C<\X> is another way to get the same functionality.  It is
+equivalent to C</.+?\b{gcb}/>.  Use whichever is most convenient for
+your situation.
+
 Mnemonic: I<b>oundary.
 
 =back
@@ -650,6 +671,8 @@ were a single character.
 The match is greedy and non-backtracking, so that the cluster is never
 broken up into smaller components.
 
+See also L<C<\b{gcb}>|/\b{}, \b, \B{}, \B>.
+
 Mnemonic: eI<X>tended Unicode character.
 
 =back

diff --git a/pod/perlreref.pod b/pod/perlreref.pod
@@ -201,6 +201,8 @@ All are zero-width assertions.
 
    ^  Match string start (or line, if /m is used)
    $  Match string end (or line, if /m is used) or before newline
+   \b{} Match boundary of type specified within the braces
+   \B{} Match wherever \b{} doesn't match
    \b Match word boundary (between \w and \W)
    \B Match except at word boundary (between \w and \w or \W and \W)
    \A Match string start (regardless of /m)

diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
@@ -1100,7 +1100,8 @@ Level 2 - Extended Unicode Support
 
  [10] see UAX#15 "Unicode Normalization Forms"
  [11] have Unicode::Normalize but not integrated to regexes
- [12] have \X but we don't have a "Grapheme Cluster Mode"
+ [12] have \X and \b{gcb} but we don't have a "Grapheme Cluster
+      Mode"
  [14] see UAX#29, Word Boundaries
  [15] This is covered in Chapter 3.13 (in Unicode 6.0)
 
@@ -1575,8 +1576,9 @@ regular expressions outside the scope.
 
 =item *
 
-Matching any of several properties in regular expressions, namely C<\b>,
-C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
+Matching any of several properties in regular expressions, namely
+C<\b> (without braces), C<\B> (without braces), C<\s>, C<\S>, C<\w>,
+C<\W>, and all the Posix character classes
 I<except> C<[[:ascii:]]>.
 Starting in Perl 5.14.0, regular expressions compiled within
 the scope of C<unicode_strings> use character semantics

diff --git a/proto.h b/proto.h
@@ -7432,6 +7432,9 @@ STATIC bool	S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 #define PERL_ARGS_ASSERT_ISFOO_UTF8_LC	\
 	assert(character)
 
+STATIC bool	S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+			__attribute__warn_unused_result__;
+
 STATIC I32	S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(1)