Skip to content

Commit

Permalink
Add qr/\b{gcb}/
Browse files Browse the repository at this point in the history
A function implements seeing if the space between any two characters is
a grapheme cluster break.  After I wrote this, I realized that an array
lookup might be a better implementation, but the deadline for v5.22 was
too close to change it.  I did see that my gcc optimized it down to
an array lookup.

This makes the implementation of \X go from being complicated to
trivial.
  • Loading branch information
khwilliamson committed Feb 20, 2015
1 parent 0e0b935 commit 64935bc
Show file tree
Hide file tree
Showing 25 changed files with 587 additions and 260 deletions.
1 change: 1 addition & 0 deletions embed.fnc
Expand Up @@ -2276,6 +2276,7 @@ Es |void |to_utf8_substr |NN regexp * prog
Es |bool |to_byte_substr |NN regexp * prog
ERsn |I32 |reg_check_named_buff_matched |NN const regexp *rex \
|NN const regnode *scan
EsnR |bool |isGCB |const PL_GCB_enum before|const PL_GCB_enum after
# ifdef DEBUGGING
Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\
|NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -1054,6 +1054,7 @@
#define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e)
#define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b)
#define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b)
#define isGCB S_isGCB
#define reg_check_named_buff_matched S_reg_check_named_buff_matched
#define regcppop(a,b) S_regcppop(aTHX_ a,b)
#define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c)
Expand Down
1 change: 1 addition & 0 deletions embedvar.h
Expand Up @@ -53,6 +53,7 @@
#define PL_DBtrace (vTHX->IDBtrace)
#define PL_Dir (vTHX->IDir)
#define PL_Env (vTHX->IEnv)
#define PL_GCB_invlist (vTHX->IGCB_invlist)
#define PL_HasMultiCharFold (vTHX->IHasMultiCharFold)
#define PL_InBitmap (vTHX->IInBitmap)
#define PL_LIO (vTHX->ILIO)
Expand Down
1 change: 1 addition & 0 deletions intrpvar.h
Expand Up @@ -610,6 +610,7 @@ PERLVAR(I, utf8_charname_continue, SV *)
PERLVARA(I, utf8_swash_ptrs, POSIX_SWASH_COUNT, SV *)
PERLVARA(I, Posix_ptrs, POSIX_CC_COUNT, SV *)
PERLVARA(I, XPosix_ptrs, POSIX_CC_COUNT, SV *)
PERLVAR(I, GCB_invlist, SV *)

PERLVAR(I, last_swash_hv, HV *)
PERLVAR(I, last_swash_tmps, U8 *)
Expand Down
47 changes: 47 additions & 0 deletions lib/unicore/mktables
Expand Up @@ -18762,6 +18762,7 @@ sub _test_break($$) {
my @should_match = map { eval "\"$_\"" } @should_display;

# If a string can be represented in both non-utf8 and utf8, test both cases
my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {

Expand All @@ -18771,8 +18772,54 @@ sub _test_break($$) {
next UPGRADE if utf8::is_utf8($string);

utf8::upgrade($string);
$display_upgrade = " (utf8-upgraded)";
}

# The /l modifier has C after it to indicate the locale to try
my @modifiers = qw(a aa d lC u i);
push @modifiers, "l$utf8_locale" if defined $utf8_locale;

# Test for each of the regex modifiers.
for my $modifier (@modifiers) {
my $display_locale = "";

# For /l, set the locale to what it says to.
if ($modifier =~ / ^ l (.*) /x) {
my $locale = $1;
$display_locale = "(locale = $locale)";
use Config;
if (defined $Config{d_setlocale}) {
eval { require POSIX; import POSIX 'locale_h'; };
if (defined &POSIX::LC_CTYPE) {
POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
}
}
$modifier = 'l';
}

no warnings qw(locale regexp surrogate);
my $pattern = "(?$modifier:$break_pattern)";

# Actually do the test
my $matched = $string =~ qr/$pattern/;
print "not " unless $matched;

# Fancy display of test results
$matched = ($matched) ? "matched" : "failed to match";
print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";

# Repeat with the first \B{} in the pattern. This makes sure the
# code in regexec.c:find_byclass() for \B gets executed
if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
my $B_pattern = "$1$2";
$matched = $string =~ qr/$B_pattern/;
print "not " unless $matched;
print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
}
}

next if $break_type ne 'gcb';

# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;

Expand Down
3 changes: 2 additions & 1 deletion perl.c
Expand Up @@ -33,7 +33,6 @@
#include "perl.h"
#include "patchlevel.h" /* for local_patches */
#include "XSUB.h"
#include "charclass_invlists.h"

#ifdef NETWARE
#include "nwutil.h"
Expand Down Expand Up @@ -391,6 +390,7 @@ perl_construct(pTHXx)
PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist);
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist);

ENTER;
}
Expand Down Expand Up @@ -1060,6 +1060,7 @@ perl_destruct(pTHXx)
SvREFCNT_dec(PL_XPosix_ptrs[i]);
PL_XPosix_ptrs[i] = NULL;
}
PL_GCB_invlist = NULL;

if (!specialWARN(PL_compiling.cop_warnings))
PerlMemShared_free(PL_compiling.cop_warnings);
Expand Down
1 change: 1 addition & 0 deletions perl.h
Expand Up @@ -2685,6 +2685,7 @@ typedef struct padname PADNAME;
#endif

#include "handy.h"
#include "charclass_invlists.h"

#if defined(USE_LARGE_FILES) && !defined(NO_64_BIT_RAWIO)
# if LSEEKSIZE == 8 && !defined(USE_64_BIT_RAWIO)
Expand Down
2 changes: 1 addition & 1 deletion pod/perlcheat.pod
Expand Up @@ -46,7 +46,7 @@ already be overwhelming.
, => /a ASCII /aa safe {3,7} repeat in range
list ops /l locale /d dual | alternation
not /u Unicode [] character class
and /e evaluate /ee rpts \b word boundary
and /e evaluate /ee rpts \b boundary
or xor /g global \z string end
/o compile pat once () capture
DEBUG (?:p) no capture
Expand Down
26 changes: 15 additions & 11 deletions pod/perldebguts.pod
Expand Up @@ -573,19 +573,23 @@ will be lost.

# Word Boundary Opcodes:
BOUND no Match "" at any word boundary using native
charset rules for non-utf8
BOUNDL no Match "" at any locale word boundary
BOUNDU no Match "" at any word boundary using Unicode
rules
BOUNDA no Match "" at any word boundary using ASCII
rules
charset rules for non-utf8, otherwise
Unicode rules
BOUNDL no Match "" at any boundary of a given type
using locale rules
BOUNDU no Match "" at any boundary of a given type
using Unicode rules
BOUNDA no Match "" at any boundary of a given type
using ASCII rules
NBOUND no Match "" at any word non-boundary using
native charset rules for non-utf8
NBOUNDL no Match "" at any locale word non-boundary
NBOUNDU no Match "" at any word non-boundary using
native charset rules for non-utf8, otherwise
Unicode rules
NBOUNDA no Match "" at any word non-boundary using
ASCII rules
NBOUNDL no Match "" at any boundary of a given type
using locale rules
NBOUNDU no Match "" at any boundary of a given type
using Unicode rules
NBOUNDA no Match "" at any boundary of a given type
using ASCII rules

# [Special] alternatives:
REG_ANY no Match any one character (except newline).
Expand Down
9 changes: 8 additions & 1 deletion pod/perldelta.pod
Expand Up @@ -25,7 +25,14 @@ XXX New core language features go here. Summarize user-visible core language
enhancements. Particularly prominent performance optimisations could go
here, but most should go in the L</Performance Enhancements> section.

[ List each enhancement as a =head2 entry ]
=head2 qr/\b{gcb}/ is now handled in regular expressions

C<gcb> stands for Grapheme Cluster Boundary. It is a Unicode property
that finds the boundary between sequences of characters that look like a
single character to a native speaker of a language. Perl has long had
the ability to deal with these through the C<\X> regular expression escape
sequence. Now, there is an alternative way of handling these. See
L<perlrebackslash/\b{}, \b, \B{}, \B> for details.

=head1 Security

Expand Down
24 changes: 24 additions & 0 deletions pod/perldiag.pod
Expand Up @@ -2894,6 +2894,12 @@ with 'useperlio'.
(F) Your machine doesn't implement the sockatmark() functionality,
neither as a system call nor an ioctl call (SIOCATMARK).

=item '%s' is an unknown bound type in regex; marked by <-- HERE in m/%s/

(F) You used C<\b{...}> or C<\B{...}> and the C<...> is not known to
Perl. The current valid ones are given in
L<perlrebackslash/\b{}, \b, \B{}, \B>.

=item "%s" is more clearly written simply as "%s" in regex; marked by <-- HERE in m/%s/

(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
Expand Down Expand Up @@ -6638,6 +6644,15 @@ is deprecated. See L<perlvar/"$[">.
form if you wish to use an empty line as the terminator of the
here-document.

=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale

(W locale) You are matching a regular expression using locale rules,
and a Unicode boundary is being matched, but the locale is not a Unicode
one. This doesn't make sense. Perl will continue, assuming a Unicode
(UTF-8) locale, but the results could well be wrong except if the locale
happens to be ISO-8859-1 (Latin1) where this message is spurious and can
be ignored.

=item Use of chdir('') or chdir(undef) as chdir() deprecated

(D deprecated) chdir() with no arguments is documented to change to
Expand Down Expand Up @@ -6859,6 +6874,15 @@ a range. For these, what should happen isn't clear at all. In
these circumstances, Perl discards all but the first character
of the returned sequence, which is not likely what you want.

=item Using /u for '%s' instead of /%s in regex; marked by <-- HERE in m/%s/

(W regexp) You used a Unicode boundary (C<\b{...}> or C<\B{...}>) in a
portion of a regular expression where the character set modifiers C</a>
or C</aa> are in effect. These two modifiers indicate an ASCII
interpretation, and this doesn't make sense for a Unicode definition.
The generated regular expression will compile so that the boundary uses
all of Unicode. No other portion of the regular expression is affected.

=item Using !~ with %s doesn't make sense

(F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is
Expand Down
12 changes: 12 additions & 0 deletions pod/perlre.pod
Expand Up @@ -388,6 +388,10 @@ the pattern uses a Unicode property (C<\p{...}> or C<\P{...}>); or

=item 6

the pattern uses a Unicode break (C<\b{...}> or C<\B{...}>); or

=item 7

the pattern uses L</C<(?[ ])>>

=back
Expand Down Expand Up @@ -770,6 +774,8 @@ X<regexp, zero-width assertion>
X<regular expression, zero-width assertion>
X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>

\b{} Match at Unicode boundary of specified type
\B{} Match where corresponding \b{} doesn't match
\b Match a word boundary
\B Match except at a word boundary
\A Match only at beginning of string
Expand All @@ -778,6 +784,12 @@ X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>
\G Match only at pos() (e.g. at the end-of-match position
of prior m//g)

A Unicode boundary (C<\b{}>), available starting in v5.22, is a spot
between two characters, or before the first character in the string, or
after the final character in the string where certain criteria defined
by Unicode are met. See L<perlrebackslash/\b{}, \b, \B{}, \B> for
details.

A word boundary (C<\b>) is a spot between two characters
that has a C<\w> on one side of it and a C<\W> on the other side
of it (in either order), counting the imaginary characters off the
Expand Down
39 changes: 31 additions & 8 deletions pod/perlrebackslash.pod
Expand Up @@ -66,8 +66,8 @@ as C<Not in [].>
\1 Absolute backreference. Not in [].
\a Alarm or bell.
\A Beginning of string. Not in [].
\b Word/non-word boundary. (Backspace in []).
\B Not a word/non-word boundary. Not in [].
\b{}, \b Boundary. (\b is a backspace in []).
\B{}, \B Not a boundary.
\cX Control-X.
\C Single octet, even under UTF-8. Not in [].
(Deprecated)
Expand Down Expand Up @@ -134,7 +134,8 @@ description. (For EBCDIC platforms, see L<perlebcdic/OPERATOR DIFFERENCES>.)
=item [1]

C<\b> is the backspace character only inside a character class. Outside a
character class, C<\b> is a word/non-word boundary.
character class, C<\b> alone is a word-character/non-word-character
boundary, and C<\b{}> is some other type of boundary.

=item [2]

Expand Down Expand Up @@ -525,10 +526,21 @@ or the beginning of that string if there was no previous match.

Mnemonic: I<G>lobal.

=item \b, \B
=item \b{}, \b, \B{}, \B

C<\b> matches at any place between a word and a non-word character; C<\B>
matches at any place between characters where C<\b> doesn't match. C<\b>
C<\b{...}>, available starting in v5.22, matches a boundary (between two
characters, or before the first character of the string, or after the
final character of the string) based on the Unicode rules for the
boundary type specified inside the braces. The currently known boundary
types are given a few paragraphs below. C<\B{...}> matches at any place
between characters where C<\b{...}> of the same type doesn't match.

C<\b> when not immediately followed by a C<"{"> matches at any place
between a word (something matched by C<\w>) and a non-word character
(C<\W>); C<\B> when not immediately followed by a C<"{"> matches at any
place between characters where C<\b> doesn't match.

C<\b>
and C<\B> assume there's a non-word character before the beginning and after
the end of the source string; so C<\b> will match at the beginning (or end)
of the source string if the source string begins (or ends) with a word
Expand All @@ -537,13 +549,22 @@ character. Otherwise, C<\B> will match.
Do not use something like C<\b=head\d\b> and expect it to match the
beginning of a line. It can't, because for there to be a boundary before
the non-word "=", there must be a word character immediately previous.
All boundary determinations look for word characters alone, not for
non-words characters nor for string ends. It may help to understand how
All plain C<\b> and C<\B> boundary determinations look for word
characters alone, not for
non-word characters nor for string ends. It may help to understand how
C<\b> and C<\B> work by equating them as follows:

\b really means (?:(?<=\w)(?!\w)|(?<!\w)(?=\w))
\B really means (?:(?<=\w)(?=\w)|(?<!\w)(?!\w))

In contrast, C<\b{...}> always matches at the beginning and end of the
line (and C<\B{...}> never does). The only boundary type currently
available is "Grapheme Cluster Boundary". (Actually Perl always uses the
improved "extended" grapheme cluster.) These are explained below under C<\X>.
In fact, C<\X> is another way to get the same functionality. It is
equivalent to C</.+?\b{gcb}/>. Use whichever is most convenient for
your situation.

Mnemonic: I<b>oundary.

=back
Expand Down Expand Up @@ -650,6 +671,8 @@ were a single character.
The match is greedy and non-backtracking, so that the cluster is never
broken up into smaller components.

See also L<C<\b{gcb}>|/\b{}, \b, \B{}, \B>.

Mnemonic: eI<X>tended Unicode character.

=back
Expand Down
2 changes: 2 additions & 0 deletions pod/perlreref.pod
Expand Up @@ -201,6 +201,8 @@ All are zero-width assertions.

^ Match string start (or line, if /m is used)
$ Match string end (or line, if /m is used) or before newline
\b{} Match boundary of type specified within the braces
\B{} Match wherever \b{} doesn't match
\b Match word boundary (between \w and \W)
\B Match except at word boundary (between \w and \w or \W and \W)
\A Match string start (regardless of /m)
Expand Down
8 changes: 5 additions & 3 deletions pod/perlunicode.pod
Expand Up @@ -1100,7 +1100,8 @@ Level 2 - Extended Unicode Support

[10] see UAX#15 "Unicode Normalization Forms"
[11] have Unicode::Normalize but not integrated to regexes
[12] have \X but we don't have a "Grapheme Cluster Mode"
[12] have \X and \b{gcb} but we don't have a "Grapheme Cluster
Mode"
[14] see UAX#29, Word Boundaries
[15] This is covered in Chapter 3.13 (in Unicode 6.0)

Expand Down Expand Up @@ -1575,8 +1576,9 @@ regular expressions outside the scope.

=item *

Matching any of several properties in regular expressions, namely C<\b>,
C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
Matching any of several properties in regular expressions, namely
C<\b> (without braces), C<\B> (without braces), C<\s>, C<\S>, C<\w>,
C<\W>, and all the Posix character classes
I<except> C<[[:ascii:]]>.
Starting in Perl 5.14.0, regular expressions compiled within
the scope of C<unicode_strings> use character semantics
Expand Down
3 changes: 3 additions & 0 deletions proto.h
Expand Up @@ -7432,6 +7432,9 @@ STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
assert(character)

STATIC bool S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
__attribute__warn_unused_result__;

STATIC I32 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
__attribute__warn_unused_result__
__attribute__nonnull__(1)
Expand Down

0 comments on commit 64935bc

Please sign in to comment.