Skip to content

Commit

Permalink
Make fc(), /i thread-safe on participating platforms
Browse files Browse the repository at this point in the history
A long-standing bug in Perl that has gone undetected is that the array
which is created when changing locales, and which tells fc() and qr//i
matching what the folds are in the new locale, is global.

What this means is that any program only has one set of fold definitions
that apply to all threads within it, even if we claim that the locales
are thread-safe on the given platform.  One possibility for this going
undetected so long is that few people use locales much on multi-threaded
systems.  Another possibility is that modern UTF-8 locales all share the
same set of folds, so using another thread's table gives the same result.

It is a simple matter to make the fold array per-thread instead of
per-process, and that solves the problem transparently to other code.

I discovered this stress-testing locale handling under threads.  That
test will be added in a future commit.
  • Loading branch information
khwilliamson committed May 6, 2021
1 parent 1080621 commit b6d8914
Show file tree
Hide file tree
Showing 8 changed files with 6 additions and 90 deletions.
50 changes: 0 additions & 50 deletions ebcdic_tables.h
Expand Up @@ -258,31 +258,6 @@ SOFTWARE.
};
# endif

/* PL_fold_locale: 256-entry table of U8; as initialized here, the entry at
 * index i is i itself, except that these index ranges are swapped pairwise:
 *   0x81-0x89 <=> 0xC1-0xC9,  0x91-0x99 <=> 0xD1-0xD9,  0xA2-0xA9 <=> 0xE2-0xE9
 * (presumably the EBCDIC positions of a-z / A-Z -- TODO confirm).
 * Without DOINIT only the declaration is emitted; with DOINIT the table is
 * defined and initialized.
 * NOTE(review): declared EXT rather than const, presumably because it is
 * overwritten when the locale changes -- confirm against the locale code.
 * NOTE(review): appears to belong to the EBCDIC 1047 section of this file,
 * judging by the comment on the table that follows -- confirm. */
# ifndef DOINIT
EXT U8 PL_fold_locale[256];
# else
EXT U8 PL_fold_locale[256] = {
/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/
/*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
/*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
/*2_*/0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
/*3_*/0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
/*4_*/0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
/*5_*/0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
/*6_*/0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
/*7_*/0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
/*8_*/0x80,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
/*9_*/0x90,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
/*A_*/0xA0,0xA1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
/*B_*/0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
/*C_*/0xC0,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
/*D_*/0xD0,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
/*E_*/0xE0,0xE1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
/*F_*/0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/
};
# endif

/* Index is EBCDIC 1047 code point; value is its other fold-pair equivalent
* (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
* the code point itself */
Expand Down Expand Up @@ -661,31 +636,6 @@ SOFTWARE.
};
# endif

/* PL_fold_locale: 256-entry table of U8; as initialized here, the entry at
 * index i is i itself, except that these index ranges are swapped pairwise:
 *   0x81-0x89 <=> 0xC1-0xC9,  0x91-0x99 <=> 0xD1-0xD9,  0xA2-0xA9 <=> 0xE2-0xE9
 * (presumably the EBCDIC positions of a-z / A-Z -- TODO confirm).
 * Without DOINIT only the declaration is emitted; with DOINIT the table is
 * defined and initialized.
 * NOTE(review): declared EXT rather than const, presumably because it is
 * overwritten when the locale changes -- confirm against the locale code.
 * NOTE(review): appears to belong to the EBCDIC 037 section of this file,
 * judging by the comment on the table that follows -- confirm. */
# ifndef DOINIT
EXT U8 PL_fold_locale[256];
# else
EXT U8 PL_fold_locale[256] = {
/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/
/*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
/*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
/*2_*/0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
/*3_*/0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
/*4_*/0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
/*5_*/0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
/*6_*/0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
/*7_*/0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
/*8_*/0x80,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
/*9_*/0x90,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
/*A_*/0xA0,0xA1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
/*B_*/0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
/*C_*/0xC0,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
/*D_*/0xD0,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
/*E_*/0xE0,0xE1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
/*F_*/0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/
};
# endif

/* Index is EBCDIC 037 code point; value is its other fold-pair equivalent
* (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
* the code point itself */
Expand Down
1 change: 1 addition & 0 deletions embedvar.h
Expand Up @@ -158,6 +158,7 @@
#define PL_fdpid (vTHX->Ifdpid)
#define PL_filemode (vTHX->Ifilemode)
#define PL_firstgv (vTHX->Ifirstgv)
#define PL_fold_locale (vTHX->Ifold_locale)
#define PL_forkprocess (vTHX->Iforkprocess)
#define PL_formtarget (vTHX->Iformtarget)
#define PL_generation (vTHX->Igeneration)
Expand Down
1 change: 0 additions & 1 deletion globvar.sym
Expand Up @@ -17,7 +17,6 @@ PL_EXACT_REQ8_bitmask
PL_extended_utf8_dfa_tab
PL_fold
PL_fold_latin1
PL_fold_locale
PL_hexdigit
PL_inf
PL_interp_size
Expand Down
2 changes: 2 additions & 0 deletions inline.h
Expand Up @@ -2565,6 +2565,8 @@ Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
{
const U8 *a = (const U8 *)s1;
const U8 *b = (const U8 *)s2;
dTHX; /* XXX pTHX_ for this, but would have to make all similar fcns
have the same signature */

PERL_ARGS_ASSERT_FOLDEQ_LOCALE;

Expand Down
2 changes: 2 additions & 0 deletions intrpvar.h
Expand Up @@ -995,6 +995,8 @@ PERLVAR(I, SB_invlist, SV *)
PERLVAR(I, SCX_invlist, SV *)
PERLVAR(I, UpperLatin1, SV *) /* Code points 128 - 255 */

/* Per-interpreter array of 256 U8 values, reached as PL_fold_locale.  It
 * holds the case folds in effect for the current locale, as consulted by
 * fc() and qr//i matching; keeping it in the interpreter structure gives each
 * thread its own set of folds instead of one set shared by the whole process.
 * (Presumably indexed by byte value -- confirm against the users.) */
PERLVARA(I, fold_locale, 256, U8)

/* List of characters that participate in any fold defined by Unicode */
PERLVAR(I, in_some_fold, SV *)

Expand Down
36 changes: 0 additions & 36 deletions perl.h
Expand Up @@ -5205,41 +5205,6 @@ EXTCONST unsigned char PL_fold[] = {
248, 249, 250, 251, 252, 253, 254, 255
};

/* PL_fold_locale: 256-entry table whose initial contents swap the case of the
 * ASCII letters: indices 65-90 hold 'a'-'z', indices 97-122 hold 'A'-'Z', and
 * every other index holds its own value.
 * NOTE(review): presumably overwritten whenever the locale changes, which
 * would be why it cannot be EXTCONST like the neighbouring fold tables --
 * confirm against the locale code. */
EXT unsigned char PL_fold_locale[] = { /* Unfortunately not EXTCONST. */
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
64, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', 91, 92, 93, 94, 95,
96, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135,
136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151,
152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183,
184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199,
200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215,
216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231,
232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 254, 255
};

EXTCONST unsigned char PL_fold_latin1[] = {
/* Full latin1 complement folding, except for three problematic code points:
* Micro sign (181 = 0xB5) and y with diearesis (255 = 0xFF) have their
Expand Down Expand Up @@ -5371,7 +5336,6 @@ EXTCONST unsigned char PL_fold[];
EXTCONST unsigned char PL_fold_latin1[];
EXTCONST unsigned char PL_mod_latin1_uc[];
EXTCONST unsigned char PL_latin1_lc[];
EXT unsigned char PL_fold_locale[]; /* Unfortunately not EXTCONST. */
# endif
#endif

Expand Down
3 changes: 0 additions & 3 deletions regen/ebcdic.pl
Expand Up @@ -384,9 +384,6 @@ END
* is A-Z; all other code points map to themselves */
END
output_table(\@ascii_fold, "PL_fold");

# This table is also the correct folding for the default C locale
output_table(\@ascii_fold, "PL_fold_locale");
}

{
Expand Down
1 change: 1 addition & 0 deletions sv.c
Expand Up @@ -15657,6 +15657,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_utf8locale = proto_perl->Iutf8locale;
PL_in_utf8_CTYPE_locale = proto_perl->Iin_utf8_CTYPE_locale;
PL_in_utf8_turkic_locale = proto_perl->Iin_utf8_turkic_locale;
Copy(proto_perl->Ifold_locale, PL_fold_locale, 256, U8);
#endif

#ifdef USE_LOCALE_COLLATE
Expand Down

0 comments on commit b6d8914

Please sign in to comment.