Skip to content

Commit

Permalink
Removes 32-bit limit on substr arguments. The full range of IV and UV…
Browse files Browse the repository at this point in the history
… is available for the pos and len arguments, with safe conversion to STRLEN where it's smaller than an IV.
  • Loading branch information
ikegami authored and nwc10 committed Feb 14, 2010
1 parent 6e3b7bf commit 777f7c5
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 67 deletions.
1 change: 1 addition & 0 deletions embed.fnc
Expand Up @@ -1165,6 +1165,7 @@ ApdR |SV* |sv_newmortal
Apd |SV* |sv_newref |NULLOK SV *const sv
Ap |char* |sv_peek |NULLOK SV* sv
Apd |void |sv_pos_u2b |NULLOK SV *const sv|NN I32 *const offsetp|NULLOK I32 *const lenp
Apd |void |sv_pos_u2b_proper|NULLOK SV *const sv|NN STRLEN *const offsetp|NULLOK STRLEN *const lenp
Apd |void |sv_pos_b2u |NULLOK SV *const sv|NN I32 *const offsetp
Amdb |char* |sv_pvn_force |NN SV* sv|NULLOK STRLEN* lp
Apd |char* |sv_pvutf8n_force|NN SV *const sv|NULLOK STRLEN *const lp
Expand Down
2 changes: 2 additions & 0 deletions embed.h
Expand Up @@ -967,6 +967,7 @@
#define sv_newref Perl_sv_newref
#define sv_peek Perl_sv_peek
#define sv_pos_u2b Perl_sv_pos_u2b
#define sv_pos_u2b_proper Perl_sv_pos_u2b_proper
#define sv_pos_b2u Perl_sv_pos_b2u
#define sv_pvutf8n_force Perl_sv_pvutf8n_force
#define sv_pvbyten_force Perl_sv_pvbyten_force
Expand Down Expand Up @@ -3371,6 +3372,7 @@
#define sv_newref(a) Perl_sv_newref(aTHX_ a)
#define sv_peek(a) Perl_sv_peek(aTHX_ a)
#define sv_pos_u2b(a,b,c) Perl_sv_pos_u2b(aTHX_ a,b,c)
#define sv_pos_u2b_proper(a,b,c) Perl_sv_pos_u2b_proper(aTHX_ a,b,c)
#define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b)
#define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b)
#define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b)
Expand Down
1 change: 1 addition & 0 deletions global.sym
Expand Up @@ -567,6 +567,7 @@ Perl_sv_newmortal
Perl_sv_newref
Perl_sv_peek
Perl_sv_pos_u2b
Perl_sv_pos_u2b_proper
Perl_sv_pos_b2u
Perl_sv_pvn_force
Perl_sv_pvutf8n_force
Expand Down
19 changes: 9 additions & 10 deletions mg.c
Expand Up @@ -2008,17 +2008,17 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
SV * const lsv = LvTARG(sv);
const char * const tmps = SvPV_const(lsv,len);
I32 offs = LvTARGOFF(sv);
I32 rem = LvTARGLEN(sv);
STRLEN offs = LvTARGOFF(sv);
STRLEN rem = LvTARGLEN(sv);

PERL_ARGS_ASSERT_MAGIC_GETSUBSTR;
PERL_UNUSED_ARG(mg);

if (SvUTF8(lsv))
sv_pos_u2b(lsv, &offs, &rem);
if (offs > (I32)len)
sv_pos_u2b_proper(lsv, &offs, &rem);
if (offs > len)
offs = len;
if (rem + offs > (I32)len)
if (rem > len - offs)
rem = len - offs;
sv_setpvn(sv, tmps + offs, (STRLEN)rem);
if (SvUTF8(lsv))
Expand All @@ -2033,22 +2033,22 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
const char * const tmps = SvPV_const(sv, len);
SV * const lsv = LvTARG(sv);
I32 lvoff = LvTARGOFF(sv);
I32 lvlen = LvTARGLEN(sv);
STRLEN lvoff = LvTARGOFF(sv);
STRLEN lvlen = LvTARGLEN(sv);

PERL_ARGS_ASSERT_MAGIC_SETSUBSTR;
PERL_UNUSED_ARG(mg);

if (DO_UTF8(sv)) {
sv_utf8_upgrade(lsv);
sv_pos_u2b(lsv, &lvoff, &lvlen);
sv_pos_u2b_proper(lsv, &lvoff, &lvlen);
sv_insert(lsv, lvoff, lvlen, tmps, len);
LvTARGLEN(sv) = sv_len_utf8(sv);
SvUTF8_on(lsv);
}
else if (lsv && SvUTF8(lsv)) {
const char *utf8;
sv_pos_u2b(lsv, &lvoff, &lvlen);
sv_pos_u2b_proper(lsv, &lvoff, &lvlen);
LvTARGLEN(sv) = len;
utf8 = (char*)bytes_to_utf8((U8*)tmps, &len);
sv_insert(lsv, lvoff, lvlen, utf8, len);
Expand All @@ -2059,7 +2059,6 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
LvTARGLEN(sv) = len;
}


return 0;
}

Expand Down
143 changes: 93 additions & 50 deletions pp.c
Expand Up @@ -3079,15 +3079,19 @@ PP(pp_substr)
{
dVAR; dSP; dTARGET;
SV *sv;
I32 len = 0;
STRLEN curlen;
STRLEN utf8_curlen;
I32 pos;
I32 rem;
I32 fail;
SV * pos_sv;
IV pos1_iv;
int pos1_is_uv;
IV pos2_iv;
int pos2_is_uv;
SV * len_sv;
IV len_iv = 0;
int len_is_uv = 1;
const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET;
const char *tmps;
const I32 arybase = CopARYBASE_get(PL_curcop);
const IV arybase = CopARYBASE_get(PL_curcop);
SV *repl_sv = NULL;
const char *repl = NULL;
STRLEN repl_len;
Expand All @@ -3103,9 +3107,13 @@ PP(pp_substr)
repl = SvPV_const(repl_sv, repl_len);
repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv);
}
len = POPi;
len_sv = POPs;
len_iv = SvIV(len_sv);
len_is_uv = SvIOK_UV(len_sv);
}
pos = POPi;
pos_sv = POPs;
pos1_iv = SvIV(pos_sv);
pos1_is_uv = SvIOK_UV(pos_sv);
sv = POPs;
PUTBACK;
if (repl_sv) {
Expand All @@ -3127,51 +3135,80 @@ PP(pp_substr)
else
utf8_curlen = 0;

if (pos >= arybase) {
pos -= arybase;
rem = curlen-pos;
fail = rem;
if (num_args > 2) {
if (len < 0) {
rem += len;
if (rem < 0)
rem = 0;
}
else if (rem > len)
rem = len;
if ( (pos1_is_uv && arybase < 0) || (pos1_iv >= arybase) ) { /* pos >= $[ */
UV pos1_uv = pos1_iv-arybase;
/* Overflow can occur when $[ < 0 */
if (arybase < 0 && pos1_uv < (UV)pos1_iv)
goto BOUND_FAIL;
pos1_iv = pos1_uv;
pos1_is_uv = 1;
}
else if (pos1_is_uv ? (UV)pos1_iv > 0 : pos1_iv > 0) {
goto BOUND_FAIL; /* $[=3; substr($_,2,...) */
}
else { /* pos < $[ */
if (pos1_iv == 0) { /* $[=1; substr($_,0,...) */
pos1_iv = curlen;
pos1_is_uv = 1;
} else {
if (curlen) {
pos1_is_uv = curlen-1 > ~(UV)pos1_iv;
pos1_iv += curlen;
}
}
}
else {
pos += curlen;
if (num_args < 3)
rem = curlen;
else if (len >= 0) {
rem = pos+len;
if (rem > (I32)curlen)
rem = curlen;
if (pos1_is_uv || pos1_iv > 0) {
if ((UV)pos1_iv > curlen)
goto BOUND_FAIL;
}

if (num_args > 2) {
if (!len_is_uv && len_iv < 0) {
pos2_iv = curlen + len_iv;
if (curlen)
pos2_is_uv = curlen-1 > ~(UV)len_iv;
else
pos2_is_uv = 0;
} else { /* len_iv >= 0 */
if (!pos1_is_uv && pos1_iv < 0) {
pos2_iv = pos1_iv + len_iv;
pos2_is_uv = (UV)len_iv > (UV)IV_MAX;
} else {
if ((UV)len_iv > curlen-(UV)pos1_iv)
pos2_iv = curlen;
else
pos2_iv = pos1_iv+len_iv;
pos2_is_uv = 1;
}
}
else {
rem = curlen+len;
if (rem < pos)
rem = pos;
}
if (pos < 0)
pos = 0;
fail = rem;
rem -= pos;
}
if (fail < 0) {
if (lvalue || repl)
Perl_croak(aTHX_ "substr outside of string");
Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
RETPUSHUNDEF;
}
else {
const I32 upos = pos;
const I32 urem = rem;
pos2_iv = curlen;
pos2_is_uv = 1;
}

if (!pos2_is_uv && pos2_iv < 0) {
if (!pos1_is_uv && pos1_iv < 0)
goto BOUND_FAIL;
pos2_iv = 0;
}
else if (!pos1_is_uv && pos1_iv < 0)
pos1_iv = 0;

if ((UV)pos2_iv < (UV)pos1_iv)
pos2_iv = pos1_iv;
if ((UV)pos2_iv > curlen)
pos2_iv = curlen;

{
/* pos1_iv and pos2_iv both in 0..curlen, so the cast is safe */
const STRLEN pos = (STRLEN)( (UV)pos1_iv );
const STRLEN len = (STRLEN)( (UV)pos2_iv - (UV)pos1_iv );
STRLEN byte_pos = pos;
STRLEN byte_len = len;
if (utf8_curlen)
sv_pos_u2b(sv, (I32 *)&pos, (I32 *)&rem);
tmps += pos;
sv_pos_u2b_proper(sv, &byte_pos, &byte_len);
tmps += byte_pos;
/* we either return a PV or an LV. If the TARG hasn't been used
* before, or is of that type, reuse it; otherwise use a mortal
* instead. Note that LVs can have an extended lifetime, so also
Expand All @@ -3185,7 +3222,7 @@ PP(pp_substr)
}
}

sv_setpvn(TARG, tmps, rem);
sv_setpvn(TARG, tmps, byte_len);
#ifdef USE_LOCALE_COLLATE
sv_unmagic(TARG, PERL_MAGIC_collxfrm);
#endif
Expand All @@ -3202,7 +3239,7 @@ PP(pp_substr)
}
if (!SvOK(sv))
sv_setpvs(sv, "");
sv_insert_flags(sv, pos, rem, repl, repl_len, 0);
sv_insert_flags(sv, byte_pos, byte_len, repl, repl_len, 0);
if (repl_is_utf8)
SvUTF8_on(sv);
SvREFCNT_dec(repl_sv_copy);
Expand Down Expand Up @@ -3232,13 +3269,19 @@ PP(pp_substr)
SvREFCNT_dec(LvTARG(TARG));
LvTARG(TARG) = SvREFCNT_inc_simple(sv);
}
LvTARGOFF(TARG) = upos;
LvTARGLEN(TARG) = urem;
LvTARGOFF(TARG) = pos;
LvTARGLEN(TARG) = len;
}
}
SPAGAIN;
PUSHs(TARG); /* avoid SvSETMAGIC here */
RETURN;

BOUND_FAIL:
if (lvalue || repl)
Perl_croak(aTHX_ "substr outside of string");
Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
RETPUSHUNDEF;
}

PP(pp_vec)
Expand Down
5 changes: 5 additions & 0 deletions proto.h
Expand Up @@ -3374,6 +3374,11 @@ PERL_CALLCONV void Perl_sv_pos_u2b(pTHX_ SV *const sv, I32 *const offsetp, I32 *
#define PERL_ARGS_ASSERT_SV_POS_U2B \
assert(offsetp)

PERL_CALLCONV void Perl_sv_pos_u2b_proper(pTHX_ SV *const sv, STRLEN *const offsetp, STRLEN *const lenp)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_SV_POS_U2B_PROPER \
assert(offsetp)

PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_SV_POS_B2U \
Expand Down
47 changes: 41 additions & 6 deletions sv.c
Expand Up @@ -6240,7 +6240,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start


/*
=for apidoc sv_pos_u2b
=for apidoc sv_pos_u2b_proper
Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
Expand All @@ -6252,14 +6252,14 @@ type coercion.
*/

/*
* sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
* sv_pos_u2b_proper() uses, like sv_pos_b2u(), the mg_ptr of the potential
* PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
* byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
*
*/

void
Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
Perl_sv_pos_u2b_proper(pTHX_ register SV *const sv, STRLEN *const offsetp, STRLEN *const lenp)
{
const U8 *start;
STRLEN len;
Expand All @@ -6271,17 +6271,17 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp

start = (U8*)SvPV_const(sv, len);
if (len) {
STRLEN uoffset = (STRLEN) *offsetp;
STRLEN uoffset = *offsetp;
const U8 * const send = start + len;
MAGIC *mg = NULL;
const STRLEN boffset = sv_pos_u2b_cached(sv, &mg, start, send,
uoffset, 0, 0);

*offsetp = (I32) boffset;
*offsetp = boffset;

if (lenp) {
/* Convert the relative offset to absolute. */
const STRLEN uoffset2 = uoffset + (STRLEN) *lenp;
const STRLEN uoffset2 = uoffset + *lenp;
const STRLEN boffset2
= sv_pos_u2b_cached(sv, &mg, start, send, uoffset2,
uoffset, boffset) - boffset;
Expand All @@ -6298,6 +6298,41 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp
return;
}

/*
=for apidoc sv_pos_u2b
Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
lenp is non-zero, it does the same to lenp, but this time starting from
the offset, rather than from the start of the string. Handles magic and
type coercion.
=cut
*/

/*
* sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
* PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
* byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
*
*/

/* This function is subject to size and sign problems */

void
Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
{
STRLEN uoffset = (STRLEN)*offsetp;
if (lenp) {
STRLEN ulen = (STRLEN)*lenp;
sv_pos_u2b_proper(sv, &uoffset, &ulen);
*lenp = (I32)ulen;
} else {
sv_pos_u2b_proper(sv, &uoffset, NULL);
}
*offsetp = (I32)uoffset;
}

/* Create and update the UTF8 magic offset cache, with the proffered utf8/
byte length pairing. The (byte) length of the total SV is passed in too,
as blen, because for some (more esoteric) SVs, the call to SvPV_const()
Expand Down

0 comments on commit 777f7c5

Please sign in to comment.