Skip to content
Permalink
Browse files Browse the repository at this point in the history
Fix issue #646 (#648)
* Fix issue #646 and some edge cases with wide regexps using \b and \B

* Rename function IS_WORD_CHAR to _yr_re_is_word_char
  • Loading branch information
plusvic committed Apr 27, 2017
1 parent f12262c commit 83d7998
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 45 deletions.
1 change: 1 addition & 0 deletions libyara/exec.c
Expand Up @@ -850,6 +850,7 @@ int yr_execute_code(
(uint8_t*) r2.re->code,
(uint8_t*) r1.ss->c_string,
r1.ss->length,
0,
r2.re->flags | RE_FLAGS_SCAN,
NULL,
NULL) >= 0;
Expand Down
15 changes: 4 additions & 11 deletions libyara/include/yara/re.h
Expand Up @@ -94,7 +94,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RE_FLAGS_NO_CASE 0x20
#define RE_FLAGS_SCAN 0x40
#define RE_FLAGS_DOT_ALL 0x80
#define RE_FLAGS_NOT_AT_START 0x100
#define RE_FLAGS_GREEDY 0x400
#define RE_FLAGS_UNGREEDY 0x800

Expand All @@ -107,14 +106,6 @@ typedef struct RE_ERROR RE_ERROR;
typedef uint8_t RE_SPLIT_ID_TYPE;


#define CHAR_IN_CLASS(chr, cls) \
((cls)[(chr) / 8] & 1 << ((chr) % 8))


#define IS_WORD_CHAR(chr) \
(isalnum(chr) || (chr) == '_')


struct RE_NODE
{
int type;
Expand Down Expand Up @@ -213,7 +204,8 @@ void yr_re_node_destroy(
int yr_re_exec(
uint8_t* re_code,
uint8_t* input,
size_t input_size,
size_t input_forwards_size,
size_t input_backwards_size,
int flags,
RE_MATCH_CALLBACK_FUNC callback,
void* callback_args);
Expand All @@ -222,7 +214,8 @@ int yr_re_exec(
int yr_re_fast_exec(
uint8_t* re_code,
uint8_t* input,
size_t input_size,
size_t input_forwards_size,
size_t input_backwards_size,
int flags,
RE_MATCH_CALLBACK_FUNC callback,
void* callback_args);
Expand Down
110 changes: 79 additions & 31 deletions libyara/re.c
Expand Up @@ -140,6 +140,24 @@ typedef struct _RE_THREAD_STORAGE
YR_THREAD_STORAGE_KEY thread_storage_key = 0;


// Tests whether the bit for character value "chr" is set in the bitmap "cls".
// Each byte of "cls" holds the bits for eight consecutive character values;
// the expression is non-zero when the bit is set and zero otherwise.
#define CHAR_IN_CLASS(chr, cls) \
  ((cls)[(chr) / 8] & (1 << ((chr) % 8)))


//
// _yr_re_is_word_char
//
// Tells whether the character located at "input" is a word character, that
// is, an alphanumeric character or an underscore.
//
// Args:
//    uint8_t* input       - Pointer to the first byte of the character.
//    int character_size   - Size of the character in bytes. When it is 2 the
//                           byte following "input" must be zero for the
//                           character to count as a word character.
//
// Returns:
//    1 if the character is a word character, 0 otherwise.
//
int _yr_re_is_word_char(
    uint8_t* input,
    int character_size)
{
  uint8_t first_byte = input[0];

  // The first byte alone decides the negative case; the second byte is
  // only inspected once the first one has qualified.
  if (!isalnum(first_byte) && first_byte != '_')
    return 0;

  if (character_size == 2 && input[1] != 0)
    return 0;

  return 1;
}



//
// yr_re_initialize
//
Expand Down Expand Up @@ -360,6 +378,7 @@ int yr_re_match(
re->code,
(uint8_t*) target,
strlen(target),
0,
re->flags | RE_FLAGS_SCAN,
NULL,
NULL);
Expand Down Expand Up @@ -1799,18 +1818,30 @@ int _yr_re_fiber_sync(
//
// yr_re_exec
//
// Executes a regular expression
// Executes a regular expression. The specified regular expression will try to
// match the data starting at the address specified by "input". The "input"
// pointer can point to any address inside a memory buffer. Arguments
// "input_forwards_size" and "input_backwards_size" indicate how many bytes
// are accessible starting at "input" and going forwards and backwards
// respectively.
//
// <--- input_backwards_size -->|<----------- input_forwards_size -------->
// |-------- memory buffer -----------------------------------------------|
// ^
// input
//
// Args:
// uint8_t* re_code - Regexp code to be executed
// uint8_t* input - Pointer to input data
// size_t input_size - Input data size
// size_t input_forwards_size - Number of accessible bytes starting at
// "input" and going forwards.
// size_t input_backwards_size - Number of accessible bytes starting at
// "input" and going backwards
// int flags - Flags:
// RE_FLAGS_SCAN
// RE_FLAGS_BACKWARDS
// RE_FLAGS_EXHAUSTIVE
// RE_FLAGS_WIDE
// RE_FLAGS_NOT_AT_START
// RE_FLAGS_NO_CASE
// RE_FLAGS_DOT_ALL
// RE_MATCH_CALLBACK_FUNC callback - Callback function
Expand All @@ -1825,10 +1856,12 @@ int _yr_re_fiber_sync(
// -4 Too many fibers
// -5 Unknown fatal error


int yr_re_exec(
uint8_t* re_code,
uint8_t* input_data,
size_t input_size,
size_t input_forwards_size,
size_t input_backwards_size,
int flags,
RE_MATCH_CALLBACK_FUNC callback,
void* callback_args)
Expand Down Expand Up @@ -1858,18 +1891,23 @@ int yr_re_exec(
#define ACTION_KILL 2
#define ACTION_KILL_TAIL 3

#define prolog if (bytes_matched >= max_bytes_matched) \
#define prolog { \
if ((bytes_matched >= max_bytes_matched) || \
(character_size == 2 && *(input + 1) != 0)) \
{ \
action = ACTION_KILL; \
break; \
}
} \
}

#define fail_if_error(e) switch (e) { \
#define fail_if_error(e) { \
switch (e) { \
case ERROR_INSUFFICIENT_MEMORY: \
return -2; \
case ERROR_TOO_MANY_RE_FIBERS: \
return -4; \
}
} \
}

if (_yr_re_alloc_storage(&storage) != ERROR_SUCCESS)
return -2;
Expand All @@ -1884,14 +1922,17 @@ int yr_re_exec(

if (flags & RE_FLAGS_BACKWARDS)
{
max_bytes_matched = (int) yr_min(input_backwards_size, RE_SCAN_LIMIT);
input -= character_size;
input_incr = -input_incr;
}

max_bytes_matched = (int) yr_min(input_size, RE_SCAN_LIMIT);
else
{
max_bytes_matched = (int) yr_min(input_forwards_size, RE_SCAN_LIMIT);
}

// Round down max_bytes_matched to a multiple of character_size, this way if
// character_size is 2 and input_size is odd we are ignoring the
// character_size is 2 and max_bytes_matched is odd we are ignoring the
// extra byte which can't match anyways.

max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
Expand Down Expand Up @@ -1973,14 +2014,14 @@ int yr_re_exec(

case RE_OPCODE_WORD_CHAR:
prolog;
match = IS_WORD_CHAR(*input);
match = _yr_re_is_word_char(input, character_size);
action = match ? ACTION_NONE : ACTION_KILL;
fiber->ip += 1;
break;

case RE_OPCODE_NON_WORD_CHAR:
prolog;
match = !IS_WORD_CHAR(*input);
match = !_yr_re_is_word_char(input, character_size);
action = match ? ACTION_NONE : ACTION_KILL;
fiber->ip += 1;
break;
Expand Down Expand Up @@ -2028,16 +2069,25 @@ int yr_re_exec(
case RE_OPCODE_WORD_BOUNDARY:
case RE_OPCODE_NON_WORD_BOUNDARY:

if (bytes_matched == 0 &&
!(flags & RE_FLAGS_NOT_AT_START) &&
!(flags & RE_FLAGS_BACKWARDS))
if (bytes_matched == 0 && input_backwards_size < character_size)
{
match = TRUE;
}
else if (bytes_matched >= max_bytes_matched)
{
match = TRUE;
else if (IS_WORD_CHAR(*(input - input_incr)) != IS_WORD_CHAR(*input))
match = TRUE;
}
else
match = FALSE;
{
assert(input < input_data + input_forwards_size);
assert(input >= input_data - input_backwards_size);

assert(input - input_incr < input_data + input_forwards_size);
assert(input - input_incr >= input_data - input_backwards_size);

match = _yr_re_is_word_char(input, character_size) != \
_yr_re_is_word_char(input - input_incr, character_size);
}

if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
match = !match;
Expand All @@ -2048,16 +2098,16 @@ int yr_re_exec(

case RE_OPCODE_MATCH_AT_START:
if (flags & RE_FLAGS_BACKWARDS)
kill = input_size > (size_t) bytes_matched;
kill = input_backwards_size > (size_t) bytes_matched;
else
kill = (flags & RE_FLAGS_NOT_AT_START) || (bytes_matched != 0);
kill = input_backwards_size > 0 || (bytes_matched != 0);
action = kill ? ACTION_KILL : ACTION_CONTINUE;
fiber->ip += 1;
break;

case RE_OPCODE_MATCH_AT_END:
kill = flags & RE_FLAGS_BACKWARDS ||
input_size > (size_t) bytes_matched;
input_forwards_size > (size_t) bytes_matched;
action = kill ? ACTION_KILL : ACTION_CONTINUE;
fiber->ip += 1;
break;
Expand Down Expand Up @@ -2134,13 +2184,6 @@ int yr_re_exec(
}
}

if (flags & RE_FLAGS_WIDE &&
bytes_matched < max_bytes_matched &&
*(input + 1) != 0)
{
_yr_re_fiber_kill_all(&fibers, &storage->fiber_pool);
}

input += input_incr;
bytes_matched += character_size;

Expand All @@ -2164,7 +2207,8 @@ int yr_re_exec(
int yr_re_fast_exec(
uint8_t* code,
uint8_t* input_data,
size_t input_size,
size_t input_forwards_size,
size_t input_backwards_size,
int flags,
RE_MATCH_CALLBACK_FUNC callback,
void* callback_args)
Expand All @@ -2187,7 +2231,11 @@ int yr_re_fast_exec(
int input_incr;
int sp = 0;
int bytes_matched;
int max_bytes_matched = input_size;
int max_bytes_matched;

max_bytes_matched = flags & RE_FLAGS_BACKWARDS ?
input_backwards_size :
input_forwards_size;

input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;

Expand Down
10 changes: 7 additions & 3 deletions libyara/scan.c
Expand Up @@ -528,7 +528,8 @@ int _yr_scan_match_callback(
typedef int (*RE_EXEC_FUNC)(
uint8_t* code,
uint8_t* input,
size_t input_size,
size_t input_forwards_size,
size_t input_backwards_size,
int flags,
RE_MATCH_CALLBACK_FUNC callback,
void* callback_args);
Expand Down Expand Up @@ -569,7 +570,8 @@ int _yr_scan_verify_re_match(
ac_match->forward_code,
data + offset,
data_size - offset,
offset > 0 ? flags | RE_FLAGS_NOT_AT_START : flags,
offset,
flags,
NULL,
NULL);
}
Expand All @@ -581,7 +583,8 @@ int _yr_scan_verify_re_match(
ac_match->forward_code,
data + offset,
data_size - offset,
offset > 0 ? flags | RE_FLAGS_NOT_AT_START : flags,
offset,
flags,
NULL,
NULL);
}
Expand Down Expand Up @@ -616,6 +619,7 @@ int _yr_scan_verify_re_match(
backward_matches = exec(
ac_match->backward_code,
data + offset,
data_size - offset,
offset,
flags | RE_FLAGS_BACKWARDS | RE_FLAGS_EXHAUSTIVE,
_yr_scan_match_callback,
Expand Down
44 changes: 44 additions & 0 deletions tests/test-rules.c
Expand Up @@ -901,6 +901,50 @@ void test_re()
"rule test { strings: $a = /a.{1,2}b/ wide condition: !a == 8 }",
"a\0x\0x\0b\0");

assert_true_rule_blob(
"rule test { strings: $a = /\\babc/ wide condition: $a }",
"a\0b\0c\0");

assert_true_rule_blob(
"rule test { strings: $a = /\\babc/ wide condition: $a }",
"\0a\0b\0c\0");

assert_true_rule_blob(
"rule test { strings: $a = /\\babc/ wide condition: $a }",
"\ta\0b\0c\0");

assert_false_rule_blob(
"rule test { strings: $a = /\\babc/ wide condition: $a }",
"x\0a\0b\0c\0");

assert_true_rule_blob(
"rule test { strings: $a = /\\babc/ wide condition: $a }",
"x\ta\0b\0c\0");

assert_true_rule_blob(
"rule test { strings: $a = /abc\\b/ wide condition: $a }",
"a\0b\0c\0");

assert_true_rule_blob(
"rule test { strings: $a = /abc\\b/ wide condition: $a }",
"a\0b\0c\0\0");

assert_true_rule_blob(
"rule test { strings: $a = /abc\\b/ wide condition: $a }",
"a\0b\0c\0\t");

assert_false_rule_blob(
"rule test { strings: $a = /abc\\b/ wide condition: $a }",
"a\0b\0c\0x\0");

assert_true_rule_blob(
"rule test { strings: $a = /abc\\b/ wide condition: $a }",
"a\0b\0c\0b\t");

assert_false_rule_blob(
"rule test { strings: $a = /\\b/ wide condition: $a }",
"abc");

assert_regexp_syntax_error(")");
assert_true_regexp("abc", "abc", "abc");
assert_false_regexp("abc", "xbc");
Expand Down

0 comments on commit 83d7998

Please sign in to comment.