Skip to content

Commit

Permalink
Merge pull request #286 from PJK/codepoint-fixes
Browse files Browse the repository at this point in the history
Fix UTF-8 codepoint counting in cbor_string_set_handle and parsing
  • Loading branch information
PJK committed Jun 3, 2023
2 parents 9ac3066 + d9f60b3 commit 5fe9656
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 29 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ Next
- Improvements to `cbor_describe`
- [Bytestring data will now be printed as well](https://github.com/PJK/libcbor/pull/281) by [akallabeth](https://github.com/akallabeth)
- [Formatting consistency and clarity improvements](https://github.com/PJK/libcbor/pull/285)
- [Fix `cbor_string_set_handle` not setting the codepoint count](https://github.com/PJK/libcbor/pull/286)
- BREAKING: [`cbor_load` will no longer fail on input strings that are well-formed but not valid UTF-8](https://github.com/PJK/libcbor/pull/286)
- If you were relying on the validation, please check the result using `cbor_string_codepoint_count` instead

0.10.2 (2023-01-31)
---------------------
Expand Down
11 changes: 0 additions & 11 deletions src/cbor/internal/builder_callbacks.c
Original file line number Diff line number Diff line change
Expand Up @@ -256,18 +256,8 @@ void cbor_builder_string_callback(void *context, cbor_data data,
uint64_t length) {
struct _cbor_decoder_context *ctx = context;
CHECK_LENGTH(ctx, length);
struct _cbor_unicode_status unicode_status;
uint64_t codepoint_count =
_cbor_unicode_codepoint_count(data, length, &unicode_status);

if (unicode_status.status != _CBOR_UNICODE_OK) {
ctx->syntax_error = true;
return;
}
CBOR_ASSERT(codepoint_count <= length);

unsigned char *new_handle = _cbor_malloc(length);

if (new_handle == NULL) {
ctx->creation_failed = true;
return;
Expand All @@ -281,7 +271,6 @@ void cbor_builder_string_callback(void *context, cbor_data data,
return;
}
cbor_string_set_handle(new_chunk, new_handle, length);
new_chunk->metadata.string_metadata.codepoint_count = codepoint_count;

// If an indef string is on the stack, extend it (if it were closed, it would
// have been popped). Handle any syntax errors upstream.
Expand Down
6 changes: 3 additions & 3 deletions src/cbor/internal/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
return *state;
}

uint64_t _cbor_unicode_codepoint_count(cbor_data source, uint64_t source_length,
struct _cbor_unicode_status* status) {
size_t _cbor_unicode_codepoint_count(cbor_data source, size_t source_length,
struct _cbor_unicode_status* status) {
*status =
(struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK};
uint32_t codepoint, state = UTF8_ACCEPT, res;
uint64_t pos = 0, count = 0;
size_t pos = 0, count = 0;

for (; pos < source_length; pos++) {
res = _cbor_unicode_decode(&state, &codepoint, source[pos]);
Expand Down
6 changes: 3 additions & 3 deletions src/cbor/internal/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ enum _cbor_unicode_status_error { _CBOR_UNICODE_OK, _CBOR_UNICODE_BADCP };
/** Signals unicode validation error and possibly its location */
struct _cbor_unicode_status {
enum _cbor_unicode_status_error status;
uint64_t location;
size_t location;
};

_CBOR_NODISCARD
uint64_t _cbor_unicode_codepoint_count(cbor_data source, uint64_t source_length,
struct _cbor_unicode_status* status);
size_t _cbor_unicode_codepoint_count(cbor_data source, size_t source_length,
struct _cbor_unicode_status* status);

#ifdef __cplusplus
}
Expand Down
10 changes: 10 additions & 0 deletions src/cbor/strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "strings.h"
#include <string.h>
#include "internal/memory_utils.h"
#include "internal/unicode.h"

cbor_item_t *cbor_new_definite_string(void) {
cbor_item_t *item = _cbor_malloc(sizeof(cbor_item_t));
Expand Down Expand Up @@ -66,6 +67,15 @@ void cbor_string_set_handle(cbor_item_t *item,
CBOR_ASSERT(cbor_string_is_definite(item));
item->data = data;
item->metadata.string_metadata.length = length;
struct _cbor_unicode_status unicode_status;
size_t codepoint_count =
_cbor_unicode_codepoint_count(data, length, &unicode_status);
CBOR_ASSERT(codepoint_count <= length);
if (unicode_status.status == _CBOR_UNICODE_OK) {
item->metadata.string_metadata.codepoint_count = codepoint_count;
} else {
item->metadata.string_metadata.codepoint_count = 0;
}
}

cbor_item_t **cbor_string_chunks_handle(const cbor_item_t *item) {
Expand Down
19 changes: 14 additions & 5 deletions src/cbor/strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ _CBOR_NODISCARD CBOR_EXPORT size_t cbor_string_length(const cbor_item_t *item);

/** The number of codepoints in this string
*
* Might differ from length if there are multibyte ones
* Might differ from `cbor_string_length` if there are multibyte codepoints.
* If the string data is not valid UTF-8, returns 0.
*
* @param item A string
* @return The number of codepoints in this string
Expand Down Expand Up @@ -71,6 +72,8 @@ cbor_string_handle(const cbor_item_t *item);

/** Set the handle to the underlying string
*
* The data is assumed to be a valid UTF-8 string. If the string is non-empty
* and invalid, `cbor_string_codepoint_count` will return 0.
*
* \rst
* .. warning:: Using a pointer to a stack allocated constant is a common
Expand Down Expand Up @@ -144,7 +147,11 @@ _CBOR_NODISCARD CBOR_EXPORT cbor_item_t *cbor_new_indefinite_string(void);

/** Creates a new string and initializes it
*
* The `val` will be copied to a newly allocated block
* The data from `val` will be copied to a newly allocated memory block.
*
* Note that valid UTF-8 strings do not contain null bytes, so this routine is
* correct for all valid inputs. If the input is not guaranteed to be valid,
* use `cbor_build_stringn` instead.
*
* @param val A null-terminated UTF-8 string
* @return Reference to the new string item. The item's reference count is
Expand All @@ -155,10 +162,12 @@ _CBOR_NODISCARD CBOR_EXPORT cbor_item_t *cbor_build_string(const char *val);

/** Creates a new string and initializes it
*
* The `handle` will be copied to a newly allocated block
* The data from `handle` will be copied to a newly allocated memory block.
*
* All @p `length` bytes will be stored in the string, even if there are null
* bytes or invalid UTF-8 sequences.
*
* @param val A UTF-8 string, at least @p `length` long (excluding the null
* byte)
* @param val A UTF-8 string, at least @p `length` bytes long
* @param length Length (in bytes) of the string passed in @p `val`.
* @return Reference to the new string item. The item's reference count is
* initialized to one.
Expand Down
2 changes: 2 additions & 0 deletions test/cbor_serialize_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ static void test_serialize_indefinite_string(void **_CBOR_UNUSED(_state)) {
static void test_serialize_string_no_space(void **_CBOR_UNUSED(_state)) {
cbor_item_t *item = cbor_new_definite_string();
unsigned char *data = malloc(12);
memset(data, 0, 12);
cbor_string_set_handle(item, data, 12);

assert_size_equal(cbor_serialize(item, buffer, 1), 0);
Expand All @@ -254,6 +255,7 @@ static void test_serialize_indefinite_string_no_space(
cbor_item_t *item = cbor_new_indefinite_string();
cbor_item_t *chunk = cbor_new_definite_string();
unsigned char *data = malloc(256);
memset(data, 0, 256);
cbor_string_set_handle(chunk, data, 256);
assert_true(cbor_string_add_chunk(item, cbor_move(chunk)));

Expand Down
11 changes: 4 additions & 7 deletions test/pretty_printer_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,9 @@ static void test_indefinite_bytestring(void **_CBOR_UNUSED(_state)) {
static void test_definite_string(void **_CBOR_UNUSED(_state)) {
char *string = "Hello!";
cbor_item_t *item = cbor_build_string(string);
// TODO: Codepoint metadata is not set by the builder, fix.
assert_describe_result(
item,
"[CBOR_TYPE_STRING] Definite, Length: 6B, Codepoints: 0, Data:\n"
"[CBOR_TYPE_STRING] Definite, Length: 6B, Codepoints: 6, Data:\n"
" Hello!\n");
cbor_decref(&item);
}
Expand All @@ -92,9 +91,9 @@ static void test_indefinite_string(void **_CBOR_UNUSED(_state)) {
assert_describe_result(
item,
"[CBOR_TYPE_STRING] Indefinite, Chunks: 2, Chunk data:\n"
" [CBOR_TYPE_STRING] Definite, Length: 6B, Codepoints: 0, Data:\n"
" [CBOR_TYPE_STRING] Definite, Length: 6B, Codepoints: 6, Data:\n"
" Hello!\n"
" [CBOR_TYPE_STRING] Definite, Length: 6B, Codepoints: 0, Data:\n"
" [CBOR_TYPE_STRING] Definite, Length: 6B, Codepoints: 6, Data:\n"
" Hello!\n");
cbor_decref(&item);
}
Expand All @@ -103,10 +102,9 @@ static void test_multibyte_string(void **_CBOR_UNUSED(_state)) {
// "Štěstíčko" in UTF-8
char *string = "\xc5\xa0t\xc4\x9bst\xc3\xad\xc4\x8dko";
cbor_item_t *item = cbor_build_string(string);
// TODO: Codepoint metadata is not set by the builder, fix.
assert_describe_result(
item,
"[CBOR_TYPE_STRING] Definite, Length: 13B, Codepoints: 0, Data:\n"
"[CBOR_TYPE_STRING] Definite, Length: 13B, Codepoints: 9, Data:\n"
" \xc5\xa0t\xc4\x9bst\xc3\xad\xc4\x8dko\n");
cbor_decref(&item);
}
Expand Down Expand Up @@ -138,7 +136,6 @@ static void test_definite_map(void **_CBOR_UNUSED(_state)) {
assert_true(cbor_map_add(
item, (struct cbor_pair){.key = cbor_move(cbor_build_uint8(1)),
.value = cbor_move(cbor_build_uint8(2))}));
cbor_describe(item, stdout);
assert_describe_result(item,
"[CBOR_TYPE_MAP] Definite, Size: 1, Contents:\n"
" Map entry 0\n"
Expand Down
67 changes: 67 additions & 0 deletions test/string_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,22 @@ static void test_short_indef_string(void **_CBOR_UNUSED(_state)) {
assert_null(string);
}

static void test_invalid_utf(void **_CBOR_UNUSED(_state)) {
/* 0x60 + 1 | 0xC5 (invalid unfinished 2B codepoint) */
unsigned char string_data[] = {0x61, 0xC5};
string = cbor_load(string_data, 2, &res);

assert_non_null(string);
assert_true(cbor_typeof(string) == CBOR_TYPE_STRING);
assert_true(cbor_isa_string(string));
assert_size_equal(cbor_string_length(string), 1);
assert_size_equal(cbor_string_codepoint_count(string), 0);
assert_true(cbor_string_is_definite(string));
assert_true(res.read == 2);

cbor_decref(&string);
}

static void test_inline_creation(void **_CBOR_UNUSED(_state)) {
string = cbor_build_string("Hello!");
assert_memory_equal(cbor_string_handle(string), "Hello!", strlen("Hello!"));
Expand Down Expand Up @@ -275,6 +291,53 @@ static void test_add_chunk_reallocation_overflow(void **_CBOR_UNUSED(_state)) {
cbor_decref(&string);
}

static void test_set_handle(void **_CBOR_UNUSED(_state)) {
string = cbor_new_definite_string();
char *test_string = "Hello";
unsigned char *string_data = malloc(strlen(test_string));
memcpy(string_data, test_string, strlen(test_string));
assert_ptr_not_equal(string_data, NULL);
cbor_string_set_handle(string, string_data, strlen(test_string));

assert_ptr_equal(cbor_string_handle(string), string_data);
assert_size_equal(cbor_string_length(string), 5);
assert_size_equal(cbor_string_codepoint_count(string), 5);

cbor_decref(&string);
}

static void test_set_handle_multibyte_codepoint(void **_CBOR_UNUSED(_state)) {
string = cbor_new_definite_string();
// "Štěstíčko" in UTF-8
char *test_string = "\xc5\xa0t\xc4\x9bst\xc3\xad\xc4\x8dko";
unsigned char *string_data = malloc(strlen(test_string));
memcpy(string_data, test_string, strlen(test_string));
assert_ptr_not_equal(string_data, NULL);
cbor_string_set_handle(string, string_data, strlen(test_string));

assert_ptr_equal(cbor_string_handle(string), string_data);
assert_size_equal(cbor_string_length(string), 13);
assert_size_equal(cbor_string_codepoint_count(string), 9);

cbor_decref(&string);
}

static void test_set_handle_invalid_utf(void **_CBOR_UNUSED(_state)) {
string = cbor_new_definite_string();
// Invalid multi-byte character (missing the second byte).
char *test_string = "Test: \xc5";
unsigned char *string_data = malloc(strlen(test_string));
memcpy(string_data, test_string, strlen(test_string));
assert_ptr_not_equal(string_data, NULL);
cbor_string_set_handle(string, string_data, strlen(test_string));

assert_ptr_equal(cbor_string_handle(string), string_data);
assert_size_equal(cbor_string_length(string), 7);
assert_size_equal(cbor_string_codepoint_count(string), 0);

cbor_decref(&string);
}

int main(void) {
const struct CMUnitTest tests[] = {
cmocka_unit_test(test_empty_string),
Expand All @@ -285,10 +348,14 @@ int main(void) {
cmocka_unit_test(test_int32_string),
cmocka_unit_test(test_int64_string),
cmocka_unit_test(test_short_indef_string),
cmocka_unit_test(test_invalid_utf),
cmocka_unit_test(test_inline_creation),
cmocka_unit_test(test_string_creation),
cmocka_unit_test(test_string_add_chunk),
cmocka_unit_test(test_add_chunk_reallocation_overflow),
cmocka_unit_test(test_set_handle),
cmocka_unit_test(test_set_handle_multibyte_codepoint),
cmocka_unit_test(test_set_handle_invalid_utf),
};
return cmocka_run_group_tests(tests, NULL, NULL);
}

0 comments on commit 5fe9656

Please sign in to comment.