From 5b6bc55bd998cd4c6713efaf34e1cde620a107cd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Jul 2024 04:38:23 -0400 Subject: [PATCH] Additional tests for parquet reader utf8 validation (#6023) --- parquet/src/arrow/arrow_reader/mod.rs | 202 +++++++++++++++++++------- 1 file changed, 146 insertions(+), 56 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index cc369cec0ea..71bb0b5e2ec 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2425,89 +2425,179 @@ mod tests { fn test_invalid_utf8_string_array_inner() { let cases = [ - ( - invalid_utf8_first_char::(), - "Parquet argument error: Parquet error: encountered non UTF-8 data", - ), - ( - invalid_utf8_later_char::(), - "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6", - ), + invalid_utf8_first_char::(), + invalid_utf8_first_char_long_strings::(), + invalid_utf8_later_char::(), + invalid_utf8_later_char_long_strings::(), + invalid_utf8_later_char_really_long_strings::(), + invalid_utf8_later_char_really_long_strings2::(), ]; - for (array, expected_error) in cases { - // data is not valid utf8 we can not construct a correct StringArray - // safely, so purposely create an invalid StringArray - let array = unsafe { - GenericStringArray::::new_unchecked( - array.offsets().clone(), - array.values().clone(), - array.nulls().cloned(), - ) - }; - let data_type = array.data_type().clone(); - let data = write_to_parquet(Arc::new(array)); - let err = read_from_parquet(data).unwrap_err(); - assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}") + for array in &cases { + for encoding in STRING_ENCODINGS { + // data is not valid utf8 we can not construct a correct StringArray + // safely, so purposely create an invalid StringArray + let array = unsafe { + GenericStringArray::::new_unchecked( + array.offsets().clone(), + array.values().clone(), + array.nulls().cloned(), + ) + }; + let data_type = array.data_type().clone(); + let data = write_to_parquet_with_encoding(Arc::new(array), *encoding); + let err = read_from_parquet(data).unwrap_err(); + let expected_err = + "Parquet argument error: Parquet error: encountered non UTF-8 data"; + assert!( + err.to_string().contains(expected_err), + "data type: {data_type:?}, expected: {expected_err}, got: {err}" + ); + } } } #[test] fn test_invalid_utf8_string_view_array() { let cases = [ - ( - invalid_utf8_first_char::(), - "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11", - ), - ( - invalid_utf8_later_char::(), - "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14", - ), + invalid_utf8_first_char::(), + invalid_utf8_first_char_long_strings::(), + invalid_utf8_later_char::(), + invalid_utf8_later_char_long_strings::(), + invalid_utf8_later_char_really_long_strings::(), + invalid_utf8_later_char_really_long_strings2::(), ]; - for (array, expected_error) in cases { - let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap(); - let array = array.as_binary_view(); - - // data is not valid utf8 we can not construct a correct StringArray - // safely, so purposely create an invalid StringArray - let array = unsafe { - StringViewArray::new_unchecked( - array.views().clone(), - array.data_buffers().to_vec(), - array.nulls().cloned(), - ) - }; - let data_type = array.data_type().clone(); - let data = write_to_parquet(Arc::new(array)); - let err = read_from_parquet(data).unwrap_err(); - assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}") + + for encoding in STRING_ENCODINGS { + for array in &cases { + let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap(); + let array = array.as_binary_view(); + + // data is not valid utf8 we can not construct a correct StringArray + // safely, so purposely create an invalid StringViewArray + let array = unsafe { + StringViewArray::new_unchecked( + array.views().clone(), + array.data_buffers().to_vec(), + array.nulls().cloned(), + ) + }; + + let data_type = array.data_type().clone(); + let data = write_to_parquet_with_encoding(Arc::new(array), *encoding); + let err = read_from_parquet(data).unwrap_err(); + let expected_err = + "Parquet argument error: Parquet error: encountered non UTF-8 data"; + assert!( + err.to_string().contains(expected_err), + "data type: {data_type:?}, expected: {expected_err}, got: {err}" + ); + } } } + /// Encodings suitable for string data + const STRING_ENCODINGS: &[Option] = &[ + None, + Some(Encoding::PLAIN), + Some(Encoding::DELTA_LENGTH_BYTE_ARRAY), + Some(Encoding::DELTA_BYTE_ARRAY), + ]; + + /// Invalid Utf-8 sequence in the first character + /// + const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20]; + + /// Invalid Utf=8 sequence in NOT the first character + /// + const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20]; + /// returns a BinaryArray with invalid UTF8 data in the first character fn invalid_utf8_first_char() -> GenericBinaryArray { - // invalid sequence in the first character - // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string let valid: &[u8] = b" "; - let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20]; + let invalid = INVALID_UTF8_FIRST_CHAR; GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(invalid)]) } + /// Returns a BinaryArray with invalid UTF8 data in the first character of a + /// string larger than 12 bytes which is handled specially when reading + /// `ByteViewArray`s + fn invalid_utf8_first_char_long_strings() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut invalid = vec![]; + invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR); + GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(&invalid)]) + } + /// returns a BinaryArray with invalid UTF8 data in a character other than /// the first (this is checked in a special codepath) fn invalid_utf8_later_char() -> GenericBinaryArray { - // invalid sequence in NOT the first character - // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string let valid: &[u8] = b" "; - let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20]; + let invalid: &[u8] = INVALID_UTF8_LATER_CHAR; GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(invalid)]) } - // writes the array into a single column parquet file - fn write_to_parquet(array: ArrayRef) -> Vec { + /// returns a BinaryArray with invalid UTF8 data in a character other than + /// the first in a string larger than 12 bytes which is handled specially + /// when reading `ByteViewArray`s (this is checked in a special codepath) + fn invalid_utf8_later_char_long_strings() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut invalid = vec![]; + invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR); + GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(&invalid)]) + } + + /// returns a BinaryArray with invalid UTF8 data in a character other than + /// the first in a string larger than 128 bytes which is handled specially + /// when reading `ByteViewArray`s (this is checked in a special codepath) + fn invalid_utf8_later_char_really_long_strings() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut invalid = vec![]; + for _ in 0..10 { + // each instance is 38 bytes + invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + } + invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR); + GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(&invalid)]) + } + + /// returns a BinaryArray with small invalid UTF8 data followed by a large + /// invalid UTF8 data in a character other than the first in a string larger + fn invalid_utf8_later_char_really_long_strings2() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut valid_long = vec![]; + for _ in 0..10 { + // each instance is 38 bytes + valid_long.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + } + let invalid = INVALID_UTF8_LATER_CHAR; + GenericBinaryArray::::from_iter(vec![ + None, + Some(valid), + Some(invalid), + None, + Some(&valid_long), + Some(valid), + ]) + } + + /// writes the array into a single column parquet file with the specified + /// encoding. + /// + /// If no encoding is specified, use default (dictionary) encoding + fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Option) -> Vec { let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap(); let mut data = vec![]; let schema = batch.schema(); - let props = None; + let props = encoding.map(|encoding| { + WriterProperties::builder() + // must disable dictionary encoding to actually use encoding + .set_dictionary_enabled(false) + .set_encoding(encoding) + .build() + }); + { let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap(); writer.write(&batch).unwrap();