Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/pdf-document/src/encryption.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ mod tests {
#[test]
fn test_parse_full_encrypt_dictionary() {
let dict = make_dictionary(vec![
("Filter", ObjectVariant::Name("Standard".to_string())),
("Filter", ObjectVariant::Name(b"Standard".to_vec())),
("V", ObjectVariant::Integer(4)),
("R", ObjectVariant::Integer(4)),
("O", ObjectVariant::HexString(vec![0u8; 32])),
Expand Down
4 changes: 3 additions & 1 deletion crates/pdf-object-collection/src/object_collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,9 @@ impl ObjectCollection {
ObjectVariant::LiteralString(s) => {
json!({ "type": "LiteralString", "value": String::from_utf8_lossy(s).as_ref() })
}
ObjectVariant::Name(name) => json!({ "type": "Name", "value": name }),
ObjectVariant::Name(name) => {
json!({ "type": "Name", "value": String::from_utf8_lossy(name).as_ref() })
}
ObjectVariant::Integer(i) => JsonValue::Number((*i).into()),
ObjectVariant::Real(r) => serde_json::Number::from_f64(*r)
.map(JsonValue::Number)
Expand Down
12 changes: 6 additions & 6 deletions crates/pdf-object/src/object_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ pub enum ObjectVariant {
Array(Vec<ObjectVariant>),
/// A literal string (enclosed in parentheses in PDF syntax), stored as raw bytes.
LiteralString(Vec<u8>),
/// A name object (prefixed with a slash in PDF syntax).
Name(String),
/// A name object (prefixed with a slash in PDF syntax), stored as raw bytes.
Name(Vec<u8>),
/// An integer number.
Integer(i64),
/// A real (floating point) number.
Expand Down Expand Up @@ -167,8 +167,9 @@ impl ObjectVariant {
let s = String::from_utf8_lossy(s);
Ok(s)
}
ObjectVariant::LiteralString(s) => Ok(String::from_utf8_lossy(s)),
ObjectVariant::Name(s) => Ok(Cow::Borrowed(s)),
ObjectVariant::LiteralString(s) | ObjectVariant::Name(s) => {
Ok(String::from_utf8_lossy(s))
}
_ => Err(ObjectError::TypeMismatch("String", object.name())),
}
}
Expand Down Expand Up @@ -324,8 +325,7 @@ impl ObjectVariant {

match object {
ObjectVariant::HexString(s) => Ok(s),
ObjectVariant::Name(s) => Ok(s.as_bytes()),
ObjectVariant::LiteralString(s) => Ok(s),
ObjectVariant::Name(s) | ObjectVariant::LiteralString(s) => Ok(s),
_ => Err(ObjectError::TypeMismatch("HexString", object.name())),
}
}
Expand Down
6 changes: 3 additions & 3 deletions crates/pdf-parser/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ impl ArrayParser for PdfParser<'_> {
objects: &dyn ObjectResolver,
) -> Result<Vec<ObjectVariant>, ParserError> {
self.tokenizer.expect(PdfToken::LeftSquareBracket)?;
self.skip_whitespace();
self.skip_whitespace_and_comments();

let mut values = Vec::new();
while let Some(token) = self.tokenizer.peek() {
self.skip_whitespace();
self.skip_whitespace_and_comments();

if let PdfToken::RightSquareBracket = token {
break;
Expand All @@ -54,7 +54,7 @@ impl ArrayParser for PdfParser<'_> {
if let Some(PdfToken::RightSquareBracket) = self.tokenizer.peek() {
break;
}
self.skip_whitespace();
self.skip_whitespace_and_comments();
}

self.tokenizer.expect(PdfToken::RightSquareBracket)?;
Expand Down
2 changes: 1 addition & 1 deletion crates/pdf-parser/src/cross_reference_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ mod tests {
raw_data: Vec<u8>,
) -> StreamObject {
let mut dict_map = BTreeMap::new();
dict_map.insert("Type".to_string(), ObjectVariant::Name("XRef".to_string()));
dict_map.insert("Type".to_string(), ObjectVariant::Name(b"XRef".to_vec()));
dict_map.insert("Size".to_string(), ObjectVariant::Integer(size as i64));
dict_map.insert(
"W".to_string(),
Expand Down
14 changes: 6 additions & 8 deletions crates/pdf-parser/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ impl DictionaryParser for PdfParser<'_> {
// Expect the opening `<<` of the dictionary.
self.tokenizer.expect(PdfToken::DoubleLeftAngleBracket)?;

// Skip whitespace
self.skip_whitespace();
self.skip_whitespace_and_comments();

let mut dictionary = BTreeMap::new();

Expand All @@ -59,19 +58,18 @@ impl DictionaryParser for PdfParser<'_> {
break;
}

// Skip whitespace
self.skip_whitespace();
self.skip_whitespace_and_comments();

// Parse key.
let key = self.parse_name()?;
// Parse key. Names are raw bytes, but the dictionary is keyed by `String`, so
// convert at this boundary; any non-UTF-8 bytes are replaced lossily.
let key = String::from_utf8_lossy(&self.parse_name()?).into_owned();

self.skip_whitespace();
self.skip_whitespace_and_comments();

// Parse object.
let object = self.parse_object(objects)?;

dictionary.insert(key, object);
self.skip_whitespace();
self.skip_whitespace_and_comments();
}

// Consume the closing `>>` of the dictionary.
Expand Down
121 changes: 60 additions & 61 deletions crates/pdf-parser/src/name.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,102 +61,101 @@ impl NameParser for PdfParser<'_> {
/// The decoded name as raw bytes (with hex escapes resolved),
/// or a `ParserError` if the input does not start with `/`, is empty after the `/`,
/// or contains an invalid hex escape sequence.
fn parse_name(&mut self) -> Result<String, Self::ErrorType> {
// let position = self.tokenizer.position.saturating_sub(20);
// println!("Parsing name object '{:?}'", String::from_utf8_lossy(&self.tokenizer.input[position..self.tokenizer.position+20]));
fn parse_name(&mut self) -> Result<Vec<u8>, Self::ErrorType> {
self.tokenizer.expect(PdfToken::Solidus)?;

let name = self.tokenizer.read_while_u8(|b| !Self::is_pdf_delimiter(b));
if name.is_empty() {
return Err(NameObjectError::InvalidToken);
}

let name = escape(name)?;

Ok(name)
escape(name)
}
}

/// Decodes escape sequences in PDF name objects.
/// Handles '#' followed by two hex digits by converting them to the corresponding ASCII character.
fn escape(input: &[u8]) -> Result<String, NameObjectError> {
let mut result = String::with_capacity(input.len());
let mut chars = input.iter();
/// Handles '#' followed by two hex digits by converting them to the corresponding byte value.
fn escape(input: &[u8]) -> Result<Vec<u8>, NameObjectError> {
let mut result = Vec::with_capacity(input.len());
let mut iter = input.iter();

while let Some(byte) = chars.next() {
while let Some(&byte) = iter.next() {
match byte {
b'#' => {
// Read the first hex digit character
let h1_byte = match chars.next() {
Some(b) => *b,
None => return Err(NameObjectError::IncompleteHexEscape),
};
// Read the second hex digit character
let h2_byte = match chars.next() {
Some(b) => *b,
None => return Err(NameObjectError::IncompleteHexEscape),
};

let h1_char = char::from(h1_byte);
let h2_char = char::from(h2_byte);
let h1 = *iter.next().ok_or(NameObjectError::IncompleteHexEscape)?;
let h2 = *iter.next().ok_or(NameObjectError::IncompleteHexEscape)?;

if !h1_char.is_ascii_hexdigit() {
return Err(NameObjectError::NonHexDigitInEscape(h1_char));
if !h1.is_ascii_hexdigit() {
return Err(NameObjectError::NonHexDigitInEscape(char::from(h1)));
}
if !h2_char.is_ascii_hexdigit() {
return Err(NameObjectError::NonHexDigitInEscape(h2_char));
if !h2.is_ascii_hexdigit() {
return Err(NameObjectError::NonHexDigitInEscape(char::from(h2)));
}

let hex_pair_str = String::from_iter([h1_char, h2_char]);
let byte_val = u8::from_str_radix(&hex_pair_str, 16).map_err(|e| {
NameObjectError::HexRadixError {
hex_pair: hex_pair_str,
reason: e.to_string(),
}
})?;
result.push(char::from(byte_val));
let high = hex_digit_value(h1);
let low = hex_digit_value(h2);
result.push((high << 4) | low);
}
_ => result.push(char::from(*byte)),
_ => result.push(byte),
}
}
Ok(result)
}

/// Maps one ASCII hexadecimal digit to the number it denotes (0 through 15).
///
/// Digits `0`-`9` map to 0-9; letters `a`-`f` and `A`-`F` map to 10-15
/// regardless of case. Any other byte yields 0, so callers are expected to
/// have validated the byte as a hex digit before calling.
const fn hex_digit_value(b: u8) -> u8 {
    match b {
        b'0'..=b'9' => b.saturating_sub(b'0'),
        // Offset within the letter range, shifted up past the ten decimal digits.
        b'a'..=b'f' => b.saturating_sub(b'a').saturating_add(10),
        b'A'..=b'F' => b.saturating_sub(b'A').saturating_add(10),
        _ => 0,
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
mod tests {
use super::*;

#[test]
fn test_name_object_valid() {
let valid_inputs: Vec<(&[u8], &str)> = vec![
(b"/Name\n", "Name"),
(b"/Name\t", "Name"),
(b"/Name1 ", "Name1"),
(b"/Name ", "Name"),
(b"/Name<", "Name"),
(b"/Name>", "Name"),
(b"/Name[", "Name"),
(b"/Name]", "Name"),
(b"/Name{", "Name"),
(b"/Name}", "Name"),
(b"/Name(abc)", "Name"),
(b"/Name", "Name"),
(b"/A#20Name", "A Name"),
(b"/D#23E#5fF", "D#E_F"),
(b"/A#20B", "A B"),
let valid_inputs: Vec<(&[u8], &[u8])> = vec![
(b"/Name\n", b"Name"),
(b"/Name\t", b"Name"),
(b"/Name1 ", b"Name1"),
(b"/Name ", b"Name"),
(b"/Name<", b"Name"),
(b"/Name>", b"Name"),
(b"/Name[", b"Name"),
(b"/Name]", b"Name"),
(b"/Name{", b"Name"),
(b"/Name}", b"Name"),
(b"/Name(abc)", b"Name"),
(b"/Name", b"Name"),
(b"/A#20Name", b"A Name"),
(b"/D#23E#5fF", b"D#E_F"),
(b"/A#20B", b"A B"),
];
for (input, expected) in valid_inputs {
let mut parser = PdfParser::from(input);
let value = parser.parse_name().unwrap();
if value != expected {
panic!(
"Expected `{}`, but got `{}` for input `{}`",
expected,
value,
String::from_utf8_lossy(input)
);
}
assert_eq!(
value,
expected,
"input: `{}`",
String::from_utf8_lossy(input)
);
}
}

Expand Down
15 changes: 3 additions & 12 deletions crates/pdf-parser/src/object_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,7 @@ mod tests {
let first = 10; // "10 0 11 3 " is 10 bytes

let mut dict_map = BTreeMap::new();
dict_map.insert(
"Type".to_string(),
ObjectVariant::Name("ObjStm".to_string()),
);
dict_map.insert("Type".to_string(), ObjectVariant::Name(b"ObjStm".to_vec()));
dict_map.insert("N".to_string(), ObjectVariant::Integer(2));
dict_map.insert("First".to_string(), ObjectVariant::Integer(first as i64));
dict_map.insert(
Expand Down Expand Up @@ -134,10 +131,7 @@ mod tests {
let first = 4; // "5 0 " is 4 bytes

let mut dict_map = BTreeMap::new();
dict_map.insert(
"Type".to_string(),
ObjectVariant::Name("ObjStm".to_string()),
);
dict_map.insert("Type".to_string(), ObjectVariant::Name(b"ObjStm".to_vec()));
dict_map.insert("N".to_string(), ObjectVariant::Integer(1));
dict_map.insert("First".to_string(), ObjectVariant::Integer(first as i64));
dict_map.insert(
Expand All @@ -158,10 +152,7 @@ mod tests {
assert_eq!(result[0].0, 5);
match &result[0].1 {
ObjectVariant::Dictionary(d) => {
assert_eq!(
d.get("Key"),
Some(&ObjectVariant::Name("Value".to_string()))
);
assert_eq!(d.get("Key"), Some(&ObjectVariant::Name(b"Value".to_vec())));
}
other => panic!("Expected dictionary, got {:?}", other),
}
Expand Down
20 changes: 20 additions & 0 deletions crates/pdf-parser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,26 @@ impl PdfParser<'_> {
let _ = self.tokenizer.read_while_u8(Self::is_pdf_whitespace);
}

/// Advances the tokenizer past any run of whitespace and `%` comments.
///
/// PDF spec Section 7.2.3 treats a comment as equivalent to whitespace, so
/// comments must be skipped wherever whitespace is permitted. Whitespace is
/// skipped first; then, for as long as the next token is a `%`, the comment
/// body is consumed up to (not including) the next `\n` or `\r`, an
/// end-of-line marker read is attempted, and whitespace is skipped again.
/// The method returns once the next token is neither whitespace nor `%`.
pub(crate) fn skip_whitespace_and_comments(&mut self) {
    self.skip_whitespace();
    while matches!(self.tokenizer.peek(), Some(PdfToken::Percent)) {
        // Drop the leading '%' itself, then the rest of the comment line.
        self.tokenizer.read();
        let _ = self
            .tokenizer
            .read_while_u8(|byte| !matches!(byte, b'\n' | b'\r'));
        // The outcome of the end-of-line read is deliberately discarded.
        let _ = self.try_read_end_of_line_marker();
        self.skip_whitespace();
    }
}

/// Parses a PDF object at a specific position in the input stream.
///
/// This function temporarily moves the tokenizer to the specified `position`, parses the object
Expand Down
2 changes: 1 addition & 1 deletion crates/pdf-parser/src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ pub trait LiteralStringParser {
pub trait NameParser {
type ErrorType;

fn parse_name(&mut self) -> Result<String, Self::ErrorType>;
fn parse_name(&mut self) -> Result<Vec<u8>, Self::ErrorType>;
}

pub trait NullObjectParser {
Expand Down