Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Improve quote handling in lexer #3274

Merged
merged 8 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@

**Features**:

- Strings can be delimited with any odd number of quote characters. The logic
for lexing quotes is now simpler and slightly faster. Escapes in
single-quote-delimited strings escape single-quotes rather than double-quotes.
(@max-sixty, #3274)

**Fixes**:

- `prolc` no longer displays a prompt when piping a query into its stdin
- `prqlc` no longer displays a prompt when piping a query into its stdin
(@max-sixty, #3248).

**Documentation**:
Expand Down
128 changes: 82 additions & 46 deletions crates/prql-parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,62 +293,67 @@ fn literal() -> impl Parser<char, Literal, Error = Cheap<char>> {
}

fn quoted_string(escaped: bool) -> impl Parser<char, String, Error = Cheap<char>> {
// I don't know how this could be simplified and implemented for (n*2)+1 in general
choice((
quoted_string_inner(r#"""""""""#, escaped),
quoted_string_inner(r#"""""""#, escaped),
quoted_string_inner(r#"""""#, escaped),
quoted_string_inner(r#"""#, escaped),
quoted_string_inner(r#"'''''''"#, escaped),
quoted_string_inner(r#"'''''"#, escaped),
quoted_string_inner(r#"'''"#, escaped),
quoted_string_inner(r#"'"#, escaped),
quoted_string_of_quote(&'"', escaped),
quoted_string_of_quote(&'\'', escaped),
))
.collect::<String>()
.labelled("string")
}

fn quoted_string_inner(
quotes: &str,
fn quoted_string_of_quote(
quote: &char,
escaping: bool,
) -> impl Parser<char, Vec<char>, Error = Cheap<char>> + '_ {
let mut forbidden = just(quotes).boxed();

if escaping {
forbidden = just(quotes).or(just("\\")).boxed()
};

let mut inner = forbidden.not().boxed();

if escaping {
inner = inner
.or(just('\\').ignore_then(
just('\\')
.or(just('/'))
.or(just('"'))
.or(just('b').to('\x08'))
.or(just('f').to('\x0C'))
.or(just('n').to('\n'))
.or(just('r').to('\r'))
.or(just('t').to('\t'))
.or(just('u').ignore_then(
filter(|c: &char| c.is_ascii_hexdigit())
.repeated()
.exactly(4)
.collect::<String>()
.validate(|digits, span, emit| {
char::from_u32(u32::from_str_radix(&digits, 16).unwrap())
.unwrap_or_else(|| {
emit(Cheap::expected_input_found(span, None, None));
'\u{FFFD}' // unicode replacement character
})
}),
)),
let opening = just(*quote).repeated().at_least(1);

opening.then_with(move |opening| {
if opening.len() % 2 == 0 {
// If we have an even number of quotes, it's an empty string.
return (just(vec![])).boxed();
}
let delimiter = just(*quote).repeated().exactly(opening.len());

let inner = if escaping {
choice((
// If we're escaping, don't allow consuming a backslash
// We need the `vec` to satisfy the type checker
(delimiter.or(just(vec!['\\']))).not(),
escaped_character(),
// Or escape the quote char of the current string
just('\\').ignore_then(just(*quote)),
))
.boxed();
}
.boxed()
} else {
delimiter.not().boxed()
};

inner.repeated().then_ignore(delimiter).boxed()
})
}

inner.repeated().delimited_by(just(quotes), just(quotes))
/// Parses a single backslash escape sequence and yields the character it denotes.
///
/// After the leading `\`, the following are accepted:
/// - `\` and `/`, which yield themselves;
/// - `b`, `f`, `n`, `r`, `t`, which yield backspace (0x08), form feed (0x0C),
///   newline, carriage return and tab respectively;
/// - `u` followed by exactly four ASCII hex digits, which yields the character
///   with that code point.
///
/// Quote characters are not part of this set.
fn escaped_character() -> impl Parser<char, char, Error = Cheap<char>> {
    // `uXXXX`: exactly four hex digits naming a code point.
    let unicode_escape = just('u').ignore_then(
        filter(|c: &char| c.is_ascii_hexdigit())
            .repeated()
            .exactly(4)
            .collect::<String>()
            .validate(|hex, span, emit| {
                // Four ASCII hex digits always fit in a `u32`, so this parse cannot fail.
                let code_point = u32::from_str_radix(&hex, 16).unwrap();
                match char::from_u32(code_point) {
                    Some(ch) => ch,
                    None => {
                        // Not a valid `char` (e.g. a surrogate): report an error
                        // and substitute a placeholder.
                        emit(Cheap::expected_input_found(span, None, None));
                        '\u{FFFD}' // unicode replacement character
                    }
                }
            }),
    );

    // Everything that may follow the backslash.
    let escape_body = choice((
        just('\\'),
        just('/'),
        just('b').to('\x08'),
        just('f').to('\x0C'),
        just('n').to('\n'),
        just('r').to('\r'),
        just('t').to('\t'),
        unicode_escape,
    ));

    just('\\').ignore_then(escape_body)
}

fn digits(count: usize) -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
Expand Down Expand Up @@ -387,3 +392,34 @@ impl std::hash::Hash for Token {
}

impl std::cmp::Eq for Token {}

#[test]
fn quotes() {
use insta::assert_snapshot;

// All these are valid & equal.
assert_snapshot!(quoted_string(false).parse(r#"'aoeu'"#).unwrap(), @"aoeu");
assert_snapshot!(quoted_string(false).parse(r#"'''aoeu'''"#).unwrap(), @"aoeu");
assert_snapshot!(quoted_string(false).parse(r#"'''''aoeu'''''"#).unwrap(), @"aoeu");
assert_snapshot!(quoted_string(false).parse(r#"'''''''aoeu'''''''"#).unwrap(), @"aoeu");

// An even number is interpreted as a closed string (and the remainder is unparsed)
assert_snapshot!(quoted_string(false).parse(r#"''aoeu''"#).unwrap(), @"");

// When not escaping, we take the inner string between the three quotes
assert_snapshot!(quoted_string(false).parse(r#""""\"hello\""""#).unwrap(), @r###"\"hello\"###);

assert_snapshot!(quoted_string(true).parse(r#""""\"hello\"""""#).unwrap(), @r###""hello""###);

// Escape each inner quote depending on the outer quote
assert_snapshot!(quoted_string(true).parse(r#""\"hello\"""#).unwrap(), @r###""hello""###);
assert_snapshot!(quoted_string(true).parse(r#"'\'hello\''"#).unwrap(), @"'hello'");
Comment on lines +414 to +416
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was previously handled incorrectly, IIUC


assert_snapshot!(quoted_string(true).parse(r#"''"#).unwrap(), @"");

// An empty input should fail
quoted_string(false).parse(r#""#).unwrap_err();

// An even number of quotes is an empty string
assert_snapshot!(quoted_string(true).parse(r#"''''''"#).unwrap(), @"");
}
3 changes: 1 addition & 2 deletions web/book/src/reference/syntax/literals.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,7 @@ implementation.
### Quoting quotations

To quote a string containing quotes, escape the quotes, use the "other" type of
quote, or an odd number{{footnote: currently up to 7}} of quotes, and close with
the same number.
quote, or use a larger odd number of quotes and close with the same number.

```prql
from artists
Expand Down