From 6b79fcd231b085ba914d9e582d4cef796abdcf59 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 5 Jan 2024 19:21:47 -0800 Subject: [PATCH] feat: Allow nested interpolated strings in lexer --- .../prql-compiler/tests/integration/parser.rs | 8 +- prqlc/prqlc-parser/src/expr.rs | 6 +- prqlc/prqlc-parser/src/interpolation.rs | 4 +- prqlc/prqlc-parser/src/lexer.rs | 489 +++++++++++++++++- 4 files changed, 488 insertions(+), 19 deletions(-) diff --git a/prqlc/prql-compiler/tests/integration/parser.rs b/prqlc/prql-compiler/tests/integration/parser.rs index 8d4b62da9d37..6bf25ed48a06 100644 --- a/prqlc/prql-compiler/tests/integration/parser.rs +++ b/prqlc/prql-compiler/tests/integration/parser.rs @@ -1328,10 +1328,12 @@ fn test_func_call() { "###); // A non-friendly option for #154 - let ast = parse_expr(r#"count s'*'"#).unwrap(); - let func_call: FuncCall = ast.kind.into_func_call().unwrap(); assert_yaml_snapshot!( - func_call, @r###" + parse_expr(r#"count s'*'"#) + .unwrap() + .kind + .into_func_call() + .unwrap(), @r###" --- name: Ident: diff --git a/prqlc/prqlc-parser/src/expr.rs b/prqlc/prqlc-parser/src/expr.rs index ef0a3995b2e4..f25dfcc97900 100644 --- a/prqlc/prqlc-parser/src/expr.rs +++ b/prqlc/prqlc-parser/src/expr.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use chumsky::prelude::*; +use itertools::Itertools; use prqlc_ast::expr::*; use prqlc_ast::Span; @@ -85,12 +86,15 @@ pub fn expr() -> impl Parser + Clone { |_| Expr::new(ExprKind::Literal(Literal::Null)), )); + // TODO: temporary implementation to keep it working on the new + // interpolation lexer with the old interpolation parser let interpolation = select! { Token::Interpolation('s', string) => (ExprKind::SString as fn(_) -> _, string), Token::Interpolation('f', string) => (ExprKind::FString as fn(_) -> _, string), } .validate(|(finish, string), span: ParserSpan, emit| { - match interpolation::parse(string, span + 2) { + match interpolation::parse(string.into_iter().map(|x| x.to_string()).join(""), span + 2) + { Ok(items) => finish(items), Err(errors) => { for err in errors { diff --git a/prqlc/prqlc-parser/src/interpolation.rs b/prqlc/prqlc-parser/src/interpolation.rs index cf9cd53648b9..6fe5a57e1e93 100644 --- a/prqlc/prqlc-parser/src/interpolation.rs +++ b/prqlc/prqlc-parser/src/interpolation.rs @@ -8,9 +8,11 @@ use super::common::{into_expr, PError}; use super::lexer::*; use super::span::ParserSpan; +use prqlc_ast::expr::InterpolateItem; + /// Parses interpolated strings pub fn parse(string: String, span_base: ParserSpan) -> Result, Vec> { - let res = parser(span_base).parse(string); + let res = parser(span_base).parse(dbg!(string)); match res { Ok(items) => Ok(items), diff --git a/prqlc/prqlc-parser/src/lexer.rs b/prqlc/prqlc-parser/src/lexer.rs index 41180e351c79..ee7cef17a814 100644 --- a/prqlc/prqlc-parser/src/lexer.rs +++ b/prqlc/prqlc-parser/src/lexer.rs @@ -4,6 +4,7 @@ use chumsky::{ text::{newline, Character}, }; +use itertools::Itertools; use prqlc_ast::expr::*; #[derive(Clone, PartialEq, Debug)] @@ -19,7 +20,7 @@ pub enum Token { bind_left: bool, bind_right: bool, }, - Interpolation(char, String), + Interpolation(char, Vec), /// single-char control tokens Control(char), @@ -39,7 +40,19 @@ pub enum Token { Annotate, // @ } +#[derive(Clone, PartialEq, Debug)] +pub enum InterpolateItem { + String(String), + Expr(Vec, Option), +} + +/// Lex tokens until the end of the input pub fn lexer() -> impl Parser)>, Error = Cheap> { + lex_token().repeated().then_ignore(end()) +} + +/// Lex a single token +pub fn lex_token() -> impl Parser), Error = Cheap> { let whitespace = filter(|x: &char| x.is_inline_whitespace()) .repeated() .at_least(1) @@ -85,11 +98,6 @@ pub fn lexer() -> impl Parser)>, Error .ignore_then(filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.').repeated()) .collect::() .map(Token::Param); - - let interpolation = one_of("sf") - .then(quoted_string(true)) - .map(|(c, s)| Token::Interpolation(c, s)); - // I think declaring this and then cloning will be more performant than // calling the function on each invocation. // https://github.com/zesterer/chumsky/issues/501 would allow us to avoid @@ -100,14 +108,17 @@ pub fn lexer() -> impl Parser)>, Error let token = choice(( newline.to(Token::NewLine), control_multi, - interpolation, + interpolation(), param, control, literal, keyword, ident, )) - .recover_with(skip_then_retry_until([]).skip_start()); + // TODO: I think this now needs to be able to fail without recovering, since we use + // it in the interpolation lexer + // .recover_with(skip_then_retry_until([]).skip_start()); + ; let comment = just('#') .then(newline.not().repeated()) @@ -142,11 +153,10 @@ pub fn lexer() -> impl Parser)>, Error choice(( range, - ignored.ignore_then(token.map_with_span(|tok, span| (tok, span))), + ignored + .ignore_then(token.map_with_span(|tok, span| (tok, span))) + .then_ignore(ignored), )) - .repeated() - .then_ignore(ignored) - .then_ignore(end()) } pub fn ident_part() -> impl Parser> + Clone { @@ -158,6 +168,12 @@ pub fn ident_part() -> impl Parser> + Clone { plain.or(backticks).collect() } +fn interpolation() -> impl Parser> { + one_of("sf") + .then(interpolated_string(&'"', true).or(interpolated_string(&'\'', true))) + .map(|(c, s)| Token::Interpolation(c, s)) +} + fn literal() -> impl Parser> { let binary_notation = just("0b") .then_ignore(just("_").or_not()) @@ -352,6 +368,76 @@ fn quoted_string(escaped: bool) -> impl Parser .labelled("string") } +fn interpolated_string( + quote: &char, + escaping: bool, +) -> impl Parser, Error = Cheap> + '_ { + let opening = just(*quote).repeated().at_least(1); + + opening.then_with(move |opening| { + if opening.len() % 2 == 0 { + // If we have an even number of quotes, it's an empty string. + return chumsky::prelude::empty().to(vec![]).boxed(); + } + let delimiter = just(*quote).repeated().exactly(opening.len()); + + let string = if escaping { + choice(( + // Convert double braces to single braces + just(vec!['{', '{']).to('{'), + just(vec!['}', '}']).to('}'), + // Don't allow consuming a backslash, or the start of an expr + choice((delimiter, just(vec!['\\']), just(vec!['{']))).not(), + escaped_character(), + // Or escape the quote char or `{` of the current string (though + // maybe we don't need to allow the escaping of `{` given we + // have `{{`?) + just('\\').ignore_then(just(*quote).or(just('{'))), + )) + .boxed() + } else { + choice(( + just(vec!['{', '{']).to('{'), + just(vec!['}', '}']).to('}'), + choice((delimiter, just(vec!['{']))).not(), + )) + .boxed() + } + .repeated() + .at_least(1) + .collect() + .map(InterpolateItem::String); + + choice((interpolated_expr(), string)) + .repeated() + .at_least(1) + .then_ignore(delimiter) + .boxed() + }) +} + +fn interpolated_expr() -> impl Parser> { + // Don't allow two opening `{{`, since that's converted to a string of `{` + let inner = just('{').repeated().at_least(2).not().rewind().ignore_then( + // TODO: decide how we want to handle colons in interpolated expressions + // We use rewinds to look ahead and ensure we don't have a closing + // bracket (or colon), before forwarding that to the lexer. + filter(|c| *c != '}' && *c != ':') + .rewind() + .ignore_then(lex_token()) + .repeated() + .then( + just(':') + .ignore_then(filter(|c| *c != '}').repeated().collect::()) + .or_not(), + ), + ); + + inner + .delimited_by(just('{'), just('}')) + .map(|(e, f)| InterpolateItem::Expr(e.into_iter().map(|(tok, _)| tok).collect(), f)) +} + fn quoted_string_of_quote( quote: &char, escaping: bool, @@ -361,7 +447,7 @@ fn quoted_string_of_quote( opening.then_with(move |opening| { if opening.len() % 2 == 0 { // If we have an even number of quotes, it's an empty string. - return (just(vec![])).boxed(); + return chumsky::prelude::empty().to(vec![]).boxed(); } let delimiter = just(*quote).repeated().exactly(opening.len()); @@ -500,12 +586,75 @@ impl std::fmt::Display for Token { if *bind_right { "" } else { " " } ), Token::Interpolation(c, s) => { - write!(f, "{c}\"{}\"", s) + write!(f, r#"{c}"{}""#, s.iter().map(|x| x.to_string()).join("")) } } } } +impl std::fmt::Display for InterpolateItem { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + InterpolateItem::String(s) => { + let s = s.replace('{', "{{").replace('}', "}}"); + write!(f, r#"{}"#, s) + } + InterpolateItem::Expr(expr, format) => { + let expr = expr.iter().map(|x| x.to_string()).join(""); + if let Some(format) = format { + write!(f, "{{{}:{}}}", expr, format) + } else { + write!(f, "{{{}}}", expr) + } + } + } + } +} + +#[test] +fn interpolate_item_display() { + use insta::assert_display_snapshot; + + assert_display_snapshot!(InterpolateItem::String("hello".to_string()), @"hello"); + assert_display_snapshot!(Token::Ident("hello".to_string()), @"`hello`"); + assert_display_snapshot!( + InterpolateItem::Expr(vec![Token::Ident("hello".to_string())], None), + @"{`hello`}" + ); + assert_display_snapshot!( + InterpolateItem::Expr(vec![Token::Ident("hello".to_string())], Some("s".to_string())), + @"{`hello`:s}" + ); +} + +#[test] +fn test_lexer() { + use insta::assert_debug_snapshot; + + assert_debug_snapshot!(lex_token().repeated().parse(r"{test}").unwrap(), @r###" + [ + ( + Control( + '{', + ), + 0..1, + ), + ( + Ident( + "test", + ), + 1..5, + ), + ( + Control( + '}', + ), + 5..6, + ), + ] + "###); +} + #[test] fn test_line_wrap() { use insta::assert_debug_snapshot; @@ -632,3 +781,315 @@ fn quotes() { // Unicode escape assert_snapshot!(quoted_string(true).parse(r"'\u{01f422}'").unwrap(), @"🐢"); } + +#[test] +fn test_interpolated_expr() { + use insta::assert_debug_snapshot; + + assert_debug_snapshot!(interpolated_expr().parse_recovery_verbose("{hello}"), @r###" + ( + Some( + Expr( + [ + Ident( + "hello", + ), + ], + None, + ), + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_expr().parse_recovery_verbose("{hello + 5}"), @r###" + ( + Some( + Expr( + [ + Ident( + "hello", + ), + Control( + '+', + ), + Literal( + Integer( + 5, + ), + ), + ], + None, + ), + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_expr().parse_recovery_verbose("{hello}{foo}"), @r###" + ( + Some( + Expr( + [ + Ident( + "hello", + ), + ], + None, + ), + ), + [], + ) + "###); +} + +#[test] +fn interpolate() { + use insta::assert_debug_snapshot; + + let interpolated_string = interpolated_string(&'"', false); + + assert_debug_snapshot!(interpolated_string.parse_recovery_verbose(r#""{hello + 5}""#), @r###" + ( + Some( + [ + Expr( + [ + Ident( + "hello", + ), + Control( + '+', + ), + Literal( + Integer( + 5, + ), + ), + ], + None, + ), + ], + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_string.parse_recovery_verbose(r#""hello""#), @r###" + ( + Some( + [ + String( + "hello", + ), + ], + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_string.parse(r#""{hello}world""#).unwrap(), @r###" + [ + Expr( + [ + Ident( + "hello", + ), + ], + None, + ), + String( + "world", + ), + ] + "###); + + assert_debug_snapshot!(interpolated_string.parse_recovery_verbose(r#""{hello}{world}""#), @r###" + ( + Some( + [ + Expr( + [ + Ident( + "hello", + ), + ], + None, + ), + Expr( + [ + Ident( + "world", + ), + ], + None, + ), + ], + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_string.parse_recovery_verbose(r#""{hello}world""#), @r###" + ( + Some( + [ + Expr( + [ + Ident( + "hello", + ), + ], + None, + ), + String( + "world", + ), + ], + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_string.parse_recovery_verbose(r#""{hello + 5}world""#), @r###" + ( + Some( + [ + Expr( + [ + Ident( + "hello", + ), + Control( + '+', + ), + Literal( + Integer( + 5, + ), + ), + ], + None, + ), + String( + "world", + ), + ], + ), + [], + ) + "###); + + assert_debug_snapshot!(interpolated_string.parse_recovery_verbose(r#""print('{{hello}}')""#), @r###" + ( + Some( + [ + String( + "print('{hello}')", + ), + ], + ), + [], + ) + "###); +} + +#[test] +fn test_interpolate_lex() { + use insta::assert_debug_snapshot; + + assert_debug_snapshot!(lexer().parse_recovery_verbose(r#"s"hello{world}""#), @r###" + ( + Some( + [ + ( + Interpolation( + 's', + [ + String( + "hello", + ), + Expr( + [ + Ident( + "world", + ), + ], + None, + ), + ], + ), + 0..15, + ), + ], + ), + [], + ) + "###); + + assert_debug_snapshot!(lexer().parse_recovery_verbose(r#"s"hello{{world}}""#), @r###" + ( + Some( + [ + ( + Interpolation( + 's', + [ + String( + "hello{world}", + ), + ], + ), + 0..17, + ), + ], + ), + [], + ) + "###); +} + +#[test] +fn test_interpolated_display() { + use insta::assert_display_snapshot; + + fn roundtrip(s: &str) -> String { + let parsed = interpolation().parse(s).unwrap(); + parsed.to_string() + } + + assert_display_snapshot!(roundtrip(r#"s"{hello}world""#), @r###"s"{`hello`}world""###); + + assert_display_snapshot!( + InterpolateItem::String("hello".to_string()), + @"hello" + ); + + assert_display_snapshot!( + InterpolateItem::Expr(vec![Token::Ident("hello".to_string())], None), + @"{`hello`}" + ); + + assert_display_snapshot!( + InterpolateItem::Expr( + vec![Token::Ident("hello".to_string()), Token::Control('+'), Token::Literal(Literal::Integer(3))], + None + ) + , + @"{`hello`+3}" + ); + + assert_display_snapshot!( + InterpolateItem::Expr( + vec![Token::Ident("hello".to_string()), Token::Control('+'), Token::Literal(Literal::Integer(3))], + Some("fmt".to_string()) + ) + , + @"{`hello`+3:fmt}" + ); + + assert_display_snapshot!( + InterpolateItem::String("a{bracket}".to_string()), + @"a{{bracket}}" + ); +}