Skip to content

Commit

Permalink
Lex variable interpolations using custom segments
Browse files Browse the repository at this point in the history
  • Loading branch information
fwcd committed Mar 25, 2024
1 parent a44555c commit f2cdb4d
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 66 deletions.
162 changes: 99 additions & 63 deletions src/line/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,45 @@ operators! {
(Assign, assign, '='),
}

/// The kind of a string segment.
///
/// Distinguishes text that is taken verbatim from text that names a variable
/// to be interpolated. The enum carries no data, so it is cheap to copy and
/// can be used as a hash key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentKind {
    /// A literal string segment.
    Literal,
    /// A variable interpolation segment.
    Variable,
}

/// A fragment of a string token.
///
/// A `Token::String` holds a sequence of these segments; each one is either
/// literal text or the name of a variable to interpolate, as told by `kind`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
/// The text of the segment. For a variable segment this is the variable name
/// without the leading interpolation character (`$`).
pub text: String,
/// The kind of string segment. Indicates e.g. whether this segment represents an interpolation.
pub kind: SegmentKind,
}

impl Segment {
pub fn empty_literal() -> Self {
Self { text: String::new(), kind: SegmentKind::Literal }
}

pub fn empty_variable() -> Self {
Self { text: String::new(), kind: SegmentKind::Variable }
}
}

/// A token produced by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
Operator(Operator),
String(Vec<String>),
String(Vec<Segment>),
}

/// Tokenizes the line. This handles quoting and removes whitespace.
pub fn lex(line: &str) -> Result<Vec<Token>> {
let mut tokens = Vec::<Token>::new();
let mut current: Option<Vec<String>> = None;
let mut current: Option<Vec<Segment>> = None;
let mut it = line.chars().into_iter();
while let Some(c) = it.next() {
if let Ok(op) = Operator::try_from(c) { // Operator
Expand All @@ -63,30 +91,30 @@ pub fn lex(line: &str) -> Result<Vec<Token>> {
let mut is_escaped = false;
let mut in_interpolation = false;
if current.is_none() {
current = Some(vec![String::new()]);
current = Some(vec![Segment::empty_literal()]);
}
loop {
let Some(c) = it.next() else {
bail!("Unexpectedly reached end of {}-quoted string", quote);
};
if !is_escaped && c == '\\' {
is_escaped = true;
} else if !is_escaped && !in_interpolation && quote == '"' && c == '$' {
// Entering interpolation
in_interpolation = true;
current.as_mut().unwrap().push(Segment::empty_variable());
} else {
if !is_escaped {
if !in_interpolation && quote == '"' && c == '$' {
// Entering interpolation
in_interpolation = true;
current.as_mut().unwrap().push(String::new());
} else if in_interpolation && !c.is_ascii_alphanumeric() && c != '_' {
if in_interpolation && !c.is_ascii_alphanumeric() && c != '_' {
// Exiting interpolation
in_interpolation = false;
current.as_mut().unwrap().push(String::new());
current.as_mut().unwrap().push(Segment::empty_literal());
}
if c == quote {
break;
}
}
current.as_mut().unwrap().last_mut().unwrap().push(c);
current.as_mut().unwrap().last_mut().unwrap().text.push(c);
is_escaped = false;
}
}
Expand All @@ -97,10 +125,10 @@ pub fn lex(line: &str) -> Result<Vec<Token>> {
current = None;
} else { // Non-whitespace
if current.is_none() {
current = Some(vec![String::new()]);
current = Some(vec![Segment::empty_literal()]);
}
if let Some(current) = current.as_mut() {
current.last_mut().unwrap().push(c);
current.last_mut().unwrap().text.push(c);
}
}
}
Expand All @@ -111,20 +139,28 @@ pub fn lex(line: &str) -> Result<Vec<Token>> {
}

#[cfg(test)]
fn op(op: Operator) -> Token {
Token::Operator(op)
}
mod tests {
use super::{assign, lex, redirect, Operator, Segment, SegmentKind, Token};

#[cfg(test)]
fn string(s: impl IntoIterator<Item = &'static str>) -> Token {
Token::String(s.into_iter().map(|s| s.to_owned()).collect())
}
fn op(op: Operator) -> Token {
Token::Operator(op)
}

#[cfg(test)]
mod tests {
use crate::line::lex::op;
fn lit(s: &str) -> Segment {
Segment { text: s.to_owned(), kind: SegmentKind::Literal }
}

fn var(s: &str) -> Segment {
Segment { text: s.to_owned(), kind: SegmentKind::Variable }
}

use super::{string, assign, lex, redirect};
fn string(s: impl IntoIterator<Item = Segment>) -> Token {
Token::String(s.into_iter().map(|s| s.to_owned()).collect())
}

fn lit_string(s: impl IntoIterator<Item = &'static str>) -> Token {
string(s.into_iter().map(lit))
}

#[test]
fn whitespace() {
Expand All @@ -134,73 +170,73 @@ mod tests {

#[test]
fn simple_commands() {
assert_eq!(lex("ls").unwrap(), vec![string(["ls"])]);
assert_eq!(lex("ls /").unwrap(), vec![string(["ls"]), string(["/"])]);
assert_eq!(lex("ls hello").unwrap(), vec![string(["ls"]), string(["hello"])]);
assert_eq!(lex("ls /hello/ world").unwrap(), vec![string(["ls"]), string(["/hello/"]), string(["world"])]);
assert_eq!(lex(" ls / ").unwrap(), vec![string(["ls"]), string(["/"])]);
assert_eq!(lex(" pwd").unwrap(), vec![string(["pwd"])]);
assert_eq!(lex("echo Hello world123 !").unwrap(), vec![string(["echo"]), string(["Hello"]), string(["world123"]), string(["!"])]);
assert_eq!(lex("ls").unwrap(), vec![lit_string(["ls"])]);
assert_eq!(lex("ls /").unwrap(), vec![lit_string(["ls"]), lit_string(["/"])]);
assert_eq!(lex("ls hello").unwrap(), vec![lit_string(["ls"]), lit_string(["hello"])]);
assert_eq!(lex("ls /hello/ world").unwrap(), vec![lit_string(["ls"]), lit_string(["/hello/"]), lit_string(["world"])]);
assert_eq!(lex(" ls / ").unwrap(), vec![lit_string(["ls"]), lit_string(["/"])]);
assert_eq!(lex(" pwd").unwrap(), vec![lit_string(["pwd"])]);
assert_eq!(lex("echo Hello world123 !").unwrap(), vec![lit_string(["echo"]), lit_string(["Hello"]), lit_string(["world123"]), lit_string(["!"])]);
}

#[test]
fn quotes() {
assert_eq!(lex("''").unwrap(), vec![string([""])]);
assert_eq!(lex(r#""""#).unwrap(), vec![string([""])]);
assert_eq!(lex(r#" "" "" "" "#).unwrap(), vec![string([""]), string([""]), string([""])]);
assert_eq!(lex(r#" """" "" "#).unwrap(), vec![string([""]), string([""])]);
assert_eq!(lex(r#" "''" "" "#).unwrap(), vec![string(["''"]), string([""])]);
assert_eq!(lex(r#"echo "Hello world " 1234"#).unwrap(), vec![string(["echo"]), string(["Hello world "]), string(["1234"])]);
assert_eq!(lex(r#"echo '"Hello 'world 1234"#).unwrap(), vec![string(["echo"]), string(["\"Hello world"]), string(["1234"])]);
assert_eq!(lex("''").unwrap(), vec![lit_string([""])]);
assert_eq!(lex(r#""""#).unwrap(), vec![lit_string([""])]);
assert_eq!(lex(r#" "" "" "" "#).unwrap(), vec![lit_string([""]), lit_string([""]), lit_string([""])]);
assert_eq!(lex(r#" """" "" "#).unwrap(), vec![lit_string([""]), lit_string([""])]);
assert_eq!(lex(r#" "''" "" "#).unwrap(), vec![lit_string(["''"]), lit_string([""])]);
assert_eq!(lex(r#"echo "Hello world " 1234"#).unwrap(), vec![lit_string(["echo"]), lit_string(["Hello world "]), lit_string(["1234"])]);
assert_eq!(lex(r#"echo '"Hello 'world 1234"#).unwrap(), vec![lit_string(["echo"]), lit_string(["\"Hello world"]), lit_string(["1234"])]);
assert!(lex(r#" "'' "" "#).is_err());
assert!(lex(r#"echo "Hello world 1234"#).is_err());
}

#[test]
fn escapes() {
assert!(lex("'''").is_err());
assert_eq!(lex(r#"'\''"#).unwrap(), vec![string(["'"])]);
assert_eq!(lex(r#""\'""#).unwrap(), vec![string(["'"])]);
assert_eq!(lex(r#"'\"'"#).unwrap(), vec![string(["\""])]);
assert_eq!(lex(r#"Hello " \"world\"""#).unwrap(), vec![string(["Hello"]), string([" \"world\""])]);
assert_eq!(lex(r#""\\""#).unwrap(), vec![string(["\\"])]);
assert_eq!(lex(r#"'This\\\\is a double escape'"#).unwrap(), vec![string(["This\\\\is a double escape"])]);
assert_eq!(lex(r#""Escaped dollar sign: \$test 123""#).unwrap(), vec![string(["Escaped dollar sign: $test 123"])]);
assert_eq!(lex(r#"'\''"#).unwrap(), vec![lit_string(["'"])]);
assert_eq!(lex(r#""\'""#).unwrap(), vec![lit_string(["'"])]);
assert_eq!(lex(r#"'\"'"#).unwrap(), vec![lit_string(["\""])]);
assert_eq!(lex(r#"Hello " \"world\"""#).unwrap(), vec![lit_string(["Hello"]), lit_string([" \"world\""])]);
assert_eq!(lex(r#""\\""#).unwrap(), vec![lit_string(["\\"])]);
assert_eq!(lex(r#"'This\\\\is a double escape'"#).unwrap(), vec![lit_string(["This\\\\is a double escape"])]);
assert_eq!(lex(r#""Escaped dollar sign: \$test 123""#).unwrap(), vec![lit_string(["Escaped dollar sign: $test 123"])]);
assert!(lex(r#"'Unclosed: \\\'"#).is_err());
// TODO: We should handle backslashes outside quoted contexts too
assert_eq!(lex("\\").unwrap(), vec![string(["\\"])]);
assert_eq!(lex("\\").unwrap(), vec![lit_string(["\\"])]);
// TODO: Should we insert the backslash with unrecognized characters? Or error?
assert_eq!(lex(r#"'\another char'"#).unwrap(), vec![string(["another char"])]);
assert_eq!(lex(r#"'\another char'"#).unwrap(), vec![lit_string(["another char"])]);
}

#[test]
fn interpolations() {
assert_eq!(lex("'test$x'").unwrap(), vec![string(["test$x"])]);
assert_eq!(lex(r#""$abc""#).unwrap(), vec![string(["", "$abc", ""])]);
assert_eq!(lex(r#""test$x""#).unwrap(), vec![string(["test", "$x", ""])]);
assert_eq!(lex(r#""$var_with_underscore abc""#).unwrap(), vec![string(["", "$var_with_underscore", " abc"])]);
assert_eq!(lex(r#""$var_with-hyphen""#).unwrap(), vec![string(["", "$var_with", "-hyphen"])]);
assert_eq!(lex(r#""/$var_with/slash""#).unwrap(), vec![string(["/", "$var_with", "/slash"])]);
assert_eq!(lex("'test$x'").unwrap(), vec![lit_string(["test$x"])]);
assert_eq!(lex(r#""$abc""#).unwrap(), vec![string([lit(""), var("abc"), lit("")])]);
assert_eq!(lex(r#""test$x""#).unwrap(), vec![string([lit("test"), var("x"), lit("")])]);
assert_eq!(lex(r#""$var_with_underscore abc""#).unwrap(), vec![string([lit(""), var("var_with_underscore"), lit(" abc")])]);
assert_eq!(lex(r#""$var_with-hyphen""#).unwrap(), vec![string([lit(""), var("var_with"), lit("-hyphen")])]);
assert_eq!(lex(r#""/$var_with/slash""#).unwrap(), vec![string([lit("/"), var("var_with"), lit("/slash")])]);
}

#[test]
fn redirects() {
assert_eq!(lex(">").unwrap(), vec![op(redirect())]);
assert_eq!(lex(">>").unwrap(), vec![op(redirect()), op(redirect())]);
assert_eq!(lex(">a").unwrap(), vec![op(redirect()), string(["a"])]);
assert_eq!(lex(">1").unwrap(), vec![op(redirect()), string(["1"])]);
assert_eq!(lex(" >0> 1").unwrap(), vec![op(redirect()), string(["0"]), op(redirect()), string(["1"])]);
assert_eq!(lex("echo Test > a").unwrap(), vec![string(["echo"]), string(["Test"]), op(redirect()), string(["a"])]);
assert_eq!(lex(r#"echo '{"x": 23,"y":3}' > /dev/null"#).unwrap(), vec![string(["echo"]), string([r#"{"x": 23,"y":3}"#]), op(redirect()), string(["/dev/null"])])
assert_eq!(lex(">a").unwrap(), vec![op(redirect()), lit_string(["a"])]);
assert_eq!(lex(">1").unwrap(), vec![op(redirect()), lit_string(["1"])]);
assert_eq!(lex(" >0> 1").unwrap(), vec![op(redirect()), lit_string(["0"]), op(redirect()), lit_string(["1"])]);
assert_eq!(lex("echo Test > a").unwrap(), vec![lit_string(["echo"]), lit_string(["Test"]), op(redirect()), lit_string(["a"])]);
assert_eq!(lex(r#"echo '{"x": 23,"y":3}' > /dev/null"#).unwrap(), vec![lit_string(["echo"]), string([r#"{"x": 23,"y":3}"#].map(lit)), op(redirect()), lit_string(["/dev/null"])])
}

#[test]
fn assignments() {
assert_eq!(lex(r#"hello="123""#).unwrap(), vec![string(["hello"]), op(assign()), string(["123"])]);
assert_eq!(lex(r#"hello ="1""#).unwrap(), vec![string(["hello"]), op(assign()), string(["1"])]);
assert_eq!(lex(r#"hello = "1""#).unwrap(), vec![string(["hello"]), op(assign()), string(["1"])]);
assert_eq!(lex(r#"hello='"123"'"#).unwrap(), vec![string(["hello"]), op(assign()), string(["\"123\""])]);
assert_eq!(lex(r#"hello '="123"'"#).unwrap(), vec![string(["hello"]), string(["=\"123\""])]);
assert_eq!(lex(r#"hello'="123"'"#).unwrap(), vec![string(["hello=\"123\""])]);
assert_eq!(lex(r#"hello="123""#).unwrap(), vec![lit_string(["hello"]), op(assign()), lit_string(["123"])]);
assert_eq!(lex(r#"hello ="1""#).unwrap(), vec![lit_string(["hello"]), op(assign()), lit_string(["1"])]);
assert_eq!(lex(r#"hello = "1""#).unwrap(), vec![lit_string(["hello"]), op(assign()), lit_string(["1"])]);
assert_eq!(lex(r#"hello='"123"'"#).unwrap(), vec![lit_string(["hello"]), op(assign()), lit_string(["\"123\""])]);
assert_eq!(lex(r#"hello '="123"'"#).unwrap(), vec![lit_string(["hello"]), lit_string(["=\"123\""])]);
assert_eq!(lex(r#"hello'="123"'"#).unwrap(), vec![lit_string(["hello=\"123\""])]);
}
}
6 changes: 3 additions & 3 deletions src/line/parse.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use anyhow::{bail, Result};
use multipeek::{IteratorExt, MultiPeek};

use super::lex::{lex, Operator, Token};
use super::lex::{lex, Operator, Segment, Token};

/// A fragment of an argument (a string fragment after evaluation).
#[derive(Debug, Clone, PartialEq, Eq)]
Expand Down Expand Up @@ -108,9 +108,9 @@ fn parse_command<T>(tokens: &mut MultiPeek<T>) -> Result<Command> where T: Itera
))
}

fn parse_argument(args: &[String]) -> Result<Argument> {
fn parse_argument(segments: &[Segment]) -> Result<Argument> {
// TODO
let fragments = args.into_iter().map(|a| Fragment::Literal(a.to_owned())).collect();
let fragments = segments.into_iter().map(|a| Fragment::Literal(a.text.to_owned())).collect();
Ok(Argument { fragments })
}

Expand Down

0 comments on commit f2cdb4d

Please sign in to comment.