From e41ea7a05b0b38972c89d4e3cfffc5340f4061b0 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Fri, 17 May 2024 21:56:28 +0900 Subject: [PATCH 01/15] =?UTF-8?q?[remove]=20LexIterator=20=E5=89=8A?= =?UTF-8?q?=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/driver.rs | 17 ++++----- crates/algorithm_lr1/src/lib.rs | 4 +-- crates/core/src/lex.rs | 57 +++++++++--------------------- crates/core/src/parse.rs | 4 +-- 4 files changed, 30 insertions(+), 52 deletions(-) diff --git a/crates/algorithm_lr1/src/driver.rs b/crates/algorithm_lr1/src/driver.rs index d22c366..0281c2f 100644 --- a/crates/algorithm_lr1/src/driver.rs +++ b/crates/algorithm_lr1/src/driver.rs @@ -1,5 +1,5 @@ use core::cfg::{TokenSet, Syntax}; -use core::lex::LexIterator; +use core::lex::Token; use super::builder::{LRAction, LR1Configure}; @@ -19,7 +19,7 @@ where pub fn run<'c>( &self, - lexer: &mut impl LexIterator<'a, 'c, T>, + lexer: &mut impl Iterator>, ) -> anyhow::Result<()> { let mut stack = vec![0]; loop { @@ -46,12 +46,13 @@ where stack.push(self.0.goto_table[stack[stack.len() - 1]][*goto]); } LRAction::None => { - let pos = lexer.pos(); - let pos = match action.1 { - Some(raw) => (pos.0, pos.1 - (raw.len() as u32)), - None => pos, - }; - return Err(anyhow::anyhow!("Error at {:?}", pos).into()); + // let pos = lexer.pos(); + // let pos = match action.1 { + // Some(raw) => (pos.0, pos.1 - (raw.len() as u32)), + // None => pos, + // }; + // return Err(anyhow::anyhow!("Error at {:?}", pos).into()); + return Err(anyhow::anyhow!("Error",).into()); } LRAction::Accept => return Ok(()), } diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index ab0936b..1df9737 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -4,7 +4,7 @@ mod driver; use serde::{Serialize, Deserialize}; use core::cfg::{TokenSet, Syntax}; -use core::lex::LexIterator; +use core::lex::Token; use core::parse::ParserImpl; use builder::LR1Configure; @@ -31,7 +31,7 @@ where fn parse<'b>( &self, - mut lexer: impl LexIterator<'a, 'b, T>, + mut lexer: impl Iterator>, ) -> anyhow::Result { LR1Driver::new(&self.0).run(&mut lexer) } diff --git a/crates/core/src/lex.rs b/crates/core/src/lex.rs index 2c7b703..d0d2d9f 100644 --- a/crates/core/src/lex.rs +++ b/crates/core/src/lex.rs @@ -31,32 +31,17 @@ impl<'a, 'b, T: TokenSet<'a>> Token<'a, 'b, T> { } } -pub struct Lexer; +pub(crate) struct Lexer; impl Lexer { - pub fn new<'a, 'b, T>(input: &'b str) -> anyhow::Result> + pub fn new<'a, 'b, T>(input: &'b str) -> anyhow::Result>> where T: TokenSet<'a> + 'a, { - let regex_map = T::try_into()?; - - let regex_set = regex_map.iter().map(|(_, token)| T::to_regex(&token)).collect::>(); - let regex_set = RegexSet::new(regex_set)?; - - let regex_istr = Regex::new(T::ignore_str())?; - - Ok(LexDriver::<'a, 'b, T>::new(regex_set, regex_map, regex_istr, input)) + LexDriver::<'a, 'b, T>::try_from(input) } } -pub trait LexIterator<'a, 'b, T: TokenSet<'a> + 'a> -where - Self: Iterator>, -{ - fn pos(&self) -> (u32, u32); - fn remain(&self) -> Option<&'b str>; -} - struct LexDriver<'a, 'b, T: TokenSet<'a>> { // Regex regex_set: RegexSet, @@ -71,34 +56,26 @@ struct LexDriver<'a, 'b, T: TokenSet<'a>> { tokenset: PhantomData<&'a T>, } -impl<'a, 'b, T: TokenSet<'a>> LexDriver<'a, 'b, T> { - fn new( - regex_set: RegexSet, - regex_map: Vec<(Regex, T)>, - regex_istr: Regex, - input: &'b str, - ) -> Self { - LexDriver { +impl<'a, 'b, T: TokenSet<'a>> TryFrom<&'b str> for LexDriver<'a, 'b, T> { + type Error = anyhow::Error; + + fn try_from(input: &'b str) -> anyhow::Result { + let regex_map = T::try_into()?; + let regex_set = regex_map + .iter() + .map(|(_, token)| T::to_regex(&token)) + .collect::>(); + let regex_set = RegexSet::new(regex_set)?; + let regex_istr = Regex::new(T::ignore_str())?; + + Ok(LexDriver { regex_set, regex_map, regex_istr, input, pos: (0, 0), tokenset: PhantomData, - } - } -} - -impl<'a, 'b, T: TokenSet<'a> + 'a> LexIterator<'a, 'b, T> for LexDriver<'a, 'b, T> { - fn pos(&self) -> (u32, u32) { - self.pos - } - - fn remain(&self) -> Option<&'b str> { - match self.input { - "" => None, - s => Some(s), - } + }) } } diff --git a/crates/core/src/parse.rs b/crates/core/src/parse.rs index dbe86c6..5730f23 100644 --- a/crates/core/src/parse.rs +++ b/crates/core/src/parse.rs @@ -1,5 +1,5 @@ use crate::cfg::{TokenSet, Syntax}; -use super::lex::LexIterator; +use crate::lex::Token; pub trait ParserImpl<'a> where @@ -12,6 +12,6 @@ where fn setup() -> anyhow::Result; fn parse<'b>( &self, - lexer: impl LexIterator<'a, 'b, Self::TokenSet> + lexer: impl Iterator>, ) -> anyhow::Result; } From 4d2e4ed4041320ab4aaf8919a611ff710fb2a7be Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Fri, 17 May 2024 22:21:53 +0900 Subject: [PATCH 02/15] =?UTF-8?q?[change]=20lex::Token=20=E3=81=AE?= =?UTF-8?q?=E6=96=87=E5=AD=97=E5=88=97=E6=83=85=E5=A0=B1=E3=81=AE=E6=8C=81?= =?UTF-8?q?=E3=81=A1=E6=96=B9=E3=82=92=E5=A4=89=E3=81=88=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/core/src/lex.rs | 87 +++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 52 deletions(-) diff --git a/crates/core/src/lex.rs b/crates/core/src/lex.rs index d0d2d9f..44b2484 100644 --- a/crates/core/src/lex.rs +++ b/crates/core/src/lex.rs @@ -7,27 +7,24 @@ use crate::cfg::TokenSet; #[derive(Debug, Copy, Clone)] pub struct Token<'a, 'b, T: TokenSet<'a>> { pub kind: T, - pub pos: (u32, u32), - orig_txt: &'b str, + pub src: &'b str, + pub range: (usize, usize), tokenset: PhantomData<&'a T>, } impl<'a, 'b, T: TokenSet<'a>> Token<'a, 'b, T> { - pub fn new(kind: T, orig_txt: &'b str, pos: (u32, u32)) -> Self { + pub fn new(kind: T, src: &'b str, range: (usize, usize)) -> Self { Token { kind, - pos, - orig_txt, + src, + range, tokenset: PhantomData, } } pub fn as_str(&self) -> &'b str { - self.orig_txt - } - - pub fn to_string(&self) -> String { - self.orig_txt.to_string() + let (l, r) = self.range; + &self.src[l..r] } } @@ -50,7 +47,7 @@ struct LexDriver<'a, 'b, T: TokenSet<'a>> { // State input: &'b str, - pos: (u32, u32), + pos: usize, // PhantomData tokenset: PhantomData<&'a T>, @@ -73,7 +70,7 @@ impl<'a, 'b, T: TokenSet<'a>> TryFrom<&'b str> for LexDriver<'a, 'b, T> { regex_map, regex_istr, input, - pos: (0, 0), + pos: 0, tokenset: PhantomData, }) } @@ -83,47 +80,31 @@ impl<'a, 'b, T: TokenSet<'a> + 'a> Iterator for LexDriver<'a, 'b, T> { type Item = Token<'a, 'b, T>; fn next(&mut self) -> Option { - // Skip spaces - if let Some(acc_s) = self.regex_istr.find(self.input) { - self.update_state(acc_s.as_str()); - } + // Skip Spaces + let remain = match self.regex_istr.find(&self.input[self.pos..]) { + Some(acc_s) => { + self.pos += acc_s.len(); + &self.input[self.pos..] + } + None => &self.input[self.pos..] + }; // Find the token let mut matches = self .regex_set - .matches(self.input) + .matches(remain) .into_iter() .map(|idx| &self.regex_map[idx]) - .map(|(regex, token)| (*token, regex.find(self.input).unwrap().as_str())) + .map(|(regex, token)| (*token, regex.find(remain).unwrap().as_str())) .collect::>(); matches.sort_by(|(_, a), (_, b)| a.len().cmp(&b.len())); // Update myself let (token, acc_s) = matches.first()?; - let pos = self.pos; - self.update_state(acc_s); - - Some(Token::new(*token, acc_s, pos)) - } -} - -impl<'a, 'b, T: TokenSet<'a>> LexDriver<'a, 'b, T> { - fn update_state(&mut self, acc_s: &str) { - let (mut rows, mut cols) = self.pos; - for c in acc_s.chars() { - match c { - '\n' => { - rows += 1; - cols = 0; - } - _ => { - cols += 1; - } - } - } + let range = (self.pos, self.pos + acc_s.len()); + self.pos += acc_s.len(); - self.input = &self.input[acc_s.len()..]; - self.pos = (rows, cols); + Some(Token::new(*token, &self.input, range)) } } @@ -158,22 +139,24 @@ mod test { } fn check<'a, 'b>( - expected: &Vec<(TestToken, &'b str, (u32, u32))>, + expected: &Vec<(TestToken, &'b str, (usize, usize))>, input: &'b str, ) -> bool { Lexer::new::(input) .unwrap() .into_iter() .zip(expected.iter()) - .all(|(a, b)| a.kind == b.0 && a.pos == b.2 && a.orig_txt == b.1) + .all(|(a, b)| { + a.kind == b.0 && a.range == b.2 && a.as_str() == b.1 + }) } #[test] fn input_ok_1() { let expected = vec![ - (TestToken::Num, "10", (0, 0)), - (TestToken::Plus, "+", (0, 2)), - (TestToken::Num, "20", (0, 3)), + (TestToken::Num, "10", (0, 2)), + (TestToken::Plus, "+", (2, 3)), + (TestToken::Num, "20", (3, 5)), ]; let input = "10+20"; assert!(check(&expected, input)); @@ -182,9 +165,9 @@ mod test { #[test] fn input_ok_2() { let expected = vec![ - (TestToken::Num, "10", (0, 12)), - (TestToken::Plus, "+", (0, 15)), - (TestToken::Num, "20", (1, 6)), + (TestToken::Num, "10", (12, 14)), + (TestToken::Plus, "+", (15, 16)), + (TestToken::Num, "20", (23, 25)), ]; let input = " 10 +\n 20 "; assert!(check(&expected, input)); @@ -193,9 +176,9 @@ mod test { #[test] fn input_ok_3() { let expected = vec![ - (TestToken::Num, "10", (0, 12)), - (TestToken::Plus, "+", (0, 15)), - (TestToken::Num, "20", (1, 6)), + (TestToken::Num, "10", (12, 14)), + (TestToken::Plus, "+", (15, 16)), + (TestToken::Num, "20", (23, 25)), ]; let input = " 10 +\n 20ffff30 - 40 * 50"; assert!(check(&expected, input)); From a227129ee599da7e27b922ecd4148acf1aeb278b Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Fri, 17 May 2024 23:05:14 +0900 Subject: [PATCH 03/15] =?UTF-8?q?[fix]=20core,=20algorithm=20=E3=81=AE?= =?UTF-8?q?=E3=82=A4=E3=83=B3=E3=83=9D=E3=83=BC=E3=83=88=E5=90=8D=E3=82=92?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.toml | 10 +++++----- crates/algorithm_lr1/Cargo.toml | 2 +- crates/algorithm_lr1/src/builder.rs | 2 +- crates/algorithm_lr1/src/driver.rs | 4 ++-- crates/algorithm_lr1/src/lib.rs | 10 +++++----- crates/core/Cargo.toml | 4 ++-- crates/core/src/cfg.rs | 2 +- src/lib.rs | 4 ++-- tests/derive.rs | 2 +- tests/serde.rs | 6 +++--- 10 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index aeaa347..568e25c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,8 +6,8 @@ edition = "2021" [dependencies] anyhow = { workspace = true } thiserror = { workspace = true } -core = { workspace = true } -algorithm = { workspace = true } +pgen_core = { workspace = true } +pgen_algorithm = { workspace = true } [dev-dependencies] serde = { workspace = true } @@ -15,7 +15,7 @@ serde_json = "1.0.117" [features] default = [] -derive = ["core/derive"] +derive = ["pgen_core/derive"] [workspace] resolver = "2" @@ -31,5 +31,5 @@ thiserror = "1.0.58" serde = "1.0.197" regex = "1.10.4" regex-macro = "0.2.0" -core = { path = "./crates/core" } -algorithm = { path = "./crates/algorithm" } +pgen_core = { package = "core", path = "./crates/core" } +pgen_algorithm = { package = "algorithm", path = "./crates/algorithm" } diff --git a/crates/algorithm_lr1/Cargo.toml b/crates/algorithm_lr1/Cargo.toml index 2511561..20d03f9 100644 --- a/crates/algorithm_lr1/Cargo.toml +++ b/crates/algorithm_lr1/Cargo.toml @@ -8,4 +8,4 @@ anyhow = { workspace = true } thiserror = { workspace = true } serde = { workspace = true, features = ["derive"] } itertools = "0.12.1" -core = { path = "../core", features = ["derive"] } +pgen_core = { package = "core", path = "../core", features = ["derive"] } diff --git a/crates/algorithm_lr1/src/builder.rs b/crates/algorithm_lr1/src/builder.rs index 2b2fada..5635f8f 100644 --- a/crates/algorithm_lr1/src/builder.rs +++ b/crates/algorithm_lr1/src/builder.rs @@ -5,7 +5,7 @@ use std::marker::PhantomData; use serde::{Serialize, Deserialize}; use itertools::Itertools; -use core::cfg::{TokenSet, Syntax, Rule, RuleElem, RuleSet}; +use pgen_core::cfg::{TokenSet, Syntax, Rule, RuleElem, RuleSet}; #[derive(Debug, Serialize, Deserialize)] pub(super) enum LRAction { diff --git a/crates/algorithm_lr1/src/driver.rs b/crates/algorithm_lr1/src/driver.rs index 0281c2f..e3de5ad 100644 --- a/crates/algorithm_lr1/src/driver.rs +++ b/crates/algorithm_lr1/src/driver.rs @@ -1,5 +1,5 @@ -use core::cfg::{TokenSet, Syntax}; -use core::lex::Token; +use pgen_core::cfg::{TokenSet, Syntax}; +use pgen_core::lex::Token; use super::builder::{LRAction, LR1Configure}; diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index 1df9737..14c7532 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -3,9 +3,9 @@ mod driver; use serde::{Serialize, Deserialize}; -use core::cfg::{TokenSet, Syntax}; -use core::lex::Token; -use core::parse::ParserImpl; +use pgen_core::cfg::{TokenSet, Syntax}; +use pgen_core::lex::Token; +use pgen_core::parse::ParserImpl; use builder::LR1Configure; use driver::LR1Driver; @@ -39,8 +39,8 @@ where #[cfg(test)] mod test { - use core::cfg::{TokenSet, Syntax, Rule, RuleElem}; - use core::Parser; + use pgen_core::cfg::{TokenSet, Syntax, Rule, RuleElem}; + use pgen_core::Parser; use super::LR1; diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 54256b7..2148cba 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -8,8 +8,8 @@ anyhow = { workspace = true } thiserror = { workspace = true } serde = { workspace = true, features = ["derive"]} regex = { workspace = true } -derive = { package = "core_derive", path = "../core_derive", optional = true } +pgen_core_derive = { package = "core_derive", path = "../core_derive", optional = true } [features] default = [] -derive = ["dep:derive"] +derive = ["dep:pgen_core_derive"] diff --git a/crates/core/src/cfg.rs b/crates/core/src/cfg.rs index 458da47..c967c91 100644 --- a/crates/core/src/cfg.rs +++ b/crates/core/src/cfg.rs @@ -2,7 +2,7 @@ mod token; mod syntax; #[cfg(feature = "derive")] -pub use derive::{TokenSet, Syntax}; +pub use pgen_core_derive::{TokenSet, Syntax}; pub use token::TokenSet; pub use syntax::{Syntax, Rule, RuleElem, RuleSet}; diff --git a/src/lib.rs b/src/lib.rs index 0f77213..f7f0c81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,2 @@ -pub use core::*; -pub use algorithm; +pub use pgen_core::*; +pub use pgen_algorithm as algorithm; diff --git a/tests/derive.rs b/tests/derive.rs index af6d4be..81e8db1 100644 --- a/tests/derive.rs +++ b/tests/derive.rs @@ -1,4 +1,4 @@ -use core::cfg::*; +use parsergen::cfg::*; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, TokenSet)] enum TestTokenSet { diff --git a/tests/serde.rs b/tests/serde.rs index 4faa3ed..24ddfba 100644 --- a/tests/serde.rs +++ b/tests/serde.rs @@ -1,8 +1,8 @@ use serde::{Serialize, Deserialize}; -use core::cfg::*; -use core::Parser; -use algorithm::LR1; +use parsergen::algorithm::LR1; +use parsergen::cfg::*; +use parsergen::Parser; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize, Deserialize, TokenSet)] enum TestTokenSet { From 5a77edfc16dbc481eb351eec47f0fd0c0d5ea923 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Fri, 17 May 2024 23:09:58 +0900 Subject: [PATCH 04/15] =?UTF-8?q?[update]=20Syntax,=20TokenSet=20=E3=81=8C?= =?UTF-8?q?=20Debug=20=E3=82=92=E8=A6=81=E6=B1=82=E3=81=99=E3=82=8B?= =?UTF-8?q?=E3=82=88=E3=81=86=E3=81=AB=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/core/src/cfg/syntax.rs | 3 ++- crates/core/src/cfg/token.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs index b77243b..0b631ab 100644 --- a/crates/core/src/cfg/syntax.rs +++ b/crates/core/src/cfg/syntax.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::fmt::Debug; use std::hash::Hash; use std::marker::PhantomData; @@ -6,7 +7,7 @@ use super::token::TokenSet; pub trait Syntax<'a> where - Self: Clone + Copy + Sized, + Self: Debug + Clone + Copy + Sized, { type TokenSet: TokenSet<'a>; diff --git a/crates/core/src/cfg/token.rs b/crates/core/src/cfg/token.rs index 89c77a1..7eb1032 100644 --- a/crates/core/src/cfg/token.rs +++ b/crates/core/src/cfg/token.rs @@ -1,10 +1,11 @@ +use std::fmt::Debug; use std::hash::Hash; use regex::Regex; pub trait TokenSet<'a> where - Self: Copy + Clone + Hash + Eq, + Self: Debug + Copy + Clone + Hash + Eq, { fn ignore_str() -> &'a str; fn enum_iter() -> impl Iterator; From 1f963cd0660be64a78b663be28774192af3bf1e0 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Fri, 17 May 2024 23:56:12 +0900 Subject: [PATCH 05/15] =?UTF-8?q?[update]=20ParseError=20=E3=81=AE?= =?UTF-8?q?=E8=A8=AD=E8=A8=88=E3=82=92=E5=A4=89=E6=9B=B4=20&=20pretty=5Fpr?= =?UTF-8?q?int=20=E3=81=A7=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB?= =?UTF-8?q?=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/driver.rs | 28 +++++++------- crates/algorithm_lr1/src/error.rs | 27 +++++++++++++ crates/algorithm_lr1/src/lib.rs | 1 + crates/core/src/error.rs | 62 ++++++++++++++++++++++++++++-- examples/expr.rs | 8 +++- 5 files changed, 107 insertions(+), 19 deletions(-) create mode 100644 crates/algorithm_lr1/src/error.rs diff --git a/crates/algorithm_lr1/src/driver.rs b/crates/algorithm_lr1/src/driver.rs index e3de5ad..a7752a6 100644 --- a/crates/algorithm_lr1/src/driver.rs +++ b/crates/algorithm_lr1/src/driver.rs @@ -1,7 +1,8 @@ use pgen_core::cfg::{TokenSet, Syntax}; use pgen_core::lex::Token; -use super::builder::{LRAction, LR1Configure}; +use crate::error::ParseError; +use crate::builder::{LRAction, LR1Configure}; pub(super) struct LR1Driver<'a, 'b, T, S> (&'b LR1Configure<'a, T, S>) where @@ -29,32 +30,31 @@ where let action = match input { Some(token) => ( self.0.action_table[top].get(&token.kind).unwrap(), - Some(token.as_str()), + Some(token), ), None => ( &self.0.eof_action_table[top], None ), }; - match action.0 { - LRAction::Shift(new_state) => { + match action { + (LRAction::Shift(new_state), _) => { stack.push(*new_state); break; } - LRAction::Reduce(_, goto, elems_cnt) => { + (LRAction::Reduce(_, goto, elems_cnt), _) => { stack.truncate(stack.len() - elems_cnt); stack.push(self.0.goto_table[stack[stack.len() - 1]][*goto]); } - LRAction::None => { - // let pos = lexer.pos(); - // let pos = match action.1 { - // Some(raw) => (pos.0, pos.1 - (raw.len() as u32)), - // None => pos, - // }; - // return Err(anyhow::anyhow!("Error at {:?}", pos).into()); - return Err(anyhow::anyhow!("Error",).into()); + (LRAction::Accept, _) => { + return Ok(()); + } + (LRAction::None, Some(token)) => { + return Err(ParseError::new_unexpected_token(token).into()); + } + (LRAction::None, None) => { + return Err(ParseError::UnexpectedEOF.into()); } - LRAction::Accept => return Ok(()), } } } diff --git a/crates/algorithm_lr1/src/error.rs b/crates/algorithm_lr1/src/error.rs new file mode 100644 index 0000000..025eb7e --- /dev/null +++ b/crates/algorithm_lr1/src/error.rs @@ -0,0 +1,27 @@ +use thiserror::Error; + +use pgen_core::error::ParseError as SuperParseError; +use pgen_core::cfg::TokenSet; +use pgen_core::lex::Token; + +#[derive(Debug, Error)] +pub enum ParseError { + #[error("Unexpected token {actual:?} found")] + UnexpectedToken { + actual: String, + }, + #[error("Unexpected EOF")] + UnexpectedEOF, +} + +impl ParseError { + pub fn new_unexpected_token<'a, T>(expected: Token<'a, '_, T>) -> SuperParseError + where + T: TokenSet<'a>, + { + let err = ParseError::UnexpectedToken { + actual: format!("{:?}", expected.kind), + }; + SuperParseError::from(err).with(expected) + } +} diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index 14c7532..a5db5a4 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -1,3 +1,4 @@ +mod error; mod builder; mod driver; diff --git a/crates/core/src/error.rs b/crates/core/src/error.rs index eb5b6a4..7f559fc 100644 --- a/crates/core/src/error.rs +++ b/crates/core/src/error.rs @@ -1,17 +1,22 @@ +use std::cmp::{max, min}; use std::error::Error as StdError; use std::fmt::Display; use thiserror::Error; +use crate::cfg::TokenSet; +use crate::lex::Token; + #[derive(Debug, Error)] pub struct ParseError { err: Box, - pos: Option<(u32, u32)>, + src: Option, + pos: Option<(usize, usize)>, } impl Display for ParseError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} at {:?}", self.err, self.pos) + write!(f, "{}", self.err) } } @@ -22,14 +27,63 @@ impl ParseError { { ParseError { err: Box::new(err), + src: None, pos: None, } } - pub fn with(self, pos: Option<(u32, u32)>) -> ParseError { + pub fn with<'a, T: TokenSet<'a>>(self, token: Token<'a, '_, T>) -> ParseError { + let mut sum = 0; + let (mut rows, mut cols) = (1, 1); + for c in token.src.chars() { + if token.range.0 <= sum { + break; + } + sum += c.len_utf8(); + + match c { + '\n' => { + rows += 1; + cols = 1; + } + _ => { + cols += 1; + } + } + } + ParseError { err: self.err, - pos, + src: Some(token.src.to_string()), + pos: Some((rows, cols)), + } + } + + pub fn pretty_print(&self) { + let pretty_printer = |input: &str, pos: (usize, usize)| { + eprintln!("-----"); + + let (row, col) = (pos.0 as i32 - 1, pos.1 as i32 - 1); + let lines = input.split('\n'); + let neighbor_lines = lines + .skip(max(0, row - 2) as usize) + .take(min(row + 1, 3) as usize); + + neighbor_lines.enumerate().for_each(|(idx, line)| { + let row = max(1, row - 1) + (idx as i32); + println!("{:2}: {}", row, line); + }); + + eprintln!(" {}^ here", " ".repeat(col as usize)); + eprintln!("Error at line {}, column {}.", row + 1, col + 1); + eprintln!("-----\n"); + }; + + match (&self.src, self.pos) { + (Some(src), Some(pos)) => { + pretty_printer(&src, pos); + } + _ => {}, } } } diff --git a/examples/expr.rs b/examples/expr.rs index 7b0b3cc..6ae7d5d 100644 --- a/examples/expr.rs +++ b/examples/expr.rs @@ -2,6 +2,7 @@ use std::io::stdin; use parsergen::algorithm::LR1; use parsergen::cfg::*; +use parsergen::error::ParseError; use parsergen::Parser; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, TokenSet)] @@ -52,7 +53,12 @@ fn main() -> anyhow::Result<()> { match ExprParser::new()?.parse(&input) { Ok(_) => println!("Accepted"), - Err(e) => println!("Rejected: {}", e), + Err(e) => { + if let Some(e) = e.downcast_ref::() { + e.pretty_print(); + } + println!("Rejected : {}", e); + } }; Ok(()) From b250277e474fc39d06dae39cfcc34f3f2c8f25d5 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Sat, 18 May 2024 00:01:16 +0900 Subject: [PATCH 06/15] =?UTF-8?q?[clean]=20TODO=20=E3=82=B3=E3=83=A1?= =?UTF-8?q?=E3=83=B3=E3=83=88=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/core/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 9d030ee..7743267 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -1,7 +1,7 @@ pub mod cfg; pub mod error; pub mod parse; -pub mod lex; // TODO : private +pub mod lex; use std::marker::PhantomData; From 905ac0802baa9bb05fef9cb050fb2c29eb5013da Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Mon, 20 May 2024 17:44:14 +0900 Subject: [PATCH 07/15] =?UTF-8?q?[update]=20Syntax::into=20=E2=86=92=20Syn?= =?UTF-8?q?tax::into=5Fruleset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/builder.rs | 6 +----- crates/core/src/cfg/syntax.rs | 19 ++++++++----------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/crates/algorithm_lr1/src/builder.rs b/crates/algorithm_lr1/src/builder.rs index 5635f8f..2fde8a6 100644 --- a/crates/algorithm_lr1/src/builder.rs +++ b/crates/algorithm_lr1/src/builder.rs @@ -37,11 +37,7 @@ where { pub fn setup() -> anyhow::Result { // 1. Pre-process - let rules = S::try_into()? - .into_iter() - .map(|(rule, _)| rule) - .collect::>(); - let ruleset = RuleSet::from(rules); + let ruleset = S::into_ruleset(); let first_set = ruleset.first_set(); // 2. Generate dummy nonterm diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs index 0b631ab..cccde0d 100644 --- a/crates/core/src/cfg/syntax.rs +++ b/crates/core/src/cfg/syntax.rs @@ -14,10 +14,12 @@ where fn enum_iter() -> impl Iterator; fn to_rule(&self) -> Rule<'a, Self::TokenSet>; - fn try_into() -> anyhow::Result, Self)>> { - Self::enum_iter() - .map(|elem| Ok((Self::to_rule(&elem), elem))) - .collect::>>() + fn into_ruleset() -> RuleSet<'a, Self::TokenSet> { + let rules = Self::enum_iter() + .map(|elem| Self::to_rule(&elem)) + .collect::>(); + + RuleSet::from(rules) } } @@ -228,7 +230,7 @@ impl<'a, T: TokenSet<'a>> RuleSet<'a, T> { mod test { use std::collections::HashMap; - use super::{TokenSet, Syntax, Rule, RuleElem, RuleSet}; + use super::{TokenSet, Syntax, Rule, RuleElem}; #[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)] enum TestToken { @@ -399,12 +401,7 @@ mod test { #[test] fn first_set() { - let rules = ::try_into() - .unwrap() - .into_iter() - .map(|(rule, _)| rule) - .collect::>(); - let ruleset = RuleSet::from(rules); + let ruleset = ::into_ruleset(); let first_set = ruleset.first_set(); check( From 9c851f2dd5c52810c2f3d213951b483201246483 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Mon, 20 May 2024 17:45:02 +0900 Subject: [PATCH 08/15] =?UTF-8?q?[change]=20Syntax=20=E3=81=AE=20Sized=20?= =?UTF-8?q?=E5=88=B6=E7=B4=84=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/core/src/cfg/syntax.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs index cccde0d..28aebad 100644 --- a/crates/core/src/cfg/syntax.rs +++ b/crates/core/src/cfg/syntax.rs @@ -7,7 +7,7 @@ use super::token::TokenSet; pub trait Syntax<'a> where - Self: Debug + Clone + Copy + Sized, + Self: Debug + Clone + Copy, { type TokenSet: TokenSet<'a>; From 75dd0c25a5cee397ef9acd0fc54168d1fe96be39 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Mon, 20 May 2024 17:51:35 +0900 Subject: [PATCH 09/15] =?UTF-8?q?[change]=20Syntax=20=E3=83=88=E3=83=AC?= =?UTF-8?q?=E3=82=A4=E3=83=88=E3=81=8C=E6=8C=81=E3=81=A4=E3=83=A1=E3=82=BD?= =?UTF-8?q?=E3=83=83=E3=83=89=E3=81=AE=E5=91=BD=E5=90=8D=E3=82=92=E5=A4=89?= =?UTF-8?q?=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/builder.rs | 2 +- crates/core/src/cfg/syntax.rs | 12 ++++++------ crates/core_derive/src/impl/syntax.rs | 4 ++-- tests/derive.rs | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/algorithm_lr1/src/builder.rs b/crates/algorithm_lr1/src/builder.rs index 2fde8a6..d6d56e0 100644 --- a/crates/algorithm_lr1/src/builder.rs +++ b/crates/algorithm_lr1/src/builder.rs @@ -81,7 +81,7 @@ where } // 5. Setup tables - let rule_table: Vec = S::enum_iter().collect(); + let rule_table: Vec = S::into_iter().collect(); for lritem_set in &dfa.0 { for (token, next) in &lritem_set.next { match &token { diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs index 28aebad..42c1a71 100644 --- a/crates/core/src/cfg/syntax.rs +++ b/crates/core/src/cfg/syntax.rs @@ -11,12 +11,12 @@ where { type TokenSet: TokenSet<'a>; - fn enum_iter() -> impl Iterator; - fn to_rule(&self) -> Rule<'a, Self::TokenSet>; + fn into_iter() -> impl Iterator; + fn into_rule(&self) -> Rule<'a, Self::TokenSet>; fn into_ruleset() -> RuleSet<'a, Self::TokenSet> { - let rules = Self::enum_iter() - .map(|elem| Self::to_rule(&elem)) + let rules = Self::into_iter() + .map(|elem| Self::into_rule(&elem)) .collect::>(); RuleSet::from(rules) @@ -291,7 +291,7 @@ mod test { impl<'a> Syntax<'a> for TestSyntax { type TokenSet = TestToken; - fn enum_iter() -> impl Iterator { + fn into_iter() -> impl Iterator { Box::new( vec![ TestSyntax::ExprPlus, @@ -307,7 +307,7 @@ mod test { ) } - fn to_rule(&self) -> Rule<'a, Self::TokenSet> { + fn into_rule(&self) -> Rule<'a, Self::TokenSet> { let expr_plus = Rule::from(( RuleElem::new_nonterm("expr"), vec![ diff --git a/crates/core_derive/src/impl/syntax.rs b/crates/core_derive/src/impl/syntax.rs index b1e8574..e7346ce 100644 --- a/crates/core_derive/src/impl/syntax.rs +++ b/crates/core_derive/src/impl/syntax.rs @@ -31,13 +31,13 @@ pub fn syntax_proc_macro_impl(ast: DeriveInput) -> TokenStream { impl<'a> Syntax<'a> for #enum_name { type TokenSet = #enum_assoc_type; - fn enum_iter() -> impl Iterator { + fn into_iter() -> impl Iterator { vec![ #( #enum_variants, )* ].into_iter() } - fn to_rule(&self) -> Rule<'a, Self::TokenSet> { + fn into_rule(&self) -> Rule<'a, Self::TokenSet> { match self { #( #enum_rule_table, )* _ => unimplemented!(), diff --git a/tests/derive.rs b/tests/derive.rs index 81e8db1..553ccc4 100644 --- a/tests/derive.rs +++ b/tests/derive.rs @@ -43,5 +43,5 @@ enum TestSyntax { #[test] fn check_compile() { let _ = TestTokenSet::to_regex(&self::TestTokenSet::Plus); - let _ = TestSyntax::to_rule(&self::TestSyntax::ExprPlus); + let _ = TestSyntax::into_rule(&self::TestSyntax::ExprPlus); } From 78d215b3a66b5c8ba403eee2d6aaf76e2c514caa Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Mon, 20 May 2024 20:07:32 +0900 Subject: [PATCH 10/15] =?UTF-8?q?[change]=20TokenSet=20=E3=81=8C=E6=8C=81?= =?UTF-8?q?=E3=81=A4=E3=83=A1=E3=82=BD=E3=83=83=E3=83=89=E3=81=AE=E5=91=BD?= =?UTF-8?q?=E5=90=8D=E3=82=92=20Syntax=20=E3=81=AB=E5=90=88=E3=82=8F?= =?UTF-8?q?=E3=81=9B=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/builder.rs | 2 +- crates/core/src/cfg/syntax.rs | 4 ++-- crates/core/src/cfg/token.rs | 20 +++++++++++++------- crates/core/src/lex.rs | 20 +++++++++----------- crates/core_derive/src/impl/tokenset.rs | 4 ++-- tests/derive.rs | 2 +- 6 files changed, 28 insertions(+), 24 deletions(-) diff --git a/crates/algorithm_lr1/src/builder.rs b/crates/algorithm_lr1/src/builder.rs index d6d56e0..b4f44e9 100644 --- a/crates/algorithm_lr1/src/builder.rs +++ b/crates/algorithm_lr1/src/builder.rs @@ -72,7 +72,7 @@ where let mut goto_table: Vec> = Vec::with_capacity(dfa.0.len()); for _ in 0..dfa.0.len() { action_table.push(HashMap::from_iter( - T::enum_iter() + T::into_iter() .map(|token| (token, LRAction::None)) .collect::)>>(), )); diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs index 42c1a71..f6bc21b 100644 --- a/crates/core/src/cfg/syntax.rs +++ b/crates/core/src/cfg/syntax.rs @@ -244,7 +244,7 @@ mod test { } impl TokenSet<'_> for TestToken { - fn enum_iter() -> impl Iterator { + fn into_iter() -> impl Iterator { Box::new( vec![ TestToken::Num, @@ -259,7 +259,7 @@ mod test { ) } - fn to_regex(&self) -> &'static str { + fn into_regex_str(&self) -> &'static str { match self { TestToken::Num => r"^[1-9][0-9]*", TestToken::Plus => r"^\+", diff --git a/crates/core/src/cfg/token.rs b/crates/core/src/cfg/token.rs index 7eb1032..e3a6d3f 100644 --- a/crates/core/src/cfg/token.rs +++ b/crates/core/src/cfg/token.rs @@ -1,19 +1,25 @@ use std::fmt::Debug; use std::hash::Hash; -use regex::Regex; +use regex::{Regex, RegexSet}; pub trait TokenSet<'a> where Self: Debug + Copy + Clone + Hash + Eq, { fn ignore_str() -> &'a str; - fn enum_iter() -> impl Iterator; - fn to_regex(&self) -> &'a str; + fn into_iter() -> impl Iterator; + fn into_regex_str(&self) -> &'a str; - fn try_into() -> anyhow::Result> { - Self::enum_iter() - .map(|token| Ok((Regex::new(Self::to_regex(&token))?, token))) - .collect::>>() + fn into_regex(&self) -> anyhow::Result { + Ok(Regex::new(self.into_regex_str())?) + } + + fn try_into_regexset() -> anyhow::Result { + let regex_set = Self::into_iter() + .map(|token| Self::into_regex_str(&token)) + .collect::>(); + + Ok(RegexSet::new(regex_set)?) } } diff --git a/crates/core/src/lex.rs b/crates/core/src/lex.rs index 44b2484..240da9f 100644 --- a/crates/core/src/lex.rs +++ b/crates/core/src/lex.rs @@ -41,9 +41,9 @@ impl Lexer { struct LexDriver<'a, 'b, T: TokenSet<'a>> { // Regex + regex_istr: Regex, regex_set: RegexSet, regex_map: Vec<(Regex, T)>, - regex_istr: Regex, // State input: &'b str, @@ -57,18 +57,16 @@ impl<'a, 'b, T: TokenSet<'a>> TryFrom<&'b str> for LexDriver<'a, 'b, T> { type Error = anyhow::Error; fn try_from(input: &'b str) -> anyhow::Result { - let regex_map = T::try_into()?; - let regex_set = regex_map - .iter() - .map(|(_, token)| T::to_regex(&token)) - .collect::>(); - let regex_set = RegexSet::new(regex_set)?; let regex_istr = Regex::new(T::ignore_str())?; + let regex_set = T::try_into_regexset()?; + let regex_map = T::into_iter() + .map(|token| Ok((token.into_regex()?, token))) + .collect::>>()?; Ok(LexDriver { + regex_istr, regex_set, regex_map, - regex_istr, input, pos: 0, tokenset: PhantomData, @@ -126,11 +124,11 @@ mod test { r"^[ \t\n]+" } - fn enum_iter() -> Box> { - Box::new(vec![TestToken::Num, TestToken::Plus].into_iter()) + fn into_iter() -> impl Iterator { + vec![TestToken::Num, TestToken::Plus].into_iter() } - fn to_regex(&self) -> &'static str { + fn into_regex_str(&self) -> &'static str { match self { TestToken::Num => r"^[1-9][0-9]*", TestToken::Plus => r"^\+", diff --git a/crates/core_derive/src/impl/tokenset.rs b/crates/core_derive/src/impl/tokenset.rs index 84bcef3..1cd4df7 100644 --- a/crates/core_derive/src/impl/tokenset.rs +++ b/crates/core_derive/src/impl/tokenset.rs @@ -36,13 +36,13 @@ pub fn proc_macro_impl(ast: DeriveInput) -> TokenStream { #enum_ignored } - fn enum_iter() -> impl Iterator { + fn into_iter() -> impl Iterator { vec![ #( #enum_variants, )* ].into_iter() } - fn to_regex(&self) -> &'static str { + fn into_regex_str(&self) -> &'static str { match self { #( #enum_regex_table, )* _ => unimplemented!(), diff --git a/tests/derive.rs b/tests/derive.rs index 553ccc4..364ac9b 100644 --- a/tests/derive.rs +++ b/tests/derive.rs @@ -42,6 +42,6 @@ enum TestSyntax { #[test] fn check_compile() { - let _ = TestTokenSet::to_regex(&self::TestTokenSet::Plus); + let _ = TestTokenSet::into_regex(&self::TestTokenSet::Plus); let _ = TestSyntax::into_rule(&self::TestSyntax::ExprPlus); } From d6b34e8ae82719e3db69044c55b07ac72044eeba Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Mon, 20 May 2024 22:59:58 +0900 Subject: [PATCH 11/15] =?UTF-8?q?[update]=201=E3=81=A4=E3=81=AE=E5=88=97?= =?UTF-8?q?=E6=8C=99=E5=AD=90=E3=81=AB=E5=AF=BE=E3=81=97=E3=81=A6=E8=A4=87?= =?UTF-8?q?=E6=95=B0=E3=81=AE=20#rule=20=E3=82=92=E6=8C=87=E5=AE=9A?= =?UTF-8?q?=E3=81=A7=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E3=81=97?= =?UTF-8?q?=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/builder.rs | 6 ++--- crates/algorithm_lr1/src/lib.rs | 9 ++----- crates/core/src/cfg/syntax.rs | 35 +++++++++++++++------------ crates/core_derive/src/impl/syntax.rs | 18 ++++++++------ examples/expr.rs | 9 ++----- tests/derive.rs | 11 +++------ tests/serde.rs | 9 ++----- 7 files changed, 41 insertions(+), 56 deletions(-) diff --git a/crates/algorithm_lr1/src/builder.rs b/crates/algorithm_lr1/src/builder.rs index b4f44e9..488b96f 100644 --- a/crates/algorithm_lr1/src/builder.rs +++ b/crates/algorithm_lr1/src/builder.rs @@ -37,6 +37,7 @@ where { pub fn setup() -> anyhow::Result { // 1. Pre-process + let rules = S::into_iter().collect::>(); let ruleset = S::into_ruleset(); let first_set = ruleset.first_set(); @@ -81,7 +82,6 @@ where } // 5. Setup tables - let rule_table: Vec = S::into_iter().collect(); for lritem_set in &dfa.0 { for (token, next) in &lritem_set.next { match &token { @@ -109,7 +109,7 @@ where let id = lritem_set.id as usize; let label = action_table[id].get_mut(&t.0).unwrap(); *label = LRAction::Reduce( - rule_table[item.rule.id as usize], + rules[item.rule.id as usize], *nonterm_table.get(lhs).unwrap(), item.rule.rhs.len(), ); @@ -120,7 +120,7 @@ where LRAction::Accept } else { LRAction::Reduce( - rule_table[item.rule.id as usize], + rules[item.rule.id as usize], *nonterm_table.get(lhs).unwrap(), item.rule.rhs.len(), ) diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index a5db5a4..88cd782 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -68,19 +68,14 @@ mod test { #[derive(Debug, Clone, Copy, Syntax)] enum TestSyntax { #[rule(" ::= Plus ")] - ExprPlus, #[rule(" ::= Minus ")] - ExprMinus, #[rule(" ::= ")] - ExprTerm, + Expr, #[rule(" ::= Mul ")] - TermMul, #[rule(" ::= Div ")] - TermDiv, #[rule(" ::= ")] - TermNum, + Term, #[rule(" ::= BracketL BracketR")] - NestedNum, #[rule(" ::= Num")] Num, } diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs index f6bc21b..a41d085 100644 --- a/crates/core/src/cfg/syntax.rs +++ b/crates/core/src/cfg/syntax.rs @@ -12,11 +12,18 @@ where type TokenSet: TokenSet<'a>; fn into_iter() -> impl Iterator; - fn into_rule(&self) -> Rule<'a, Self::TokenSet>; + fn into_rules(&self) -> Vec>; fn into_ruleset() -> RuleSet<'a, Self::TokenSet> { let rules = Self::into_iter() - .map(|elem| Self::into_rule(&elem)) + .enumerate() + .flat_map(|(idx, elem)| { + let mut rules = Self::into_rules(&elem); + for rule in &mut rules { + rule.id = idx; + } + rules + }) .collect::>(); RuleSet::from(rules) @@ -110,16 +117,12 @@ pub struct RuleSet<'a, T: TokenSet<'a>> { } impl<'a, T: TokenSet<'a>> From>> for RuleSet<'a, T> { - fn from(mut rules: Vec>) -> Self { + fn from(rules: Vec>) -> Self { let top = match &rules[0].lhs { RuleElem::NonTerm(s) => s.clone(), _ => unreachable!(), }; - for (idx, rule) in rules.iter_mut().enumerate() { - rule.id = idx; - } - RuleSet { top, rules, @@ -307,7 +310,7 @@ mod test { ) } - fn into_rule(&self) -> Rule<'a, Self::TokenSet> { + fn into_rules(&self) -> Vec> { let expr_plus = Rule::from(( RuleElem::new_nonterm("expr"), vec![ @@ -366,14 +369,14 @@ mod test { let fact_2_num = Rule::from((RuleElem::new_nonterm("fact"), vec![])); match self { - TestSyntax::ExprPlus => expr_plus, - TestSyntax::ExprMinus => expr_minus, - TestSyntax::Expr2Term => expr_2_term, - TestSyntax::TermMul => term_mul, - TestSyntax::TermDiv => term_div, - TestSyntax::Term2Fact => term_2_fact, - TestSyntax::Fact2Expr => fact_2_expr, - TestSyntax::Fact2Num => fact_2_num, + TestSyntax::ExprPlus => vec![expr_plus], + TestSyntax::ExprMinus => vec![expr_minus], + TestSyntax::Expr2Term => vec![expr_2_term], + TestSyntax::TermMul => vec![term_mul], + TestSyntax::TermDiv => vec![term_div], + TestSyntax::Term2Fact => vec![term_2_fact], + TestSyntax::Fact2Expr => vec![fact_2_expr], + TestSyntax::Fact2Num => vec![fact_2_num], } } } diff --git a/crates/core_derive/src/impl/syntax.rs b/crates/core_derive/src/impl/syntax.rs index e7346ce..83324ce 100644 --- a/crates/core_derive/src/impl/syntax.rs +++ b/crates/core_derive/src/impl/syntax.rs @@ -37,7 +37,7 @@ pub fn syntax_proc_macro_impl(ast: DeriveInput) -> TokenStream { ].into_iter() } - fn into_rule(&self) -> Rule<'a, Self::TokenSet> { + fn into_rules(&self) -> Vec> { match self { #( #enum_rule_table, )* _ => unimplemented!(), @@ -50,23 +50,23 @@ pub fn syntax_proc_macro_impl(ast: DeriveInput) -> TokenStream { struct VariantInfo<'a> { parent_ident: &'a Ident, self_ident: &'a Ident, - rule: Option, + rules: Vec, } impl<'a> VariantInfo<'a> { fn parse(parent_ident: &'a Ident, variant: &'a Variant) -> VariantInfo<'a> { let self_ident = &variant.ident; - let mut rule = None; + let mut rules = vec![]; for attr in &variant.attrs { let attr = attr.parse_args::().unwrap().value(); - rule = Some(Self::parse_rule(&attr)); + rules.push(Self::parse_rule(&attr)); } VariantInfo { parent_ident, self_ident, - rule, + rules, } } @@ -102,9 +102,11 @@ impl<'a> VariantInfo<'a> { fn gen_ident_with_rule(&self) -> TokenStream { let ident = self.gen_ident(); - match &self.rule { - Some(rule) => quote! { #ident => #rule }, - None => quote! { unimplemented!() }, + if self.rules.is_empty() { + quote! { #ident => unimplemented!() } + } else { + let rules = &self.rules; + quote! { #ident => vec![#(#rules),*] } } } } diff --git a/examples/expr.rs b/examples/expr.rs index 6ae7d5d..a2a33b5 100644 --- a/examples/expr.rs +++ b/examples/expr.rs @@ -28,19 +28,14 @@ enum ExprTokenSet { #[derive(Debug, Clone, Copy, Syntax)] enum ExprSyntax { #[rule(" ::= Plus ")] - ExprPlus, #[rule(" ::= Minus ")] - ExprMinus, #[rule(" ::= ")] - ExprTerm, + Expr, #[rule(" ::= Mul ")] - TermMul, #[rule(" ::= Div ")] - TermDiv, #[rule(" ::= ")] - TermNum, + Term, #[rule(" ::= BracketL BracketR")] - NestedNum, #[rule(" ::= Num")] Num, } diff --git a/tests/derive.rs b/tests/derive.rs index 364ac9b..87d107f 100644 --- a/tests/derive.rs +++ b/tests/derive.rs @@ -23,19 +23,14 @@ enum TestTokenSet { #[derive(Debug, Clone, Copy, Syntax)] enum TestSyntax { #[rule(" ::= Plus ")] - ExprPlus, #[rule(" ::= Minus ")] - ExprMinus, #[rule(" ::= ")] - ExprTerm, + Expr, #[rule(" ::= Mul ")] - TermMul, #[rule(" ::= Div ")] - TermDiv, #[rule(" ::= ")] - TermNum, + Term, #[rule(" ::= BracketL BracketR")] - NestedNum, #[rule(" ::= Num")] Num, } @@ -43,5 +38,5 @@ enum TestSyntax { #[test] fn check_compile() { let _ = TestTokenSet::into_regex(&self::TestTokenSet::Plus); - let _ = TestSyntax::into_rule(&self::TestSyntax::ExprPlus); + let _ = TestSyntax::into_rules(&self::TestSyntax::Expr); } diff --git a/tests/serde.rs b/tests/serde.rs index 24ddfba..ce2a274 100644 --- a/tests/serde.rs +++ b/tests/serde.rs @@ -27,19 +27,14 @@ enum TestTokenSet { #[derive(Debug, Clone, Copy, Serialize, Deserialize, Syntax)] enum TestSyntax { #[rule(" ::= Plus ")] - ExprPlus, #[rule(" ::= Minus ")] - ExprMinus, #[rule(" ::= ")] - ExprTerm, + Expr, #[rule(" ::= Mul ")] - TermMul, #[rule(" ::= Div ")] - TermDiv, #[rule(" ::= ")] - TermNum, + Term, #[rule(" ::= BracketL BracketR")] - NestedNum, #[rule(" ::= Num")] Num, } From b75dce39144a49f5dc6ef0fcb7479f1c5e13d047 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Tue, 21 May 2024 01:56:58 +0900 Subject: [PATCH 12/15] =?UTF-8?q?[add]=20=E3=83=91=E3=83=BC=E3=82=B9?= =?UTF-8?q?=E7=B5=90=E6=9E=9C=E3=82=92S=E5=BC=8F=E3=81=A7=E8=A1=A8?= =?UTF-8?q?=E7=8F=BE=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E3=81=97?= =?UTF-8?q?=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/algorithm_lr1/src/driver.rs | 13 ++++-- crates/algorithm_lr1/src/lib.rs | 5 +- crates/core/src/lib.rs | 7 ++- crates/core/src/parse.rs | 73 +++++++++++++++++++++++++++++- examples/expr.rs | 2 +- 5 files changed, 88 insertions(+), 12 deletions(-) diff --git a/crates/algorithm_lr1/src/driver.rs b/crates/algorithm_lr1/src/driver.rs index a7752a6..c484233 100644 --- a/crates/algorithm_lr1/src/driver.rs +++ b/crates/algorithm_lr1/src/driver.rs @@ -1,5 +1,6 @@ use pgen_core::cfg::{TokenSet, Syntax}; use pgen_core::lex::Token; +use pgen_core::parse::{SExp, SExpBuilder}; use crate::error::ParseError; use crate::builder::{LRAction, LR1Configure}; @@ -21,8 +22,9 @@ where pub fn run<'c>( &self, lexer: &mut impl Iterator>, - ) -> anyhow::Result<()> { + ) -> anyhow::Result> { let mut stack = vec![0]; + let mut builder = SExpBuilder::new(); loop { let input = lexer.next(); loop { @@ -38,16 +40,18 @@ where ), }; match action { - (LRAction::Shift(new_state), _) => { + (LRAction::Shift(new_state), Some(token)) => { stack.push(*new_state); + builder.push(token); break; } - (LRAction::Reduce(_, goto, elems_cnt), _) => { + (LRAction::Reduce(tag, goto, elems_cnt), _) => { stack.truncate(stack.len() - elems_cnt); stack.push(self.0.goto_table[stack[stack.len() - 1]][*goto]); + builder.wrap(*tag, *elems_cnt); } (LRAction::Accept, _) => { - return Ok(()); + return builder.build(); } (LRAction::None, Some(token)) => { return Err(ParseError::new_unexpected_token(token).into()); @@ -55,6 +59,7 @@ where (LRAction::None, None) => { return Err(ParseError::UnexpectedEOF.into()); } + _ => unreachable!(), } } } diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index 88cd782..d8ff6c1 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -6,7 +6,7 @@ use serde::{Serialize, Deserialize}; use pgen_core::cfg::{TokenSet, Syntax}; use pgen_core::lex::Token; -use pgen_core::parse::ParserImpl; +use pgen_core::parse::{ParserImpl, SExp}; use builder::LR1Configure; use driver::LR1Driver; @@ -24,7 +24,6 @@ where { type TokenSet = T; type Syntax = S; - type Output = (); fn setup() -> anyhow::Result { Ok(LR1(LR1Configure::setup()?)) @@ -33,7 +32,7 @@ where fn parse<'b>( &self, mut lexer: impl Iterator>, - ) -> anyhow::Result { + ) -> anyhow::Result> { LR1Driver::new(&self.0).run(&mut lexer) } } diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 7743267..78bc5b5 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -8,7 +8,7 @@ use std::marker::PhantomData; use serde::{Serialize, Deserialize}; use lex::Lexer; -use parse::ParserImpl; +use parse::{ParserImpl, SExp}; #[derive(Debug, Serialize, Deserialize)] pub struct Parser<'a, Algorithm> @@ -31,7 +31,10 @@ where }) } - pub fn parse<'b>(&self, input: &'b str) -> anyhow::Result { + pub fn parse<'b>( + &self, + input: &'b str, + ) -> anyhow::Result> { let lexer = Lexer::new::(input)?; self.r#impl.parse(lexer) } diff --git a/crates/core/src/parse.rs b/crates/core/src/parse.rs index 5730f23..8a2d4df 100644 --- a/crates/core/src/parse.rs +++ b/crates/core/src/parse.rs @@ -1,3 +1,5 @@ +use std::fmt::{Display, Debug}; + use crate::cfg::{TokenSet, Syntax}; use crate::lex::Token; @@ -7,11 +9,78 @@ where { type TokenSet: TokenSet<'a> + 'a; type Syntax: Syntax<'a, TokenSet = Self::TokenSet>; - type Output; fn setup() -> anyhow::Result; fn parse<'b>( &self, lexer: impl Iterator>, - ) -> anyhow::Result; + ) -> anyhow::Result>; +} + +#[derive(Debug)] +pub enum SExp<'a, 'b, T, S> +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>, +{ + List { + tag: S, + elems: Vec>, + }, + Atom(Token<'a, 'b, T>), +} + +impl<'a, T, S> Display for SExp<'a, '_, T, S> +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T> + Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SExp::List { tag, elems } => { + write!(f, "({:?}", tag)?; + for elem in elems { + write!(f, " {}", elem)?; + } + write!(f, ")") + } + SExp::Atom(token) => write!(f, "{:?}", token.as_str()), + } + } +} + +#[derive(Debug)] +pub struct SExpBuilder<'a, 'b, T, S> +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>, +{ + stack: Vec>, +} + +impl<'a, 'b, T, S> SExpBuilder<'a, 'b, T, S> +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>, +{ + pub fn new() -> SExpBuilder<'a, 'b, T, S> { + SExpBuilder { stack: vec![] } + } + + pub fn push(&mut self, token: Token<'a, 'b, T>) { + self.stack.push(SExp::Atom(token)); + } + + pub fn wrap(&mut self, tag: S, cnt: usize) { + let elems = self.stack.split_off(self.stack.len() - cnt); + self.stack.push(SExp::List { tag, elems }); + } + + pub fn build(mut self) -> anyhow::Result> { + if self.stack.len() == 1 { + Ok(self.stack.pop().unwrap()) + } else { + Err(anyhow::anyhow!("Invalid S-Expression")) + } + } } diff --git a/examples/expr.rs b/examples/expr.rs index a2a33b5..6f7b7a7 100644 --- a/examples/expr.rs +++ b/examples/expr.rs @@ -47,7 +47,7 @@ fn main() -> anyhow::Result<()> { stdin().read_line(&mut input)?; match ExprParser::new()?.parse(&input) { - Ok(_) => println!("Accepted"), + Ok(sexp) => println!("Accepted : {}", sexp), Err(e) => { if let Some(e) = e.downcast_ref::() { e.pretty_print(); From f73c26b329ddc126387fdfb65bb8d9edb2c37f58 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Tue, 21 May 2024 02:12:42 +0900 Subject: [PATCH 13/15] [update] README.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d0ded7b..7791500 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,14 @@ Rust製パーサジェネレータ ``` $ cargo run --example expr (10+20)/((30*40)-50) -Accepted - +Accepted : (Expr (Term (Term (Num "(" (Expr (Expr (Term (Num "10"))) "+" (Term (Num "20"))) ")")) "/" (Num "(" (Expr (Expr (Term (Num "(" (Expr (Term (Term (Num "30")) "*" (Num "40"))) ")"))) "-" (Term (Num "50"))) ")"))) $ cargo run --example expr 10** -Rejected: Error at (0, 3) +----- + 1: 10** + ^ here +Error at line 1, column 4. +----- + +Rejected : Unexpected token "Mul" found ``` From 3a28831490127b6dfe5a3a755444c6c12d973a21 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Tue, 21 May 2024 02:14:20 +0900 Subject: [PATCH 14/15] =?UTF-8?q?[update]=20Cargo.toml=20(0.1.0=20?= =?UTF-8?q?=E2=86=92=200.1.1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 10 +++++----- Cargo.toml | 2 +- crates/algorithm/Cargo.toml | 2 +- crates/algorithm_lr1/Cargo.toml | 2 +- crates/core/Cargo.toml | 2 +- crates/core_derive/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5b89f0..0c5d23a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,14 +13,14 @@ dependencies = [ [[package]] name = "algorithm" -version = "0.1.0" +version = "0.1.1" dependencies = [ "algorithm_lr1", ] [[package]] name = "algorithm_lr1" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "core", @@ -37,7 +37,7 @@ checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3" [[package]] name = "core" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "core_derive", @@ -48,7 +48,7 @@ dependencies = [ [[package]] name = "core_derive" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "proc-macro2", @@ -86,7 +86,7 @@ checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "parsergen" -version = "0.1.0" +version = "0.1.1" dependencies = [ "algorithm", "anyhow", diff --git a/Cargo.toml b/Cargo.toml index 568e25c..aac0baa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "parsergen" -version = "0.1.0" +version = "0.1.1" edition = "2021" [dependencies] diff --git a/crates/algorithm/Cargo.toml b/crates/algorithm/Cargo.toml index e49344a..0db6c8a 100644 --- a/crates/algorithm/Cargo.toml +++ b/crates/algorithm/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "algorithm" -version = "0.1.0" +version = "0.1.1" edition = "2021" [dependencies] diff --git a/crates/algorithm_lr1/Cargo.toml b/crates/algorithm_lr1/Cargo.toml index 20d03f9..0380016 100644 --- a/crates/algorithm_lr1/Cargo.toml +++ b/crates/algorithm_lr1/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "algorithm_lr1" -version = "0.1.0" +version = "0.1.1" edition = "2021" [dependencies] diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 2148cba..ce5d477 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "core" -version = "0.1.0" +version = "0.1.1" edition = "2021" [dependencies] diff --git a/crates/core_derive/Cargo.toml b/crates/core_derive/Cargo.toml index 88efe71..8516b8c 100644 --- a/crates/core_derive/Cargo.toml +++ b/crates/core_derive/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "core_derive" -version = "0.1.0" +version = "0.1.1" edition = "2021" [dependencies] From 3aec6f94d2fd3e4bd592dda15e25bb2d8927aed4 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Tue, 21 May 2024 02:16:23 +0900 Subject: [PATCH 15/15] [fix] README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7791500..721eec6 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ Rust製パーサジェネレータ $ cargo run --example expr (10+20)/((30*40)-50) Accepted : (Expr (Term (Term (Num "(" (Expr (Expr (Term (Num "10"))) "+" (Term (Num "20"))) ")")) "/" (Num "(" (Expr (Expr (Term (Num "(" (Expr (Term (Term (Num "30")) "*" (Num "40"))) ")"))) "-" (Term (Num "50"))) ")"))) + $ cargo run --example expr 10** -----