From 7b0fd91d45c0cc71a34ea7e5bc37fc7fab3c271d Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Wed, 15 May 2024 22:16:45 +0900 Subject: [PATCH 1/6] =?UTF-8?q?[move]=20TypedCFG=20=E3=81=AE=E5=AE=9F?= =?UTF-8?q?=E8=A3=85=E3=81=8B=E3=82=89=E6=8C=81=E3=81=A3=E3=81=A6=E3=81=8D?= =?UTF-8?q?=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + Cargo.lock | 218 ++++++++++++++ Cargo.toml | 34 +++ crates/algorithm/Cargo.toml | 7 + crates/algorithm/src/lib.rs | 1 + crates/algorithm_lr1/Cargo.toml | 15 + crates/algorithm_lr1/src/builder.rs | 401 ++++++++++++++++++++++++++ crates/algorithm_lr1/src/driver.rs | 61 ++++ crates/algorithm_lr1/src/lib.rs | 139 +++++++++ crates/core/Cargo.toml | 10 + crates/core/src/cfg.rs | 5 + crates/core/src/cfg/syntax.rs | 427 ++++++++++++++++++++++++++++ crates/core/src/cfg/token.rs | 18 ++ crates/core/src/error.rs | 35 +++ crates/core/src/lex.rs | 226 +++++++++++++++ crates/core/src/lib.rs | 38 +++ crates/core/src/parse.rs | 17 ++ crates/macros/Cargo.toml | 14 + crates/macros/src/impl.rs | 2 + crates/macros/src/impl/syntax.rs | 110 +++++++ crates/macros/src/impl/tokenset.rs | 110 +++++++ crates/macros/src/lib.rs | 15 + src/lib.rs | 4 + 23 files changed, 1908 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 crates/algorithm/Cargo.toml create mode 100644 crates/algorithm/src/lib.rs create mode 100644 crates/algorithm_lr1/Cargo.toml create mode 100644 crates/algorithm_lr1/src/builder.rs create mode 100644 crates/algorithm_lr1/src/driver.rs create mode 100644 crates/algorithm_lr1/src/lib.rs create mode 100644 crates/core/Cargo.toml create mode 100644 crates/core/src/cfg.rs create mode 100644 crates/core/src/cfg/syntax.rs create mode 100644 crates/core/src/cfg/token.rs create mode 100644 crates/core/src/error.rs create mode 100644 crates/core/src/lex.rs create mode 100644 crates/core/src/lib.rs create mode 100644 crates/core/src/parse.rs create mode 100644 crates/macros/Cargo.toml create mode 100644 crates/macros/src/impl.rs create mode 100644 crates/macros/src/impl/syntax.rs create mode 100644 crates/macros/src/impl/tokenset.rs create mode 100644 crates/macros/src/lib.rs create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2a8ca7e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target* diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..ad16bd1 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,218 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "algorithm" +version = "0.1.0" +dependencies = [ + "algorithm_lr1", +] + +[[package]] +name = "algorithm_lr1" +version = "0.1.0" +dependencies = [ + "anyhow", + "core", + "itertools", + "macros", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "anyhow" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3" + +[[package]] +name = "core" +version = "0.1.0" +dependencies = [ + "anyhow", + "regex", + "serde", + "thiserror", +] + +[[package]] +name = "either" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "macros" +version = "0.1.0" +dependencies = [ + "anyhow", + "proc-macro2", + "quote", + "syn", + "thiserror", +] + +[[package]] +name = "memchr" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" + +[[package]] +name = "parsergen" +version = "0.1.0" +dependencies = [ + "algorithm", + "anyhow", + "core", + "macros", + "thiserror", +] + +[[package]] +name = "proc-macro2" +version = "1.0.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "serde" +version = "1.0.202" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395" 
+dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.202" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf5be731623ca1a1fb7d8be6f261a3be6d3e2337b8a1f97be944d020c8fcb704" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5978fba --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "parsergen" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +core = { workspace = true } +algorithm = { workspace = true } +macros = { workspace = true, optional = true } + +[features] +default = [] +derive = ["macros"] + +[workspace] +resolver = "2" +members = [ + "./crates/core", + "./crates/algorithm", + "./crates/macros", +] +exclude = [] + +[workspace.dependencies] +anyhow = "1.0.82" +thiserror = "1.0.58" +serde = "1.0.197" +regex = "1.10.4" +regex-macro = "0.2.0" +core = { path = "./crates/core" } +algorithm = { path = "./crates/algorithm" } +macros = { path = "./crates/macros" } diff --git a/crates/algorithm/Cargo.toml b/crates/algorithm/Cargo.toml new file mode 100644 index 0000000..e49344a --- /dev/null +++ b/crates/algorithm/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "algorithm" +version = "0.1.0" +edition = "2021" + +[dependencies] +lr1 = { package = "algorithm_lr1", path = "../algorithm_lr1" } diff --git a/crates/algorithm/src/lib.rs b/crates/algorithm/src/lib.rs new file mode 100644 index 0000000..80e1952 --- /dev/null +++ b/crates/algorithm/src/lib.rs @@ -0,0 +1 @@ +pub use lr1::LR1; diff --git a/crates/algorithm_lr1/Cargo.toml b/crates/algorithm_lr1/Cargo.toml new file mode 100644 index 0000000..a516ff3 --- /dev/null +++ b/crates/algorithm_lr1/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "algorithm_lr1" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true, features = ["derive"]} +itertools = "0.12.1" +core = { path = "../core" } + +[dev-dependencies] +serde_json = "1.0.117" +macros = { path = "../macros" } diff --git a/crates/algorithm_lr1/src/builder.rs b/crates/algorithm_lr1/src/builder.rs new file mode 100644 index 0000000..2b2fada --- /dev/null +++ 
b/crates/algorithm_lr1/src/builder.rs @@ -0,0 +1,401 @@ +use std::collections::{HashMap, HashSet}; +use std::hash::Hash; +use std::marker::PhantomData; + +use serde::{Serialize, Deserialize}; +use itertools::Itertools; + +use core::cfg::{TokenSet, Syntax, Rule, RuleElem, RuleSet}; + +#[derive(Debug, Serialize, Deserialize)] +pub(super) enum LRAction { + Shift(usize), + Reduce(S, usize, usize), // syntax, goto_id, elems_cnt + Accept, + None, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct LR1Configure<'a, T, S> +where + T: TokenSet<'a>, + S: Syntax<'a, TokenSet = T>, +{ + // LR Tables + pub action_table: Vec>>, + pub eof_action_table: Vec>, + pub goto_table: Vec>, + + // PhantomData + tokenset: PhantomData<&'a T>, +} + +impl<'a, T, S> LR1Configure<'a, T, S> +where + T: TokenSet<'a>, + S: Syntax<'a, TokenSet = T>, +{ + pub fn setup() -> anyhow::Result { + // 1. Pre-process + let rules = S::try_into()? + .into_iter() + .map(|(rule, _)| rule) + .collect::>(); + let ruleset = RuleSet::from(rules); + let first_set = ruleset.first_set(); + + // 2. Generate dummy nonterm + let top_dummy: Rule = Rule::from(( + RuleElem::new_nonterm("__top_dummy"), + vec![RuleElem::new_nonterm(&ruleset.top)], + )); + let top_dummy = vec![LRItem::new( + &top_dummy, + HashSet::from_iter(vec![&RuleElem::EOF]), + )]; + let lr_items = LRItemSet::new(0, HashSet::from_iter(top_dummy)); + let lr_items = lr_items.expand_closure(&ruleset, &first_set); + + // 3. Generate a DFA + let dfa = LRItemDFA::gen(lr_items, &ruleset, &first_set); + + // 4. Initialize tables + let mut idx = 0; + let mut nonterm_table = HashMap::new(); + for relem in ruleset.nonterms() { + if let RuleElem::NonTerm(s) = &relem { + if !nonterm_table.contains_key(s) { + nonterm_table.insert(s.to_string(), idx); + idx += 1; + } + } + } + + let mut action_table: Vec>> = Vec::with_capacity(dfa.0.len()); + let mut eof_action_table: Vec> = Vec::with_capacity(dfa.0.len()); + let mut goto_table: Vec> = Vec::with_capacity(dfa.0.len()); + for _ in 0..dfa.0.len() { + action_table.push(HashMap::from_iter( + T::enum_iter() + .map(|token| (token, LRAction::None)) + .collect::)>>(), + )); + eof_action_table.push(LRAction::None); + goto_table.push(vec![0; nonterm_table.keys().len()]); + } + + // 5. 
Setup tables + let rule_table: Vec = S::enum_iter().collect(); + for lritem_set in &dfa.0 { + for (token, next) in &lritem_set.next { + match &token { + RuleElem::NonTerm(s) => { + let id = lritem_set.id as usize; + let label = *nonterm_table.get(s).unwrap(); + goto_table[id][label] = *next as usize; + } + RuleElem::Term(t) => { + let id = lritem_set.id as usize; + let label = action_table[id].get_mut(&t.0).unwrap(); + *label = LRAction::Shift(*next as usize); + } + _ => {} + } + } + + for item in &lritem_set.lr_items { + if item.dot_pos != item.rule.rhs.len() { + continue; + } + if let RuleElem::NonTerm(lhs) = &item.rule.lhs { + for la_token in &item.la_tokens { + if let RuleElem::Term(t) = la_token { + let id = lritem_set.id as usize; + let label = action_table[id].get_mut(&t.0).unwrap(); + *label = LRAction::Reduce( + rule_table[item.rule.id as usize], + *nonterm_table.get(lhs).unwrap(), + item.rule.rhs.len(), + ); + } + if let RuleElem::EOF = la_token { + let id = lritem_set.id as usize; + eof_action_table[id] = if lhs == "__top_dummy" { + LRAction::Accept + } else { + LRAction::Reduce( + rule_table[item.rule.id as usize], + *nonterm_table.get(lhs).unwrap(), + item.rule.rhs.len(), + ) + }; + } + } + } + } + } + + Ok(LR1Configure { + action_table, + eof_action_table, + goto_table, + tokenset: PhantomData, + }) + } +} + +#[derive(Debug)] +struct LRItemDFA<'a, 'b, T: TokenSet<'a>> ( + Vec> +); + +impl<'a, 'b, T: TokenSet<'a>> LRItemDFA<'a, 'b, T> { + fn gen( + init_set: LRItemSet<'a, 'b, T>, + ruleset: &'b RuleSet<'a, T>, + first_set: &HashMap<&'b RuleElem<'a, T>, Vec<&'b RuleElem<'a, T>>>, + ) -> LRItemDFA<'a, 'b, T> { + let issue_id = |old_sets: &Vec>, set: &LRItemSet<'a, 'b, T>| { + if let Some(ex_set) = old_sets.iter().find(|&set0| set0.strict_eq(set)) { + Err(ex_set.id) + } else { + Ok(old_sets.len() as i32) + } + }; + + // "Expand a closure" <--> "Generate next nodes" loop + let mut loop_idx = (0, 1); + let mut lritem_sets = vec![init_set]; + while loop_idx.0 != loop_idx.1 { + let mut new_found_cnt = 0; + for idx in loop_idx.0..loop_idx.1 { + let next_sets = lritem_sets[idx].gen_next_sets(ruleset, first_set); + for (bef_token, mut next_set) in next_sets { + match issue_id(&lritem_sets, &next_set) { + Ok(id) => { + next_set.id = id; + lritem_sets[idx].next.insert(bef_token, id); + lritem_sets.push(next_set); + new_found_cnt += 1; + } + Err(id) => { + lritem_sets[idx].next.insert(bef_token, id); + } + } + } + } + loop_idx = (loop_idx.1, loop_idx.1 + new_found_cnt); + } + + LRItemDFA(lritem_sets) + } +} + +#[derive(Clone, Debug)] +struct LRItemSet<'a, 'b, T: TokenSet<'a>> { + id: i32, + next: HashMap<&'b RuleElem<'a, T>, i32>, + lr_items: HashSet>, +} + +impl<'a, 'b, T: TokenSet<'a>> PartialEq for LRItemSet<'a, 'b, T> { + fn eq(&self, other: &LRItemSet<'a, 'b, T>) -> bool { + self.lr_items == other.lr_items + } +} + +impl<'a, 'b, T: TokenSet<'a>> PartialEq>> for LRItemSet<'a, 'b, T> { + fn eq(&self, other: &HashSet>) -> bool { + &self.lr_items == other + } +} + +impl<'a, 'b, T: TokenSet<'a>> Eq for LRItemSet<'a, 'b, T> {} + +impl<'a, 'b, T: TokenSet<'a>> LRItemSet<'a, 'b, T> { + fn new(id: i32, lr_items: HashSet>) -> Self { + LRItemSet { + id, + next: HashMap::new(), + lr_items, + } + } + + fn strict_eq(&self, other: &Self) -> bool { + if self.lr_items.len() != other.lr_items.len() { + return false; + } + self.lr_items + .iter() + .all(|item| other.lr_items.iter().any(|item_b| item_b.strict_eq(item))) + } + + fn expand_closure<'c>( + mut self, + ruleset: &'b RuleSet<'a, T>, + 
first_set: &'c HashMap<&'b RuleElem<'a, T>, Vec<&'b RuleElem<'a, T>>>, + ) -> LRItemSet<'a, 'b, T> { + let mut lr_items = self.lr_items.clone(); + let mut lr_items_fetched = self.lr_items; + loop { + let new_items: Vec> = lr_items_fetched + .iter() + .flat_map(|item| item.expand_closure(ruleset, first_set)) + .collect(); + let new_items = LRItem::<'_, '_, _>::unify_all(new_items); + let new_items = HashSet::from_iter(new_items); + + let bef_len = lr_items.len(); + lr_items = LRItem::<'_, '_, _>::unity_set(lr_items, new_items.clone()); + let af_len = lr_items.len(); + if bef_len == af_len { + break; + } + lr_items_fetched = new_items; + } + self.lr_items = lr_items; + + self + } + + fn gen_next_sets<'c>( + &self, + ruleset: &'b RuleSet<'a, T>, + first_set: &'c HashMap<&'b RuleElem<'a, T>, Vec<&'b RuleElem<'a, T>>>, + ) -> HashMap<&'b RuleElem<'a, T>, LRItemSet<'a, 'b, T>> { + let new_items: Vec<(&'b RuleElem<'a, T>, LRItem<'a, 'b, T>)> = self + .lr_items + .iter() + .filter_map(|lr_item| lr_item.next_dot()) + .collect(); + + let mut new_sets: HashMap<&RuleElem, HashSet>> = HashMap::new(); + for (bef_token, lr_item) in new_items { + if new_sets.get(&bef_token).is_none() { + new_sets.insert(bef_token, HashSet::new()); + } + new_sets.get_mut(&bef_token).unwrap().insert(lr_item); + } + + let mut new_sets_expanded: HashMap<&'b RuleElem<'a, T>, LRItemSet<'_, '_, _>> = HashMap::new(); + for (ktoken, new_set) in new_sets { + let new_set = LRItemSet::new(0, new_set); + let new_set = new_set.expand_closure(ruleset, first_set); + new_sets_expanded.insert(ktoken, new_set); + } + + new_sets_expanded + } +} + +#[derive(Clone, Debug)] +struct LRItem<'a, 'b, T: TokenSet<'a>> { + rule: &'b Rule<'a, T>, + dot_pos: usize, + la_tokens: HashSet<&'b RuleElem<'a, T>>, +} + +impl<'a, 'b, T: TokenSet<'a>> Hash for LRItem<'a, 'b, T> { + fn hash(&self, state: &mut H) { + self.rule.hash(state); + self.dot_pos.hash(state); + } +} + +impl<'a, 'b, T: TokenSet<'a>> PartialEq for LRItem<'a, 'b, T> { + fn eq(&self, other: &Self) -> bool { + self.rule == other.rule && self.dot_pos == other.dot_pos + } +} + +impl<'a, 'b, T: TokenSet<'a>> Eq for LRItem<'a, 'b, T> {} + +impl<'a, 'b, T: TokenSet<'a>> LRItem<'a, 'b, T> { + fn new(rule: &'b Rule<'a, T>, la_tokens: HashSet<&'b RuleElem<'a, T>>) -> LRItem<'a, 'b, T> { + LRItem { + rule, + dot_pos: 0, + la_tokens, + } + } + + fn strict_eq(&self, other: &Self) -> bool { + self.rule == other.rule + && self.dot_pos == other.dot_pos + && self.la_tokens == other.la_tokens + } + + fn expand_closure<'c>( + &self, + ruleset: &'b RuleSet<'a, T>, + first_set: &'c HashMap<&'b RuleElem<'a, T>, Vec<&'b RuleElem<'a, T>>>, + ) -> HashSet> { + let af_la_tokens = if self.dot_pos + 1 < self.rule.rhs.len() { + HashSet::from_iter( + first_set + .get(&self.rule.rhs[self.dot_pos + 1]) + .unwrap() + .clone(), + ) + } else { + self.la_tokens.clone() + }; + + if self.dot_pos < self.rule.rhs.len() + && matches!(self.rule.rhs[self.dot_pos], RuleElem::NonTerm(_)) + { + ruleset + .find_rule(&self.rule.rhs[self.dot_pos]) + .into_iter() + .map(|rule| LRItem::<'_, '_, _>::new(rule, af_la_tokens.clone())) + .collect() + } else { + HashSet::new() + } + } + + #[allow(clippy::int_plus_one)] + fn next_dot(&self) -> Option<(&'b RuleElem<'a, T>, LRItem<'a, 'b, T>)> { + if self.dot_pos + 1 <= self.rule.rhs.len() { + let bef_token = &self.rule.rhs[self.dot_pos]; + let item = LRItem { + rule: self.rule, + dot_pos: self.dot_pos + 1, + la_tokens: self.la_tokens.clone(), + }; + Some((bef_token, item)) + } else { + None + } + } 
+ + fn unify(&mut self, other: LRItem<'a, 'b, T>) { + if self != &other { + return; + } + other.la_tokens.into_iter().for_each(|la_token| { + if !self.la_tokens.contains(&la_token) { + self.la_tokens.insert(la_token); + } + }); + } + + fn unify_all(mut items: Vec>) -> Vec> { + for idx in (0..items.len()).permutations(2) { + let (a_idx, b_idx) = (idx[0], idx[1]); + let tmp = items[b_idx].clone(); + items[a_idx].unify(tmp); + } + items + } + + fn unity_set( + items_a: HashSet>, + items_b: HashSet>, + ) -> HashSet> { + let mut items_a = Vec::from_iter(items_a); + let items_b = Vec::from_iter(items_b); + items_a.extend(items_b); + HashSet::from_iter(Self::unify_all(items_a)) + } +} diff --git a/crates/algorithm_lr1/src/driver.rs b/crates/algorithm_lr1/src/driver.rs new file mode 100644 index 0000000..d22c366 --- /dev/null +++ b/crates/algorithm_lr1/src/driver.rs @@ -0,0 +1,61 @@ +use core::cfg::{TokenSet, Syntax}; +use core::lex::LexIterator; + +use super::builder::{LRAction, LR1Configure}; + +pub(super) struct LR1Driver<'a, 'b, T, S> (&'b LR1Configure<'a, T, S>) +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>; + +impl<'a, 'b, T, S> LR1Driver<'a, 'b, T, S> +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>, +{ + pub fn new(configure: &'b LR1Configure<'a, T, S>) -> LR1Driver<'a, 'b, T, S> { + LR1Driver(configure) + } + + pub fn run<'c>( + &self, + lexer: &mut impl LexIterator<'a, 'c, T>, + ) -> anyhow::Result<()> { + let mut stack = vec![0]; + loop { + let input = lexer.next(); + loop { + let top = stack[stack.len() - 1]; + let action = match input { + Some(token) => ( + self.0.action_table[top].get(&token.kind).unwrap(), + Some(token.as_str()), + ), + None => ( + &self.0.eof_action_table[top], + None + ), + }; + match action.0 { + LRAction::Shift(new_state) => { + stack.push(*new_state); + break; + } + LRAction::Reduce(_, goto, elems_cnt) => { + stack.truncate(stack.len() - elems_cnt); + stack.push(self.0.goto_table[stack[stack.len() - 1]][*goto]); + } + LRAction::None => { + let pos = lexer.pos(); + let pos = match action.1 { + Some(raw) => (pos.0, pos.1 - (raw.len() as u32)), + None => pos, + }; + return Err(anyhow::anyhow!("Error at {:?}", pos).into()); + } + LRAction::Accept => return Ok(()), + } + } + } + } +} diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs new file mode 100644 index 0000000..4910642 --- /dev/null +++ b/crates/algorithm_lr1/src/lib.rs @@ -0,0 +1,139 @@ +mod builder; +mod driver; + +use serde::{Serialize, Deserialize}; + +use core::cfg::{TokenSet, Syntax}; +use core::lex::LexIterator; +use core::parse::ParserImpl; + +use builder::LR1Configure; +use driver::LR1Driver; + +#[derive(Debug, Serialize, Deserialize)] +pub struct LR1<'a, T, S> (LR1Configure<'a, T, S>) +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>; + +impl<'a, T, S> ParserImpl<'a> for LR1<'a, T, S> +where + T: TokenSet<'a> + 'a, + S: Syntax<'a, TokenSet = T>, +{ + type TokenSet = T; + type Syntax = S; + type Output = (); + + fn setup() -> anyhow::Result { + Ok(LR1(LR1Configure::setup()?)) + } + + fn parse<'b>( + &self, + mut lexer: impl LexIterator<'a, 'b, T>, + ) -> anyhow::Result { + LR1Driver::new(&self.0).run(&mut lexer) + } +} + +#[cfg(test)] +mod test { + use serde::{Serialize, Deserialize}; + + use core::cfg::{TokenSet, Syntax, Rule, RuleElem}; + use core::Parser; + use macros::*; + + use super::LR1; + + #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize, Deserialize, TokenSet)] + enum TestTokenSet { + #[token(regex = r"\+")] + 
Plus, + #[token(regex = r"-")] + Minus, + #[token(regex = r"\*")] + Mul, + #[token(regex = r"/")] + Div, + #[token(regex = r"\(")] + BracketL, + #[token(regex = r"\)")] + BracketR, + #[token(regex = r"[1-9][0-9]*")] + Num, + #[token(regex = r"[ \t\n]+", ignored)] + _Whitespace, + } + + #[derive(Debug, Clone, Copy, Serialize, Deserialize, Syntax)] + enum TestSyntax { + #[rule(" ::= Plus ")] + ExprPlus, + #[rule(" ::= Minus ")] + ExprMinus, + #[rule(" ::= ")] + ExprTerm, + #[rule(" ::= Mul ")] + TermMul, + #[rule(" ::= Div ")] + TermDiv, + #[rule(" ::= ")] + TermNum, + #[rule(" ::= BracketL BracketR")] + NestedNum, + #[rule(" ::= Num")] + Num, + } + + #[test] + fn input_ok() { + let inputs = vec![ + "10", + "10 + 20", + "10 - 20", + "10 * 20", + "10 / 20", + "10 + 20 * 30 - 40", + "(10)", + "((((10))))", + "10 * (20 - 30)", + "((10 + 20) * (30 / 40)) - 50", + ]; + + let parser = Parser::>::new().unwrap(); + for input in inputs { + assert!(parser.parse(input).is_ok(), "{}", input); + } + } + + #[test] + fn input_err() { + let inputs = vec![ + "()", + "(10 -", + "10 +", + "*", + "10 20 + 30", + "10 + 20 * 30 / 40 (", + "(((10))", + ]; + + let parser = Parser::>::new().unwrap(); + for input in inputs { + assert!(parser.parse(input).is_err(), "{}", input); + } + } + + #[test] + fn check_serde() { + type TestParser<'a> = Parser::<'a, LR1<'a, TestTokenSet, TestSyntax>>; + + let parser = TestParser::new().unwrap(); + let serialized = serde_json::to_string(&parser).unwrap(); + let deserialized: TestParser = serde_json::from_str(&serialized).unwrap(); + + deserialized.parse("10 * (20 - 30)").unwrap(); + } +} diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml new file mode 100644 index 0000000..d0c9cd3 --- /dev/null +++ b/crates/core/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "core" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true, features = ["derive"]} +regex = { workspace = true } diff --git a/crates/core/src/cfg.rs b/crates/core/src/cfg.rs new file mode 100644 index 0000000..bb66213 --- /dev/null +++ b/crates/core/src/cfg.rs @@ -0,0 +1,5 @@ +mod token; +mod syntax; + +pub use token::TokenSet; +pub use syntax::{Syntax, Rule, RuleElem, RuleSet}; diff --git a/crates/core/src/cfg/syntax.rs b/crates/core/src/cfg/syntax.rs new file mode 100644 index 0000000..b77243b --- /dev/null +++ b/crates/core/src/cfg/syntax.rs @@ -0,0 +1,427 @@ +use std::collections::HashMap; +use std::hash::Hash; +use std::marker::PhantomData; + +use super::token::TokenSet; + +pub trait Syntax<'a> +where + Self: Clone + Copy + Sized, +{ + type TokenSet: TokenSet<'a>; + + fn enum_iter() -> impl Iterator; + fn to_rule(&self) -> Rule<'a, Self::TokenSet>; + + fn try_into() -> anyhow::Result, Self)>> { + Self::enum_iter() + .map(|elem| Ok((Self::to_rule(&elem), elem))) + .collect::>>() + } +} + +#[derive(PartialEq, Eq, Hash, Debug)] +pub struct Rule<'a, T: TokenSet<'a>> { + pub id: usize, + pub lhs: RuleElem<'a, T>, + pub rhs: Vec>, + tokenset: PhantomData<&'a T>, +} + +impl<'a, T: TokenSet<'a>> From<(RuleElem<'a, T>, Vec>)> for Rule<'a, T> { + fn from((lhs, rhs): (RuleElem<'a, T>, Vec>)) -> Self { + Rule { + id: 0, + lhs, + rhs, + tokenset: PhantomData, + } + } +} + +impl<'a, T: TokenSet<'a>> Rule<'a, T> { + pub fn nonterms<'b>(&'b self) -> Vec<&'b RuleElem<'a, T>> { + let mut l_nonterms = vec![&self.lhs]; + let r_nonterms: Vec<&RuleElem> = self + .rhs + .iter() + .filter(|token| matches!(token, RuleElem::::NonTerm(_))) + 
.collect(); + l_nonterms.extend(r_nonterms); + l_nonterms + } + + pub fn terms<'b>(&'b self) -> Vec<&'b RuleElem<'a, T>> { + self.rhs + .iter() + .filter(|token| matches!(token, RuleElem::::Term(_))) + .collect() + } +} + +#[derive(Debug)] +pub enum RuleElem<'a, T: TokenSet<'a>> { + NonTerm(String), + Term((T, PhantomData<&'a T>)), + EOF, +} + +impl<'a, T: TokenSet<'a>> Hash for RuleElem<'a, T> { + fn hash(&self, state: &mut H) { + match self { + RuleElem::NonTerm(s) => s.hash(state), + RuleElem::Term(t) => t.hash(state), + RuleElem::EOF => 0.hash(state), + } + } +} + +impl<'a, T: TokenSet<'a>> PartialEq for RuleElem<'a, T> { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (RuleElem::NonTerm(s1), RuleElem::NonTerm(s2)) => s1 == s2, + (RuleElem::Term(t1), RuleElem::Term(t2)) => t1 == t2, + (RuleElem::EOF, RuleElem::EOF) => true, + _ => false, + } + } +} + +impl<'a, T: TokenSet<'a>> Eq for RuleElem<'a, T> {} + +impl<'a, T: TokenSet<'a>> RuleElem<'a, T> { + pub fn new_nonterm>(t: U) -> RuleElem<'a, T> { + RuleElem::NonTerm(t.into()) + } + + pub fn new_term(t: T) -> RuleElem<'a, T> { + RuleElem::Term((t, PhantomData)) + } +} + +#[derive(Debug)] +pub struct RuleSet<'a, T: TokenSet<'a>> { + pub top: String, + pub rules: Vec>, + tokenset: PhantomData<&'a T>, +} + +impl<'a, T: TokenSet<'a>> From>> for RuleSet<'a, T> { + fn from(mut rules: Vec>) -> Self { + let top = match &rules[0].lhs { + RuleElem::NonTerm(s) => s.clone(), + _ => unreachable!(), + }; + + for (idx, rule) in rules.iter_mut().enumerate() { + rule.id = idx; + } + + RuleSet { + top, + rules, + tokenset: PhantomData, + } + } +} + +impl<'a, T: TokenSet<'a>> RuleSet<'a, T> { + pub fn nonterms<'b>(&'b self) -> Vec<&'b RuleElem<'a, T>> { + self.rules.iter().flat_map(|rule| rule.nonterms()).collect() + } + + pub fn terms<'b>(&'b self) -> Vec<&'b RuleElem<'a, T>> { + self.rules.iter().flat_map(|rule| rule.terms()).collect() + } + + pub fn find_rule<'b>(&'b self, target: &RuleElem<'a, T>) -> Vec<&'b Rule<'a, T>> { + self.rules + .iter() + .filter(|rule| &rule.lhs == target) + .collect() + } + + pub fn first_set<'b>(&'b self) -> HashMap<&'b RuleElem<'a, T>, Vec<&'b RuleElem<'a, T>>> { + // 1. Calc a null set + let nulls_set = self.nulls_set(); + + // 2. Initialize a first set + let mut first_set: HashMap<&RuleElem, Vec<&RuleElem>> = HashMap::new(); + first_set.insert(&RuleElem::EOF, vec![&RuleElem::EOF]); + self.terms().into_iter().for_each(|relem| { + first_set.insert(relem, vec![relem]); + }); + self.nonterms().into_iter().for_each(|relem| { + first_set.insert(relem, vec![]); + }); + + // 3. List up candidates from a nonterm set + let mut candidates = vec![]; + for nonterm in self.nonterms() { + let rules = self.find_rule(nonterm); + for rule in rules { + for relem in &rule.rhs { + if &rule.lhs != relem { + candidates.push((nonterm, relem)) + } + if !nulls_set.contains(&relem) { + break; + } + } + } + } + + // 4. Find first set with recursive + let mut updated = true; + while updated { + updated = false; + for (nonterm, candidate) in &candidates { + let found_elems: Vec<&RuleElem> = first_set + .get(candidate) + .unwrap() + .iter() + .filter(|relem| !first_set.get(nonterm).unwrap().contains(relem)) + .copied() + .collect(); + updated = !found_elems.is_empty(); + first_set + .get_mut(nonterm) + .unwrap() + .extend(found_elems.into_iter()); + } + } + + first_set + } + + fn nulls_set<'b>(&'b self) -> Vec<&'b RuleElem<'a, T>> { + // 1. 
Find null rules + let mut nulls_set: Vec<&RuleElem> = self + .rules + .iter() + .filter(|rule| rule.rhs.is_empty()) + .map(|rule| &rule.lhs) + .collect(); + + // 2. Find null rules with recursive + let mut updated = true; + while updated { + updated = false; + for rule in &self.rules { + if nulls_set.contains(&&rule.lhs) { + continue; + } else if rule.rhs.iter().all(|relem| nulls_set.contains(&relem)) { + nulls_set.push(&rule.lhs); + updated = true; + } else { + continue; + } + } + } + + nulls_set + } +} + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::{TokenSet, Syntax, Rule, RuleElem, RuleSet}; + + #[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)] + enum TestToken { + Num, + Plus, + Minus, + Mul, + Div, + BracketA, + BracketB, + } + + impl TokenSet<'_> for TestToken { + fn enum_iter() -> impl Iterator { + Box::new( + vec![ + TestToken::Num, + TestToken::Plus, + TestToken::Minus, + TestToken::Mul, + TestToken::Div, + TestToken::BracketA, + TestToken::BracketB, + ] + .into_iter(), + ) + } + + fn to_regex(&self) -> &'static str { + match self { + TestToken::Num => r"^[1-9][0-9]*", + TestToken::Plus => r"^\+", + TestToken::Minus => r"^-", + TestToken::Mul => r"^\*", + TestToken::Div => r"^/", + TestToken::BracketA => r"^\(", + TestToken::BracketB => r"^\)", + } + } + + fn ignore_str() -> &'static str { + r"^[ \t\n]+" + } + } + + #[derive(Debug, Clone, Copy)] + enum TestSyntax { + ExprPlus, + ExprMinus, + Expr2Term, + TermMul, + TermDiv, + Term2Fact, + Fact2Expr, + Fact2Num, + } + + impl<'a> Syntax<'a> for TestSyntax { + type TokenSet = TestToken; + + fn enum_iter() -> impl Iterator { + Box::new( + vec![ + TestSyntax::ExprPlus, + TestSyntax::ExprMinus, + TestSyntax::Expr2Term, + TestSyntax::TermMul, + TestSyntax::TermDiv, + TestSyntax::Term2Fact, + TestSyntax::Fact2Expr, + TestSyntax::Fact2Num, + ] + .into_iter(), + ) + } + + fn to_rule(&self) -> Rule<'a, Self::TokenSet> { + let expr_plus = Rule::from(( + RuleElem::new_nonterm("expr"), + vec![ + RuleElem::new_nonterm("expr"), + RuleElem::new_term(TestToken::Plus), + RuleElem::new_nonterm("term"), + ], + )); + + let expr_minus = Rule::from(( + RuleElem::new_nonterm("expr"), + vec![ + RuleElem::new_nonterm("expr"), + RuleElem::new_term(TestToken::Minus), + RuleElem::new_nonterm("term"), + ], + )); + + let expr_2_term = Rule::::from(( + RuleElem::new_nonterm("expr"), + vec![RuleElem::new_nonterm("term")], + )); + + let term_mul = Rule::from(( + RuleElem::new_nonterm("term"), + vec![ + RuleElem::new_nonterm("term"), + RuleElem::new_term(TestToken::Mul), + RuleElem::new_nonterm("fact"), + ], + )); + + let term_div = Rule::from(( + RuleElem::new_nonterm("term"), + vec![ + RuleElem::new_nonterm("term"), + RuleElem::new_term(TestToken::Div), + RuleElem::new_nonterm("fact"), + ], + )); + + let term_2_fact = Rule::::from(( + RuleElem::new_nonterm("term"), + vec![RuleElem::new_nonterm("fact")], + )); + + let fact_2_expr = Rule::from(( + RuleElem::new_nonterm("fact"), + vec![ + RuleElem::new_term(TestToken::BracketA), + RuleElem::new_nonterm("expr"), + RuleElem::new_term(TestToken::BracketB), + ], + )); + + let fact_2_num = Rule::from((RuleElem::new_nonterm("fact"), vec![])); + + match self { + TestSyntax::ExprPlus => expr_plus, + TestSyntax::ExprMinus => expr_minus, + TestSyntax::Expr2Term => expr_2_term, + TestSyntax::TermMul => term_mul, + TestSyntax::TermDiv => term_div, + TestSyntax::Term2Fact => term_2_fact, + TestSyntax::Fact2Expr => fact_2_expr, + TestSyntax::Fact2Num => fact_2_num, + } + } + } + + fn check>( + 
first_set: &HashMap<&RuleElem, Vec<&RuleElem>>, + nonterm: T, + exp_terms: Vec, + ) { + let nonterms = RuleElem::::new_nonterm(nonterm); + let exp_terms: Vec> = exp_terms + .into_iter() + .map(|term| RuleElem::new_term(term)) + .collect(); + assert!(first_set.get(&nonterms).unwrap().len() == exp_terms.len()); + + let result = first_set + .get(&nonterms) + .unwrap() + .into_iter() + .zip(exp_terms.into_iter()) + .any(|(a, b)| a == &&b); + assert!(result); + } + + #[test] + fn first_set() { + let rules = ::try_into() + .unwrap() + .into_iter() + .map(|(rule, _)| rule) + .collect::>(); + let ruleset = RuleSet::from(rules); + let first_set = ruleset.first_set(); + + check( + &first_set, + "expr", + vec![ + TestToken::Plus, + TestToken::Minus, + TestToken::Mul, + TestToken::Div, + TestToken::BracketA, + ], + ); + check( + &first_set, + "term", + vec![TestToken::Mul, TestToken::Div, TestToken::BracketA], + ); + check(&first_set, "fact", vec![TestToken::BracketA]); + } +} diff --git a/crates/core/src/cfg/token.rs b/crates/core/src/cfg/token.rs new file mode 100644 index 0000000..89c77a1 --- /dev/null +++ b/crates/core/src/cfg/token.rs @@ -0,0 +1,18 @@ +use std::hash::Hash; + +use regex::Regex; + +pub trait TokenSet<'a> +where + Self: Copy + Clone + Hash + Eq, +{ + fn ignore_str() -> &'a str; + fn enum_iter() -> impl Iterator; + fn to_regex(&self) -> &'a str; + + fn try_into() -> anyhow::Result> { + Self::enum_iter() + .map(|token| Ok((Regex::new(Self::to_regex(&token))?, token))) + .collect::>>() + } +} diff --git a/crates/core/src/error.rs b/crates/core/src/error.rs new file mode 100644 index 0000000..eb5b6a4 --- /dev/null +++ b/crates/core/src/error.rs @@ -0,0 +1,35 @@ +use std::error::Error as StdError; +use std::fmt::Display; + +use thiserror::Error; + +#[derive(Debug, Error)] +pub struct ParseError { + err: Box, + pos: Option<(u32, u32)>, +} + +impl Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} at {:?}", self.err, self.pos) + } +} + +impl ParseError { + pub fn from(err: E) -> ParseError + where + E: StdError + Send + Sync + 'static, + { + ParseError { + err: Box::new(err), + pos: None, + } + } + + pub fn with(self, pos: Option<(u32, u32)>) -> ParseError { + ParseError { + err: self.err, + pos, + } + } +} diff --git a/crates/core/src/lex.rs b/crates/core/src/lex.rs new file mode 100644 index 0000000..2c7b703 --- /dev/null +++ b/crates/core/src/lex.rs @@ -0,0 +1,226 @@ +use std::marker::PhantomData; + +use regex::{Regex, RegexSet}; + +use crate::cfg::TokenSet; + +#[derive(Debug, Copy, Clone)] +pub struct Token<'a, 'b, T: TokenSet<'a>> { + pub kind: T, + pub pos: (u32, u32), + orig_txt: &'b str, + tokenset: PhantomData<&'a T>, +} + +impl<'a, 'b, T: TokenSet<'a>> Token<'a, 'b, T> { + pub fn new(kind: T, orig_txt: &'b str, pos: (u32, u32)) -> Self { + Token { + kind, + pos, + orig_txt, + tokenset: PhantomData, + } + } + + pub fn as_str(&self) -> &'b str { + self.orig_txt + } + + pub fn to_string(&self) -> String { + self.orig_txt.to_string() + } +} + +pub struct Lexer; + +impl Lexer { + pub fn new<'a, 'b, T>(input: &'b str) -> anyhow::Result> + where + T: TokenSet<'a> + 'a, + { + let regex_map = T::try_into()?; + + let regex_set = regex_map.iter().map(|(_, token)| T::to_regex(&token)).collect::>(); + let regex_set = RegexSet::new(regex_set)?; + + let regex_istr = Regex::new(T::ignore_str())?; + + Ok(LexDriver::<'a, 'b, T>::new(regex_set, regex_map, regex_istr, input)) + } +} + +pub trait LexIterator<'a, 'b, T: TokenSet<'a> + 'a> 
+where + Self: Iterator>, +{ + fn pos(&self) -> (u32, u32); + fn remain(&self) -> Option<&'b str>; +} + +struct LexDriver<'a, 'b, T: TokenSet<'a>> { + // Regex + regex_set: RegexSet, + regex_map: Vec<(Regex, T)>, + regex_istr: Regex, + + // State + input: &'b str, + pos: (u32, u32), + + // PhantomData + tokenset: PhantomData<&'a T>, +} + +impl<'a, 'b, T: TokenSet<'a>> LexDriver<'a, 'b, T> { + fn new( + regex_set: RegexSet, + regex_map: Vec<(Regex, T)>, + regex_istr: Regex, + input: &'b str, + ) -> Self { + LexDriver { + regex_set, + regex_map, + regex_istr, + input, + pos: (0, 0), + tokenset: PhantomData, + } + } +} + +impl<'a, 'b, T: TokenSet<'a> + 'a> LexIterator<'a, 'b, T> for LexDriver<'a, 'b, T> { + fn pos(&self) -> (u32, u32) { + self.pos + } + + fn remain(&self) -> Option<&'b str> { + match self.input { + "" => None, + s => Some(s), + } + } +} + +impl<'a, 'b, T: TokenSet<'a> + 'a> Iterator for LexDriver<'a, 'b, T> { + type Item = Token<'a, 'b, T>; + + fn next(&mut self) -> Option { + // Skip spaces + if let Some(acc_s) = self.regex_istr.find(self.input) { + self.update_state(acc_s.as_str()); + } + + // Find the token + let mut matches = self + .regex_set + .matches(self.input) + .into_iter() + .map(|idx| &self.regex_map[idx]) + .map(|(regex, token)| (*token, regex.find(self.input).unwrap().as_str())) + .collect::>(); + matches.sort_by(|(_, a), (_, b)| a.len().cmp(&b.len())); + + // Update myself + let (token, acc_s) = matches.first()?; + let pos = self.pos; + self.update_state(acc_s); + + Some(Token::new(*token, acc_s, pos)) + } +} + +impl<'a, 'b, T: TokenSet<'a>> LexDriver<'a, 'b, T> { + fn update_state(&mut self, acc_s: &str) { + let (mut rows, mut cols) = self.pos; + for c in acc_s.chars() { + match c { + '\n' => { + rows += 1; + cols = 0; + } + _ => { + cols += 1; + } + } + } + + self.input = &self.input[acc_s.len()..]; + self.pos = (rows, cols); + } +} + +#[cfg(test)] +mod test { + use serde::{Deserialize, Serialize}; + + use crate::cfg::TokenSet; + use super::Lexer; + + #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)] + enum TestToken { + Num, + Plus, + } + + impl TokenSet<'_> for TestToken { + fn ignore_str() -> &'static str { + r"^[ \t\n]+" + } + + fn enum_iter() -> Box> { + Box::new(vec![TestToken::Num, TestToken::Plus].into_iter()) + } + + fn to_regex(&self) -> &'static str { + match self { + TestToken::Num => r"^[1-9][0-9]*", + TestToken::Plus => r"^\+", + } + } + } + + fn check<'a, 'b>( + expected: &Vec<(TestToken, &'b str, (u32, u32))>, + input: &'b str, + ) -> bool { + Lexer::new::(input) + .unwrap() + .into_iter() + .zip(expected.iter()) + .all(|(a, b)| a.kind == b.0 && a.pos == b.2 && a.orig_txt == b.1) + } + + #[test] + fn input_ok_1() { + let expected = vec![ + (TestToken::Num, "10", (0, 0)), + (TestToken::Plus, "+", (0, 2)), + (TestToken::Num, "20", (0, 3)), + ]; + let input = "10+20"; + assert!(check(&expected, input)); + } + + #[test] + fn input_ok_2() { + let expected = vec![ + (TestToken::Num, "10", (0, 12)), + (TestToken::Plus, "+", (0, 15)), + (TestToken::Num, "20", (1, 6)), + ]; + let input = " 10 +\n 20 "; + assert!(check(&expected, input)); + } + + #[test] + fn input_ok_3() { + let expected = vec![ + (TestToken::Num, "10", (0, 12)), + (TestToken::Plus, "+", (0, 15)), + (TestToken::Num, "20", (1, 6)), + ]; + let input = " 10 +\n 20ffff30 - 40 * 50"; + assert!(check(&expected, input)); + } +} diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs new file mode 100644 index 0000000..9d030ee --- /dev/null +++ 
b/crates/core/src/lib.rs @@ -0,0 +1,38 @@ +pub mod cfg; +pub mod error; +pub mod parse; +pub mod lex; // TODO : private + +use std::marker::PhantomData; + +use serde::{Serialize, Deserialize}; + +use lex::Lexer; +use parse::ParserImpl; + +#[derive(Debug, Serialize, Deserialize)] +pub struct Parser<'a, Algorithm> +where + Algorithm: ParserImpl<'a>, +{ + r#impl: Algorithm, + phantom: PhantomData<&'a ()>, +} + +#[allow(clippy::new_without_default)] +impl<'a, Algorithm> Parser<'a, Algorithm> +where + Algorithm: ParserImpl<'a>, +{ + pub fn new() -> anyhow::Result> { + Ok(Parser { + r#impl: Algorithm::setup()?, + phantom: PhantomData, + }) + } + + pub fn parse<'b>(&self, input: &'b str) -> anyhow::Result { + let lexer = Lexer::new::(input)?; + self.r#impl.parse(lexer) + } +} diff --git a/crates/core/src/parse.rs b/crates/core/src/parse.rs new file mode 100644 index 0000000..dbe86c6 --- /dev/null +++ b/crates/core/src/parse.rs @@ -0,0 +1,17 @@ +use crate::cfg::{TokenSet, Syntax}; +use super::lex::LexIterator; + +pub trait ParserImpl<'a> +where + Self: Sized, +{ + type TokenSet: TokenSet<'a> + 'a; + type Syntax: Syntax<'a, TokenSet = Self::TokenSet>; + type Output; + + fn setup() -> anyhow::Result; + fn parse<'b>( + &self, + lexer: impl LexIterator<'a, 'b, Self::TokenSet> + ) -> anyhow::Result; +} diff --git a/crates/macros/Cargo.toml b/crates/macros/Cargo.toml new file mode 100644 index 0000000..2a2123e --- /dev/null +++ b/crates/macros/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "macros" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +proc-macro2 = "1.0" +quote = "1.0" +syn = { version ="2.0", features = ["full", "extra-traits"] } + +[lib] +proc-macro = true diff --git a/crates/macros/src/impl.rs b/crates/macros/src/impl.rs new file mode 100644 index 0000000..dd500e0 --- /dev/null +++ b/crates/macros/src/impl.rs @@ -0,0 +1,2 @@ +pub mod tokenset; +pub mod syntax; diff --git a/crates/macros/src/impl/syntax.rs b/crates/macros/src/impl/syntax.rs new file mode 100644 index 0000000..b1e8574 --- /dev/null +++ b/crates/macros/src/impl/syntax.rs @@ -0,0 +1,110 @@ +use proc_macro2::TokenStream; +use quote::quote; +use syn::{Data, DeriveInput, Variant, Ident, LitStr}; + +pub fn syntax_proc_macro_impl(ast: DeriveInput) -> TokenStream { + let data_enum = if let Data::Enum(data_enum) = ast.data { + data_enum + } else { + panic!("\"Syntax\" proc-macro is only implemented for enum.") + }; + + let parsed_variantes = data_enum + .variants + .iter() + .map(|variant| VariantInfo::parse(&ast.ident, variant)) + .collect::>(); + + let enum_name = &ast.ident; + let enum_assoc_type = format!("{}", enum_name) + .replace("Syntax", "TokenSet") + .parse::() + .unwrap(); + let enum_variants = parsed_variantes + .iter() + .map(|variant| variant.gen_ident()); + let enum_rule_table = parsed_variantes + .iter() + .map(|variant| variant.gen_ident_with_rule()); + + quote! 
{ + impl<'a> Syntax<'a> for #enum_name { + type TokenSet = #enum_assoc_type; + + fn enum_iter() -> impl Iterator { + vec![ + #( #enum_variants, )* + ].into_iter() + } + + fn to_rule(&self) -> Rule<'a, Self::TokenSet> { + match self { + #( #enum_rule_table, )* + _ => unimplemented!(), + } + } + } + } +} + +struct VariantInfo<'a> { + parent_ident: &'a Ident, + self_ident: &'a Ident, + rule: Option, +} + +impl<'a> VariantInfo<'a> { + fn parse(parent_ident: &'a Ident, variant: &'a Variant) -> VariantInfo<'a> { + let self_ident = &variant.ident; + + let mut rule = None; + for attr in &variant.attrs { + let attr = attr.parse_args::().unwrap().value(); + rule = Some(Self::parse_rule(&attr)); + } + + VariantInfo { + parent_ident, + self_ident, + rule, + } + } + + fn parse_rule(s: &str) -> TokenStream { + let mut splitted = s.split("::="); + + let lhs = splitted.next().unwrap().trim(); + let lhs = &lhs[1..lhs.len() - 1]; + let lhs = quote! { RuleElem::new_nonterm(#lhs) }; + + let rhs = splitted.collect::() + .split_whitespace() + .map(|s| { + if s.starts_with('<') { + let s = &s[1..s.len() - 1]; + quote! { RuleElem::new_nonterm(#s) } + } else { + let ident = s.parse::().unwrap(); + quote! { RuleElem::new_term(Self::TokenSet::#ident) } + } + }) + .collect::>(); + + quote! { Rule::from((#lhs, vec![ #( #rhs, )* ])) } + } + + fn gen_ident(&self) -> TokenStream { + let parent_ident = self.parent_ident; + let self_ident = self.self_ident; + + quote! { #parent_ident :: #self_ident } + } + + fn gen_ident_with_rule(&self) -> TokenStream { + let ident = self.gen_ident(); + match &self.rule { + Some(rule) => quote! { #ident => #rule }, + None => quote! { unimplemented!() }, + } + } +} diff --git a/crates/macros/src/impl/tokenset.rs b/crates/macros/src/impl/tokenset.rs new file mode 100644 index 0000000..84bcef3 --- /dev/null +++ b/crates/macros/src/impl/tokenset.rs @@ -0,0 +1,110 @@ +use proc_macro2::TokenStream; +use quote::quote; +use syn::{Data, DeriveInput, Variant, Ident, LitStr}; + +pub fn proc_macro_impl(ast: DeriveInput) -> TokenStream { + let data_enum = if let Data::Enum(data_enum) = ast.data { + data_enum + } else { + panic!("\"Tokenset\" proc-macro is only implemented for enum.") + }; + + let parsed_variantes = data_enum + .variants + .iter() + .map(|variant| VariantInfo::parse(&ast.ident, variant)) + .collect::>(); + + let enum_name = &ast.ident; + let enum_ignored = parsed_variantes + .iter() + .find(|variant| variant.ignored) + .map(|variant| variant.regex.as_ref().unwrap().as_str()) + .unwrap_or(""); + let enum_variants = parsed_variantes + .iter() + .filter(|variant| !variant.ignored) + .map(|variant| variant.gen_ident()); + let enum_regex_table = parsed_variantes + .iter() + .filter(|variant| !variant.ignored) + .map(|variant| variant.gen_ident_with_regex()); + + quote! 
{ + impl TokenSet<'_> for #enum_name { + fn ignore_str() -> &'static str { + #enum_ignored + } + + fn enum_iter() -> impl Iterator { + vec![ + #( #enum_variants, )* + ].into_iter() + } + + fn to_regex(&self) -> &'static str { + match self { + #( #enum_regex_table, )* + _ => unimplemented!(), + } + } + } + } +} + +#[derive(Debug)] +struct VariantInfo<'a> { + parent_ident: &'a Ident, + self_ident: &'a Ident, + regex: Option, + ignored: bool, +} + +impl<'a> VariantInfo<'a> { + fn parse(parent_ident: &'a Ident, variant: &'a Variant) -> VariantInfo<'a> { + let self_ident = &variant.ident; + + let mut regex = None; + let mut ignored = false; + for attr in &variant.attrs { + let _ = attr.parse_nested_meta(|meta| { + // #[...(regex = "...")] + if meta.path.is_ident("regex") { + let raw_regex = meta.value()?.parse::()?.value(); + regex = Some(format!("^{}", raw_regex)); + return Ok(()); + } + + // #[...(ignord)] + if meta.path.is_ident("ignored") { + ignored = true; + return Ok(()); + } + + Err(meta.error("Unknown attribute")) + }); + } + + VariantInfo { + parent_ident, + self_ident, + regex, + ignored, + } + } + + fn gen_ident(&self) -> TokenStream { + let parent_ident = self.parent_ident; + let self_ident = self.self_ident; + + quote! { #parent_ident :: #self_ident } + } + + fn gen_ident_with_regex(&self) -> TokenStream { + let ident = self.gen_ident(); + match &self.regex { + Some(regex) => quote! { #ident => #regex }, + None => quote! { unimplemented!() }, + } + } +} diff --git a/crates/macros/src/lib.rs b/crates/macros/src/lib.rs new file mode 100644 index 0000000..9bdfd72 --- /dev/null +++ b/crates/macros/src/lib.rs @@ -0,0 +1,15 @@ +mod r#impl; + +use syn::{parse_macro_input, DeriveInput}; + +#[proc_macro_derive(TokenSet, attributes(token))] +pub fn derive_tokenset(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + let ast = parse_macro_input!(input as DeriveInput); + r#impl::tokenset::proc_macro_impl(ast).into() +} + +#[proc_macro_derive(Syntax, attributes(rule))] +pub fn derive_syntax(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + let ast = parse_macro_input!(input as DeriveInput); + r#impl::syntax::syntax_proc_macro_impl(ast).into() +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..0cda3b2 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,4 @@ +pub use core::*; +pub use algorithm; +#[cfg(feature = "derive")] +pub use macros; From e21840a421d45f32ab6a99361356d81250328256 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Wed, 15 May 2024 22:21:33 +0900 Subject: [PATCH 2/6] =?UTF-8?q?[change]=20macros/=20=E2=86=92=20core=5Fder?= =?UTF-8?q?ive/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 25 +++++++++---------- Cargo.toml | 5 +--- crates/algorithm_lr1/Cargo.toml | 3 +-- crates/algorithm_lr1/src/lib.rs | 1 - crates/core/Cargo.toml | 5 ++++ crates/core/src/cfg.rs | 3 +++ crates/{macros => core_derive}/Cargo.toml | 2 +- crates/{macros => core_derive}/src/impl.rs | 0 .../src/impl/syntax.rs | 0 .../src/impl/tokenset.rs | 0 crates/{macros => core_derive}/src/lib.rs | 0 src/lib.rs | 2 -- 12 files changed, 23 insertions(+), 23 deletions(-) rename crates/{macros => core_derive}/Cargo.toml (92%) rename crates/{macros => core_derive}/src/impl.rs (100%) rename crates/{macros => core_derive}/src/impl/syntax.rs (100%) rename crates/{macros => core_derive}/src/impl/tokenset.rs (100%) rename crates/{macros => core_derive}/src/lib.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index ad16bd1..8b285bf 
100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,7 +25,6 @@ dependencies = [ "anyhow", "core", "itertools", - "macros", "serde", "serde_json", "thiserror", @@ -42,11 +41,23 @@ name = "core" version = "0.1.0" dependencies = [ "anyhow", + "core_derive", "regex", "serde", "thiserror", ] +[[package]] +name = "core_derive" +version = "0.1.0" +dependencies = [ + "anyhow", + "proc-macro2", + "quote", + "syn", + "thiserror", +] + [[package]] name = "either" version = "1.11.0" @@ -68,17 +79,6 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" -[[package]] -name = "macros" -version = "0.1.0" -dependencies = [ - "anyhow", - "proc-macro2", - "quote", - "syn", - "thiserror", -] - [[package]] name = "memchr" version = "2.7.2" @@ -92,7 +92,6 @@ dependencies = [ "algorithm", "anyhow", "core", - "macros", "thiserror", ] diff --git a/Cargo.toml b/Cargo.toml index 5978fba..93deedc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,18 +8,16 @@ anyhow = { workspace = true } thiserror = { workspace = true } core = { workspace = true } algorithm = { workspace = true } -macros = { workspace = true, optional = true } [features] default = [] -derive = ["macros"] +derive = ["core/derive"] [workspace] resolver = "2" members = [ "./crates/core", "./crates/algorithm", - "./crates/macros", ] exclude = [] @@ -31,4 +29,3 @@ regex = "1.10.4" regex-macro = "0.2.0" core = { path = "./crates/core" } algorithm = { path = "./crates/algorithm" } -macros = { path = "./crates/macros" } diff --git a/crates/algorithm_lr1/Cargo.toml b/crates/algorithm_lr1/Cargo.toml index a516ff3..289b6f0 100644 --- a/crates/algorithm_lr1/Cargo.toml +++ b/crates/algorithm_lr1/Cargo.toml @@ -8,8 +8,7 @@ anyhow = { workspace = true } thiserror = { workspace = true } serde = { workspace = true, features = ["derive"]} itertools = "0.12.1" -core = { path = "../core" } +core = { path = "../core", features = ["derive"] } [dev-dependencies] serde_json = "1.0.117" -macros = { path = "../macros" } diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index 4910642..00f4b42 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -43,7 +43,6 @@ mod test { use core::cfg::{TokenSet, Syntax, Rule, RuleElem}; use core::Parser; - use macros::*; use super::LR1; diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index d0c9cd3..54256b7 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -8,3 +8,8 @@ anyhow = { workspace = true } thiserror = { workspace = true } serde = { workspace = true, features = ["derive"]} regex = { workspace = true } +derive = { package = "core_derive", path = "../core_derive", optional = true } + +[features] +default = [] +derive = ["dep:derive"] diff --git a/crates/core/src/cfg.rs b/crates/core/src/cfg.rs index bb66213..458da47 100644 --- a/crates/core/src/cfg.rs +++ b/crates/core/src/cfg.rs @@ -1,5 +1,8 @@ mod token; mod syntax; +#[cfg(feature = "derive")] +pub use derive::{TokenSet, Syntax}; + pub use token::TokenSet; pub use syntax::{Syntax, Rule, RuleElem, RuleSet}; diff --git a/crates/macros/Cargo.toml b/crates/core_derive/Cargo.toml similarity index 92% rename from crates/macros/Cargo.toml rename to crates/core_derive/Cargo.toml index 2a2123e..88efe71 100644 --- a/crates/macros/Cargo.toml +++ b/crates/core_derive/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "macros" +name = "core_derive" version = "0.1.0" edition = "2021" diff --git 
a/crates/macros/src/impl.rs b/crates/core_derive/src/impl.rs similarity index 100% rename from crates/macros/src/impl.rs rename to crates/core_derive/src/impl.rs diff --git a/crates/macros/src/impl/syntax.rs b/crates/core_derive/src/impl/syntax.rs similarity index 100% rename from crates/macros/src/impl/syntax.rs rename to crates/core_derive/src/impl/syntax.rs diff --git a/crates/macros/src/impl/tokenset.rs b/crates/core_derive/src/impl/tokenset.rs similarity index 100% rename from crates/macros/src/impl/tokenset.rs rename to crates/core_derive/src/impl/tokenset.rs diff --git a/crates/macros/src/lib.rs b/crates/core_derive/src/lib.rs similarity index 100% rename from crates/macros/src/lib.rs rename to crates/core_derive/src/lib.rs diff --git a/src/lib.rs b/src/lib.rs index 0cda3b2..0f77213 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,2 @@ pub use core::*; pub use algorithm; -#[cfg(feature = "derive")] -pub use macros; From b58888df42bcc41c456572e3683aff0fd17c5a93 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Wed, 15 May 2024 22:32:45 +0900 Subject: [PATCH 3/6] =?UTF-8?q?[add]=20tests/=20=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 3 +- Cargo.toml | 4 +++ crates/algorithm_lr1/Cargo.toml | 5 +-- crates/algorithm_lr1/src/lib.rs | 17 ++-------- tests/core_derive.rs | 47 +++++++++++++++++++++++++++ tests/serde.rs | 56 +++++++++++++++++++++++++++++++++ 6 files changed, 112 insertions(+), 20 deletions(-) create mode 100644 tests/core_derive.rs create mode 100644 tests/serde.rs diff --git a/Cargo.lock b/Cargo.lock index 8b285bf..a5b89f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,7 +26,6 @@ dependencies = [ "core", "itertools", "serde", - "serde_json", "thiserror", ] @@ -92,6 +91,8 @@ dependencies = [ "algorithm", "anyhow", "core", + "serde", + "serde_json", "thiserror", ] diff --git a/Cargo.toml b/Cargo.toml index 93deedc..aeaa347 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,10 @@ thiserror = { workspace = true } core = { workspace = true } algorithm = { workspace = true } +[dev-dependencies] +serde = { workspace = true } +serde_json = "1.0.117" + [features] default = [] derive = ["core/derive"] diff --git a/crates/algorithm_lr1/Cargo.toml b/crates/algorithm_lr1/Cargo.toml index 289b6f0..2511561 100644 --- a/crates/algorithm_lr1/Cargo.toml +++ b/crates/algorithm_lr1/Cargo.toml @@ -6,9 +6,6 @@ edition = "2021" [dependencies] anyhow = { workspace = true } thiserror = { workspace = true } -serde = { workspace = true, features = ["derive"]} +serde = { workspace = true, features = ["derive"] } itertools = "0.12.1" core = { path = "../core", features = ["derive"] } - -[dev-dependencies] -serde_json = "1.0.117" diff --git a/crates/algorithm_lr1/src/lib.rs b/crates/algorithm_lr1/src/lib.rs index 00f4b42..ab0936b 100644 --- a/crates/algorithm_lr1/src/lib.rs +++ b/crates/algorithm_lr1/src/lib.rs @@ -39,14 +39,12 @@ where #[cfg(test)] mod test { - use serde::{Serialize, Deserialize}; - use core::cfg::{TokenSet, Syntax, Rule, RuleElem}; use core::Parser; use super::LR1; - #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize, Deserialize, TokenSet)] + #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, TokenSet)] enum TestTokenSet { #[token(regex = r"\+")] Plus, @@ -66,7 +64,7 @@ mod test { _Whitespace, } - #[derive(Debug, Clone, Copy, Serialize, Deserialize, Syntax)] + #[derive(Debug, Clone, Copy, Syntax)] enum TestSyntax { #[rule(" ::= Plus ")] ExprPlus, @@ -124,15 +122,4 @@ mod test { 
assert!(parser.parse(input).is_err(), "{}", input); } } - - #[test] - fn check_serde() { - type TestParser<'a> = Parser::<'a, LR1<'a, TestTokenSet, TestSyntax>>; - - let parser = TestParser::new().unwrap(); - let serialized = serde_json::to_string(&parser).unwrap(); - let deserialized: TestParser = serde_json::from_str(&serialized).unwrap(); - - deserialized.parse("10 * (20 - 30)").unwrap(); - } } diff --git a/tests/core_derive.rs b/tests/core_derive.rs new file mode 100644 index 0000000..af6d4be --- /dev/null +++ b/tests/core_derive.rs @@ -0,0 +1,47 @@ +use core::cfg::*; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, TokenSet)] +enum TestTokenSet { + #[token(regex = r"\+")] + Plus, + #[token(regex = r"-")] + Minus, + #[token(regex = r"\*")] + Mul, + #[token(regex = r"/")] + Div, + #[token(regex = r"\(")] + BracketL, + #[token(regex = r"\)")] + BracketR, + #[token(regex = r"[1-9][0-9]*")] + Num, + #[token(regex = r"[ \t\n]+", ignored)] + _Whitespace, +} + +#[derive(Debug, Clone, Copy, Syntax)] +enum TestSyntax { + #[rule(" ::= Plus ")] + ExprPlus, + #[rule(" ::= Minus ")] + ExprMinus, + #[rule(" ::= ")] + ExprTerm, + #[rule(" ::= Mul ")] + TermMul, + #[rule(" ::= Div ")] + TermDiv, + #[rule(" ::= ")] + TermNum, + #[rule(" ::= BracketL BracketR")] + NestedNum, + #[rule(" ::= Num")] + Num, +} + +#[test] +fn check_compile() { + let _ = TestTokenSet::to_regex(&self::TestTokenSet::Plus); + let _ = TestSyntax::to_rule(&self::TestSyntax::ExprPlus); +} diff --git a/tests/serde.rs b/tests/serde.rs new file mode 100644 index 0000000..4faa3ed --- /dev/null +++ b/tests/serde.rs @@ -0,0 +1,56 @@ +use serde::{Serialize, Deserialize}; + +use core::cfg::*; +use core::Parser; +use algorithm::LR1; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize, Deserialize, TokenSet)] +enum TestTokenSet { + #[token(regex = r"\+")] + Plus, + #[token(regex = r"-")] + Minus, + #[token(regex = r"\*")] + Mul, + #[token(regex = r"/")] + Div, + #[token(regex = r"\(")] + BracketL, + #[token(regex = r"\)")] + BracketR, + #[token(regex = r"[1-9][0-9]*")] + Num, + #[token(regex = r"[ \t\n]+", ignored)] + _Whitespace, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Syntax)] +enum TestSyntax { + #[rule(" ::= Plus ")] + ExprPlus, + #[rule(" ::= Minus ")] + ExprMinus, + #[rule(" ::= ")] + ExprTerm, + #[rule(" ::= Mul ")] + TermMul, + #[rule(" ::= Div ")] + TermDiv, + #[rule(" ::= ")] + TermNum, + #[rule(" ::= BracketL BracketR")] + NestedNum, + #[rule(" ::= Num")] + Num, +} + +#[test] +fn serde() { + type TestParser<'a> = Parser::<'a, LR1<'a, TestTokenSet, TestSyntax>>; + + let parser = TestParser::new().unwrap(); + let serialized = serde_json::to_string(&parser).unwrap(); + let deserialized: TestParser = serde_json::from_str(&serialized).unwrap(); + + deserialized.parse("10 * (20 - 30)").unwrap(); +} From 108bb480127b2271c4fbb42b9341c128d8c5d101 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Wed, 15 May 2024 22:37:33 +0900 Subject: [PATCH 4/6] =?UTF-8?q?[change]=20tets/core=5Fderive.rs=20?= =?UTF-8?q?=E2=86=92=20tests/derive.rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/{core_derive.rs => derive.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{core_derive.rs => derive.rs} (100%) diff --git a/tests/core_derive.rs b/tests/derive.rs similarity index 100% rename from tests/core_derive.rs rename to tests/derive.rs From 8a1819f0f42e58ac2585f8ecc569289132a1b1c3 Mon Sep 17 00:00:00 2001 From: NakagamiYuta Date: Wed, 15 May 2024 
22:45:04 +0900
Subject: [PATCH 5/6] [add] Create examples/expr.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/expr.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 examples/expr.rs

diff --git a/examples/expr.rs b/examples/expr.rs
new file mode 100644
index 0000000..7b0b3cc
--- /dev/null
+++ b/examples/expr.rs
@@ -0,0 +1,59 @@
+use std::io::stdin;
+
+use parsergen::algorithm::LR1;
+use parsergen::cfg::*;
+use parsergen::Parser;
+
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, TokenSet)]
+enum ExprTokenSet {
+    #[token(regex = r"\+")]
+    Plus,
+    #[token(regex = r"-")]
+    Minus,
+    #[token(regex = r"\*")]
+    Mul,
+    #[token(regex = r"/")]
+    Div,
+    #[token(regex = r"\(")]
+    BracketL,
+    #[token(regex = r"\)")]
+    BracketR,
+    #[token(regex = r"[1-9][0-9]*")]
+    Num,
+    #[token(regex = r"[ \t\n]+", ignored)]
+    _Whitespace,
+}
+
+#[derive(Debug, Clone, Copy, Syntax)]
+enum ExprSyntax {
+    #[rule(" ::= Plus ")]
+    ExprPlus,
+    #[rule(" ::= Minus ")]
+    ExprMinus,
+    #[rule(" ::= ")]
+    ExprTerm,
+    #[rule(" ::= Mul ")]
+    TermMul,
+    #[rule(" ::= Div ")]
+    TermDiv,
+    #[rule(" ::= ")]
+    TermNum,
+    #[rule(" ::= BracketL BracketR")]
+    NestedNum,
+    #[rule(" ::= Num")]
+    Num,
+}
+
+type ExprParser<'a> = Parser::<'a, LR1<'a, ExprTokenSet, ExprSyntax>>;
+
+fn main() -> anyhow::Result<()> {
+    let mut input = String::new();
+    stdin().read_line(&mut input)?;
+
+    match ExprParser::new()?.parse(&input) {
+        Ok(_) => println!("Accepted"),
+        Err(e) => println!("Rejected: {}", e),
+    };
+
+    Ok(())
+}

From dacb3b6d0d1c51744d626dcef95d564bcfce14fe Mon Sep 17 00:00:00 2001
From: NakagamiYuta
Date: Wed, 15 May 2024 22:48:02 +0900
Subject: [PATCH 6/6] [update] README.md

---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 8bb992e..d0ded7b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,21 @@
 # Parsergen
 
 A parser generator written in Rust
+
+## Features
+
+- `derive`
+
+## Examples
+
+[examples/expr.rs](examples/expr.rs)
+
+```
+$ cargo run --example expr
+(10+20)/((30*40)-50)
+Accepted
+
+$ cargo run --example expr
+10**
+Rejected: Error at (0, 3)
+```
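For reference, below is a minimal sketch of the expression grammar that the `#[rule(...)]` attributes in `examples/expr.rs` and the tests encode. The token definitions and derive usage are taken directly from the patch; the angle-bracket nonterminal names `<expr>`, `<term>`, and `<num>` are assumptions inferred from the variant names and from the hand-written test grammar in `crates/core/src/cfg/syntax.rs`, not verbatim from the patch.

```rust
use parsergen::cfg::*; // TokenSet / Syntax derives, re-exported behind the `derive` feature

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, TokenSet)]
enum ExprTokenSet {
    #[token(regex = r"\+")]
    Plus,
    #[token(regex = r"-")]
    Minus,
    #[token(regex = r"\*")]
    Mul,
    #[token(regex = r"/")]
    Div,
    #[token(regex = r"\(")]
    BracketL,
    #[token(regex = r"\)")]
    BracketR,
    #[token(regex = r"[1-9][0-9]*")]
    Num,
    #[token(regex = r"[ \t\n]+", ignored)]
    _Whitespace,
}

#[derive(Debug, Clone, Copy, Syntax)]
enum ExprSyntax {
    // Nonterminal names <expr>, <term>, <num> below are assumed reconstructions,
    // chosen to match the variant names; they are not quoted from the patch.
    #[rule("<expr> ::= <expr> Plus <term>")]
    ExprPlus,
    #[rule("<expr> ::= <expr> Minus <term>")]
    ExprMinus,
    #[rule("<expr> ::= <term>")]
    ExprTerm,
    #[rule("<term> ::= <term> Mul <num>")]
    TermMul,
    #[rule("<term> ::= <term> Div <num>")]
    TermDiv,
    #[rule("<term> ::= <num>")]
    TermNum,
    #[rule("<num> ::= BracketL <expr> BracketR")]
    NestedNum,
    #[rule("<num> ::= Num")]
    Num,
}
```

The derive macros expand each attribute string into `RuleElem::new_nonterm(...)` / `RuleElem::new_term(...)` calls wrapped in a `Rule::from((lhs, rhs))`. In `LR1Configure`, the generated tables are, by inference, `Vec<HashMap<T, LRAction<S>>>` for per-state, per-token actions, `Vec<LRAction<S>>` for end-of-input actions, and `Vec<Vec<usize>>` for gotos; the driver pushes a state on `Shift`, pops `elems_cnt` states and follows the goto entry on `Reduce`, and stops on `Accept`.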