diff --git a/.gitignore b/.gitignore index d01bd1a..bf7ff1c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ Cargo.lock # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ + +# cargo mutants output +mutants.out*/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index cd03087..0cf5afd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,20 +1,18 @@ [package] name = "regexsolver" -version = "0.3.1" -edition = "2021" +version = "1.0.0" +edition = "2024" authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" keywords = ["automaton", "intersection", "union", "difference", "regex"] -description = "Manipulate regex and automaton as if they were sets." +description = "High-performance Rust library for building, combining, and analyzing regular expressions and finite automata" readme = "README.md" [dependencies] serde = { version = "1.0", features = ["derive"], optional = true } ciborium = { version = "0.2.2", optional = true } z85 = { version = "3.0.5", optional = true } -aes-gcm-siv = { version = "0.11.1", optional = true } -sha2 = { version = "0.10.8", optional = true } flate2 = { version = "1.0.30", features = [ "zlib-ng", ], default-features = false, optional = true } @@ -26,6 +24,8 @@ lazy_static = "1.4.0" regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } +rayon = "1.10.0" +bit-set = "0.8.0" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } @@ -34,17 +34,15 @@ serde_json = "1.0.114" [features] -default = ["serde"] -serde = [ +default = [] +serializable = [ "regex-charclass/serde", "dep:serde", "dep:ciborium", "dep:z85", - "dep:aes-gcm-siv", - "dep:sha2", "dep:flate2", ] [[bench]] name = "my_benchmark" -harness = false \ No newline at end of file +harness = false diff --git a/README.md b/README.md index dcb0b47..f45837d 100644 --- a/README.md +++ b/README.md @@ -1,73 +1,311 @@ # RegexSolver - [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -This repository contains the code of [RegexSolver](https://regexsolver.com/) engine. - -For more information, you can check the library's [documentation](https://docs.rs/regexsolver/latest/regexsolver/). - -If you want to use this library with other programming languages, we provide a wide range of wrappers: +**RegexSolver** is a Rust library for building, combining, and analyzing regular expressions and finite automata. It is designed for constraint solvers, test generators, and other systems that need advanced regex and automaton operations. -- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) -- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) -- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) +## Table of Contents -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). 
+ - [Installation](#installation) + - [Example](#example) + - [Key Concepts & Limitations](#key-concepts--limitations) + - [API](#api) + - [Term](#term) + - [FastAutomaton](#fastautomaton) + - [RegularExpression](#regularexpression) + - [Bound Execution](#bound-execution) + - [Cross-Language Support](#cross-language-support) + - [License](#license) ## Installation -Add the following line in your `Cargo.toml`: +Add to your `Cargo.toml`: ```toml [dependencies] -regexsolver = "0.3" +regexsolver = "1" ``` -## Examples - -### Union +## Example ```rust use regexsolver::Term; +use regexsolver::error::EngineError; -let term1 = Term::from_regex("abc").unwrap(); -let term2 = Term::from_regex("de").unwrap(); -let term3 = Term::from_regex("fghi").unwrap(); +fn main() -> Result<(), EngineError> { + // Create terms from regex + let t1 = Term::from_pattern("abc.*")?; + let t2 = Term::from_pattern(".*xyz")?; -let union = term1.union(&[term2, term3]).unwrap(); + // Concatenate + let concat = t1.concat(&[t2])?; + assert_eq!(concat.to_pattern(), "abc.*xyz"); -if let Term::RegularExpression(regex) = union { - println!("{}", regex.to_string()); // (abc|de|fghi) + // Union + let union = t1.union(&[Term::from_pattern("fgh")?])?; + assert_eq!(union.to_pattern(), "(abc.*|fgh)"); + + // Intersection + let inter = Term::from_pattern("(ab|xy){2}")? + .intersection(&[Term::from_pattern(".*xy")?])?; + assert_eq!(inter.to_pattern(), "(ab|xy)xy"); + + // Difference + let diff = Term::from_pattern("a*")? + .difference(&Term::from_pattern("")?)?; + assert_eq!(diff.to_pattern(), "a+"); + + // Repetition + let rep = Term::from_pattern("abc")? + .repeat(2, Some(4))?; + assert_eq!(rep.to_pattern(), "(abc){2,4}"); + + // Analyze + assert_eq!(rep.get_length(), (Some(6), Some(12))); + assert!(!rep.is_empty()); + + // Generate examples + let samples = Term::from_pattern("(x|y){1,3}")? + .generate_strings(5)?; + println!("Some matches: {:?}", samples); + + // Equivalence & subset + let a = Term::from_pattern("a+")?; + let b = Term::from_pattern("a*")?; + assert!(!a.equivalent(&b)?); + assert!(a.subset(&b)?); + + Ok(()) } ``` -### Intersection +## Key Concepts & Limitations + +RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". +- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. +- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. +- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. +- **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). 
+- **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string.
+
+RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows some flexibility when writing regular expressions, but be aware of the unsupported features to avoid unexpected behavior.
+
+## API
+
+### Term
+
+`Term` is an enum that represents either a regular expression or an automaton. It is used when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation.
+
+#### Build
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. |
+| `from_pattern(pattern: &str)` | `Result<Term, EngineError>` | Parses and simplifies the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. |
+| `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. |
+| `new_empty()` | `Term` | Creates a term that matches the empty language. |
+| `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. |
+| `new_total()` | `Term` | Creates a term that matches all possible strings. |
+
+#### Manipulate
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `concat(&self, terms: &[Term])` | `Result<Term, EngineError>` | Computes the concatenation of the given terms. |
+| `difference(&self, other: &Term)` | `Result<Term, EngineError>` | Computes the difference between `self` and `other`. |
+| `intersection(&self, terms: &[Term])` | `Result<Term, EngineError>` | Computes the intersection of the given terms. |
+| `repeat(&self, min: u32, max_opt: Option<u32>)` | `Result<Term, EngineError>` | Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. |
+| `union(&self, terms: &[Term])` | `Result<Term, EngineError>` | Computes the union of the given terms. |
+
+#### Analyze
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `equivalent(&self, term: &Term)` | `Result<bool, EngineError>` | Returns `true` if both terms accept the same language. |
+| `generate_strings(&self, count: usize)` | `Result<Vec<String>, EngineError>` | Generates `count` strings matched by the term. |
+| `get_cardinality(&self)` | `Result<Cardinality, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). |
+| `get_length(&self)` | `(Option<u32>, Option<u32>)` | Returns the minimum and maximum length of matched strings. |
+| `is_empty(&self)` | `bool` | Checks if the term matches the empty language. |
+| `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. |
+| `is_total(&self)` | `bool` | Checks if the term matches all possible strings. |
+| `subset(&self, term: &Term)` | `Result<bool, EngineError>` | Returns `true` if all strings matched by the current term are also matched by the given term. |
+| `to_automaton(&self)` | `Result<Cow<FastAutomaton>, EngineError>` | Converts the term to a `FastAutomaton`. |
+| `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. |
+| `to_regex(&self)` | `Cow<RegularExpression>` | Converts the term to a `RegularExpression`. |
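+
+The sketch below, based on the signatures listed above (the patterns and expected results are illustrative), shows a typical `Term` round trip: combine two patterns, inspect the result, and drop down to the automaton view for matching.
+
+```rust
+use regexsolver::Term;
+use regexsolver::error::EngineError;
+
+fn main() -> Result<(), EngineError> {
+    let ident = Term::from_pattern("[a-z][a-z0-9]*")?;
+    let keyword = Term::from_pattern("(if|else|while)")?;
+
+    // Identifiers that are not keywords.
+    let non_keyword = ident.difference(&keyword)?;
+    assert!(!non_keyword.is_empty());
+    assert!(non_keyword.subset(&ident)?);
+
+    println!("pattern: {}", non_keyword.to_pattern());
+    println!("length bounds: {:?}", non_keyword.get_length());
+
+    // Switch to the automaton representation for matching.
+    let automaton = non_keyword.to_automaton()?;
+    assert!(automaton.is_match("foo"));
+    assert!(!automaton.is_match("if"));
+
+    Ok(())
+}
+```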
+
+### FastAutomaton
+
+`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression`, use the method `to_regex()`.
+
+When building or modifying an automaton you will likely use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call:
 ```rust
-use regexsolver::Term;
+Condition::from_range(&range, &spanning_set);
+```
+where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options:
 
-let term1 = Term::from_regex("(abc|de){2}").unwrap();
-let term2 = Term::from_regex("de.*").unwrap();
-let term3 = Term::from_regex(".*abc").unwrap();
+1. Merge an existing spanning set with another:
+```rust
+let new_set = SpanningSet::merge(&old_set, &other_set);
+```
 
-let intersection = term1.intersection(&[term2, term3]).unwrap();
+2. Recompute from a list of ranges:
+```rust
+let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]);
+```
 
-if let Term::RegularExpression(regex) = intersection {
-    println!("{}", regex.to_string()); // deabc
-}
+After constructing `new_set`, apply it to the automaton:
+```rust
+fast_automaton.apply_new_spanning_set(&new_set);
 ```
 
-### Difference/Subtraction
+This design allows unions, intersections, and complements of transition conditions to be computed in O(1) time, but it adds some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation).
+
+#### Build
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `accept(&mut self, state: State)` | `()` | Marks the provided state as an accepting (final) state. |
+| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | `()` | Creates a new epsilon transition between the two states. |
+| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | `()` | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. |
+| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. |
+| `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. |
+| `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. |
+| `new_from_range(range: &CharRange)` | `FastAutomaton` | Creates an automaton that matches one of the characters in the given `CharRange`. |
+| `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. |
+| `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. |
+| `remove_state(&mut self, state: State)` | `()` | Removes the state and its connected transitions; panics if it's a start state. |
+| `remove_states(&mut self, states: &IntSet<State>)` | `()` | Removes the given states and their connected transitions; panics if any is a start state. |
+| `remove_transition(&mut self, from_state: State, to_state: State)` | `()` | Removes the transition between the two provided states if it exists. |
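+
+The sketch below ties the spanning-set workflow described above to the build methods in this table (module paths follow the crate's doc examples; error handling is shortened to `expect`). It constructs, by hand, an automaton that accepts exactly two characters:
+
+```rust
+use regexsolver::CharRange;
+use regexsolver::fast_automaton::{FastAutomaton, condition::Condition, spanning_set::SpanningSet};
+
+fn main() {
+    let any = CharRange::total();
+
+    let mut automaton = FastAutomaton::new_empty();
+
+    // Every condition must be expressible in the automaton's spanning set,
+    // so compute one covering the ranges we plan to use and apply it first.
+    let spanning_set = SpanningSet::compute_spanning_set(&[any.clone()]);
+    automaton
+        .apply_new_spanning_set(&spanning_set)
+        .expect("projecting an automaton without transitions should not fail");
+
+    let start = automaton.get_start_state();
+    let s1 = automaton.new_state();
+    let s2 = automaton.new_state();
+
+    let any_cond = Condition::from_range(&any, &spanning_set)
+        .expect("`any` is covered by the spanning set computed from it");
+
+    automaton.add_transition(start, s1, &any_cond);
+    automaton.add_transition(s1, s2, &any_cond);
+    automaton.accept(s2);
+
+    assert!(automaton.is_match("ab"));
+    assert!(!automaton.is_match("abc"));
+    assert_eq!((Some(2), Some(2)), automaton.get_length());
+}
+```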
+
+#### Manipulate
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. |
+| `concat(&self, other: &FastAutomaton)` | `Result<FastAutomaton, EngineError>` | Computes the concatenation between `self` and `other`. |
+| `concat_all<'a, I: IntoIterator<Item = &'a FastAutomaton>>(automata: I)` | `Result<FastAutomaton, EngineError>` | Computes the concatenation of all automata in the given iterator. |
+| `determinize(&self)` | `Result<Cow<FastAutomaton>, EngineError>` | Determinizes the automaton and returns the result. |
+| `difference(&self, other: &FastAutomaton)` | `Result<FastAutomaton, EngineError>` | Computes the difference between `self` and `other`. |
+| `has_intersection(&self, other: &FastAutomaton)` | `Result<bool, EngineError>` | Returns `true` if the two automata have a non-empty intersection. |
+| `intersection(&self, other: &FastAutomaton)` | `Result<FastAutomaton, EngineError>` | Computes the intersection between `self` and `other`. |
+| `intersection_all<'a, I: IntoIterator<Item = &'a FastAutomaton>>(automata: I)` | `Result<FastAutomaton, EngineError>` | Computes the intersection of all automata in the given iterator. |
+| `intersection_all_par<'a, I: IntoParallelIterator<Item = &'a FastAutomaton>>(automata: I)` | `Result<FastAutomaton, EngineError>` | Computes in parallel the intersection of all automata in the given iterator. |
+| `repeat(&self, min: u32, max_opt: Option<u32>)` | `Result<FastAutomaton, EngineError>` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. |
+| `union(&self, other: &FastAutomaton)` | `Result<FastAutomaton, EngineError>` | Computes the union between `self` and `other`. |
+| `union_all<'a, I: IntoIterator<Item = &'a FastAutomaton>>(automata: I)` | `Result<FastAutomaton, EngineError>` | Computes the union of all automata in the given iterator. |
+| `union_all_par<'a, I: IntoParallelIterator<Item = &'a FastAutomaton>>(automata: I)` | `Result<FastAutomaton, EngineError>` | Computes in parallel the union of all automata in the given iterator. |
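+
+As an illustration of these operations (a sketch based on the signatures above, with illustrative patterns), the following combines two automata and uses `determinize` plus `complement` to express "contains `abc` but not `def`":
+
+```rust
+use regexsolver::error::EngineError;
+use regexsolver::regex::RegularExpression;
+
+fn main() -> Result<(), EngineError> {
+    let a = RegularExpression::new(".*abc.*")?.to_automaton()?;
+    let b = RegularExpression::new(".*def.*")?.to_automaton()?;
+
+    // Strings containing both "abc" and "def".
+    let both = a.intersection(&b)?;
+    assert!(both.is_match("xxabcyydefzz"));
+
+    // Strings containing "abc" but not "def".
+    // `complement` requires a deterministic automaton, hence the `determinize` step.
+    let mut not_b = b.determinize()?.into_owned();
+    not_b.complement()?;
+    let only_a = a.intersection(&not_b)?;
+    assert!(only_a.is_match("xxabcyy"));
+    assert!(!only_a.is_match("abcdef"));
+
+    // Convert the result back to a regular expression.
+    println!("{}", only_a.to_regex());
+
+    Ok(())
+}
+```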
+
+#### Analyze
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `as_dot(&self)` | `String` | Returns the automaton's DOT representation. |
+| `direct_states(&self, state: State)` | `impl Iterator<Item = State>` | Returns an iterator over states directly reachable from the given state in one transition. |
+| `direct_states_vec(&self, state: State)` | `Vec<State>` | Returns a vector of states directly reachable from the given state in one transition. |
+| `equivalent(&self, other: &FastAutomaton)` | `Result<bool, EngineError>` | Returns `true` if both automata accept the same language. |
+| `generate_strings(&self, count: usize)` | `Result<Vec<String>, EngineError>` | Generates `count` strings matched by the automaton. |
+| `get_accept_states(&self)` | `&IntSet<State>` | Returns a reference to the set of accept (final) states. |
+| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). |
+| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. |
+| `get_length(&self)` | `(Option<u32>, Option<u32>)` | Returns the minimum and maximum length of matched strings. |
+| `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. |
+| `get_reachable_states(&self)` | `IntSet<State>` | Returns the set of all states reachable from the start state. |
+| `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. |
+| `get_start_state(&self)` | `State` | Returns the start state. |
+| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. |
+| `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. |
+| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. |
+| `is_accepted(&self, state: State)` | `bool` | Returns `true` if the given state is one of the accept states. |
+| `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. |
+| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. |
+| `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. |
+| `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. |
+| `is_match(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. |
+| `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. |
+| `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. |
+| `print_dot(&self)` | `()` | Prints the automaton's DOT representation. |
+| `states(&self)` | `impl Iterator<Item = State>` | Returns an iterator over the automaton’s states. |
+| `states_vec(&self)` | `Vec<State>` | Returns a vector containing the automaton’s states. |
+| `subset(&self, other: &FastAutomaton)` | `Result<bool, EngineError>` | Returns `true` if all strings accepted by `self` are also accepted by `other`. |
+| `to_regex(&self)` | `RegularExpression` | Converts the automaton to a `RegularExpression`. |
+| `transitions_from(&self, state: State)` | `impl Iterator<Item = (&Condition, &State)>` | Returns an iterator over transitions from the given state. |
+| `transitions_from_vec(&self, state: State)` | `Vec<(Condition, State)>` | Returns a vector of transitions from the given state. |
+| `transitions_to_vec(&self, state: State)` | `Vec<(State, Condition)>` | Returns a vector of transitions to the given state. |
+
+### RegularExpression
+
+`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all set operations are available; for more advanced operations such as intersection, difference, and complement, convert to a `FastAutomaton` with the method `to_automaton()`.
+
+#### Build/Manipulate
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. |
+| `concat_all<'a, I: IntoIterator<Item = &'a RegularExpression>>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the concatenation of all expressions in `patterns`. |
+| `new(pattern: &str)` | `Result<RegularExpression, EngineError>` | Parses and simplifies the provided pattern and returns the resulting `RegularExpression`. |
+| `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. |
+| `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. |
+| `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. |
+| `parse(pattern: &str, simplify: bool)` | `Result<RegularExpression, EngineError>` | Parses the provided pattern and returns the resulting `RegularExpression`. If `simplify` is `true`, the expression is simplified during parsing. |
+| `repeat(&self, min: u32, max_opt: Option<u32>)` | `RegularExpression` | Computes the repetition of the regular expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. |
+| `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. |
+| `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. |
+| `union_all<'a, I: IntoIterator<Item = &'a RegularExpression>>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the union of all expressions in `patterns`. |
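+
+For example (again a sketch based on the signatures above), simple combinations can stay in the regex representation; an automaton is only needed for operations like intersection:
+
+```rust
+use regexsolver::error::EngineError;
+use regexsolver::regex::RegularExpression;
+
+fn main() -> Result<(), EngineError> {
+    // `new` parses and simplifies; `parse` lets you skip simplification.
+    let simplified = RegularExpression::new("(a|a)b{1}")?;
+    let raw = RegularExpression::parse("(a|a)b{1}", false)?;
+    println!("{simplified} vs {raw}");
+
+    // Combine patterns without leaving the regex representation.
+    let combined = RegularExpression::new("abc")?
+        .union(&RegularExpression::new("def")?)
+        .repeat(1, Some(2));
+    println!("{combined}"); // e.g. (abc|def){1,2}
+
+    // Intersection, difference, and complement require an automaton.
+    let automaton = combined.to_automaton()?;
+    assert!(automaton.is_match("abcdef"));
+
+    Ok(())
+}
+```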
+
+#### Analyze
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `evaluate_complexity(&self)` | `f64` | Returns a heuristic score for the readability of the pattern. |
+| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the regular expression (i.e., the number of possible matched strings). |
+| `get_length(&self)` | `(Option<u32>, Option<u32>)` | Returns the minimum and maximum length of possible matched strings. |
+| `is_empty(&self)` | `bool` | Checks if the regular expression matches the empty language. |
+| `is_empty_string(&self)` | `bool` | Checks if the regular expression only matches the empty string `""`. |
+| `is_total(&self)` | `bool` | Checks if the regular expression matches all possible strings. |
+| `to_automaton(&self)` | `Result<FastAutomaton, EngineError>` | Converts the regular expression to an equivalent `FastAutomaton`. |
+
+## Bound Execution
+
+Use a thread-local `ExecutionProfile` to cap execution time or the number of automaton states; when a limit is exceeded, the operation returns the corresponding `EngineError`.
+
+### Time-Bounded Execution
+
 ```rust
-use regexsolver::Term;
+use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError};
 
-let term1 = Term::from_regex("(abc|de)").unwrap();
-let term2 = Term::from_regex("de").unwrap();
+let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap();
 
-let subtraction = term1.subtraction(&term2).unwrap();
+let execution_profile = ExecutionProfileBuilder::new()
+    .execution_timeout(5) // limit in milliseconds
+    .build();
 
-if let Term::RegularExpression(regex) = subtraction {
-    println!("{}", regex.to_string()); // abc
-}
+// Run the operation under the configured limit.
+execution_profile.run(|| {
+    assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err());
+});
 ```
+
+### State-Limited Execution
+
+```rust
+use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError};
+
+let term1 = Term::from_pattern(".*abcdef.*").unwrap();
+let term2 = Term::from_pattern(".*defabc.*").unwrap();
+
+let execution_profile = ExecutionProfileBuilder::new()
+    .max_number_of_states(5) // limit on the number of automaton states
+    .build();
+
+// Run the operation under the configured limit.
+execution_profile.run(|| {
+    assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err());
+});
+```
+
+## Cross-Language Support
+
+If you want to use this library with other programming languages, we provide a wide range of wrappers:
+- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java)
+- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js)
+- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python)
+
+For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html).
+
+## License
+
+This project is licensed under the MIT License.
diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index f2f9fdc..71898ec 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -1,4 +1,3 @@ -use ahash::AHashSet; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use regexsolver::{fast_automaton::FastAutomaton, regex::RegularExpression}; @@ -7,18 +6,18 @@ fn parse_regex(regex: &str) -> RegularExpression { } fn to_regex(automaton: &FastAutomaton) -> RegularExpression { - automaton.to_regex().unwrap() + automaton.to_regex() } fn determinize(automaton: &FastAutomaton) -> FastAutomaton { - automaton.determinize().unwrap() + automaton.determinize().unwrap().into_owned() } fn intersection(automaton_1: &FastAutomaton, automaton_2: &FastAutomaton) -> FastAutomaton { automaton_1.intersection(automaton_2).unwrap() } -fn generate_strings(automaton: &FastAutomaton) -> AHashSet { +fn generate_strings(automaton: &FastAutomaton) -> Vec { automaton.generate_strings(2000).unwrap() } diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 08131e0..54bdcde 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -1,10 +1,10 @@ -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; /// Represent a number. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value", rename_all = "camelCase"))] pub enum Cardinality { /// An infinite number. Infinite, diff --git a/src/error/mod.rs b/src/error/mod.rs index 6447ebe..29052b4 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,6 +1,7 @@ use std::fmt::{self}; -use crate::tokenizer::token::TokenError; +#[cfg(feature = "serializable")] +use crate::fast_automaton::serializer::tokenizer::token::TokenError; /// An error thrown by the engine. #[derive(Debug, PartialEq, Eq)] @@ -9,18 +10,13 @@ pub enum EngineError { InvalidCharacterInRegex, /// The operation took too much time. OperationTimeOutError, - /// The given automaton should be deterministic. - AutomatonShouldBeDeterministic, /// The automaton has too many states. AutomatonHasTooManyStates, /// The regular expression can not be parsed. RegexSyntaxError(String), - /// Too many terms are used in the operation. - TooMuchTerms(usize, usize), /// The provided range can not be built from the spanning set. ConditionInvalidRange, - /// The provided index is out of bound of the condition. - ConditionIndexOutOfBound, + #[cfg(feature = "serializable")] /// There is an error with one of the token. 
TokenError(TokenError), } @@ -30,33 +26,18 @@ impl fmt::Display for EngineError { match self { EngineError::InvalidCharacterInRegex => write!(f, "Invalid character used in regex."), EngineError::OperationTimeOutError => write!(f, "The operation took too much time."), - EngineError::AutomatonShouldBeDeterministic => write!(f, "The given automaton should be deterministic."), - EngineError::AutomatonHasTooManyStates => write!(f, "The automaton has too many states."), + EngineError::AutomatonHasTooManyStates => { + write!(f, "The automaton has too many states.") + } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), - EngineError::TooMuchTerms(max, got) => write!(f, "Too many terms are used in this operation, the maximum allowed for your plan is {max} and you used {got}."), - EngineError::TokenError(err) => write!(f, "{err}."), - EngineError::ConditionInvalidRange => write!(f, "The provided range can not be built from the spanning set."), - EngineError::ConditionIndexOutOfBound => write!(f, "The provided index is out of bound of the condition."), + #[cfg(feature = "serializable")] + EngineError::TokenError(err) => write!(f, "{err}."), + EngineError::ConditionInvalidRange => write!( + f, + "The provided range can not be built from the spanning set." + ), } } } impl std::error::Error for EngineError {} - -impl EngineError { - /// Determine if the error is a server error. - /// A server error should not be shown to the end user. - pub fn is_server_error(&self) -> bool { - match self { - EngineError::InvalidCharacterInRegex => false, - EngineError::OperationTimeOutError => false, - EngineError::AutomatonShouldBeDeterministic => true, - EngineError::AutomatonHasTooManyStates => false, - EngineError::RegexSyntaxError(_) => false, - EngineError::TooMuchTerms(_, _) => false, - EngineError::TokenError(_) => false, - EngineError::ConditionInvalidRange => true, - EngineError::ConditionIndexOutOfBound => true, - } - } -} diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 2ae8e2b..b845261 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -1,105 +1,76 @@ -use std::{cell::RefCell, time::SystemTime}; +use std::{ + cell::RefCell, + time::{Duration, Instant}, +}; use crate::error::EngineError; /// Hold settings about limitations and constraints of operations execution within the engine. /// -/// To apply the settings on the current thread you need to call the following function: -/// ``` -/// use regexsolver::execution_profile::{ExecutionProfile, ThreadLocalParams}; -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// -/// // Store the settings on the current thread. 
-/// ThreadLocalParams::init_profile(&execution_profile); -/// ``` -/// /// # Examples: /// /// ## Limiting the number of states /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; -/// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); -/// -/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); -/// ``` -/// -/// ## Limiting the number of terms -/// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// let term3 = Term::from_regex(".*hij.*").unwrap(); +/// let term1 = Term::from_pattern(".*abcdef.*").unwrap(); +/// let term2 = Term::from_pattern(".*defabc.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 2, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .max_number_of_states(5) +/// .build(); /// -/// assert_eq!(EngineError::TooMuchTerms(2,3), term1.intersection(&[term2, term3]).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +/// }); /// ``` /// /// ## Limiting the execution time /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// use std::time::SystemTime; /// -/// let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); +/// let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: Some(SystemTime::now()), -/// execution_timeout: 1, -/// max_number_of_terms: 50, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .execution_timeout(5) // 5ms +/// .build(); /// -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(100).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); +/// }); /// ``` +#[derive(Clone, Debug)] pub struct ExecutionProfile { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. - pub max_number_of_states: usize, - /// Timestamp of when the execution has started, if this value is not set the operations will never timeout. - pub start_execution_time: Option, + max_number_of_states: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. 
- pub execution_timeout: u128, - /// The maximum number of terms that an operation can have. - pub max_number_of_terms: usize, + execution_timeout: Option, + /// The time after when a [`EngineError::OperationTimeOutError`] should be thrown. + execution_deadline: Option, +} + +impl PartialEq for ExecutionProfile { + fn eq(&self, other: &ExecutionProfile) -> bool { + self.max_number_of_states == other.max_number_of_states + && self.execution_timeout == other.execution_timeout + } } impl ExecutionProfile { + /// Retrieve the current thread-local execution profile. + pub fn get() -> ExecutionProfile { + ThreadLocalParams::get_execution_profile() + } + /// Assert that `execution_timeout` is not exceeded. /// - /// Return empty if `execution_timeout` is not exceeded or if `start_execution_time` is not set. + /// Return empty if `execution_timeout` is not exceeded. /// /// Return [`EngineError::OperationTimeOutError`] otherwise. - pub fn assert_not_timed_out(&self) -> Result<(), EngineError> { - if let Some(start) = self.start_execution_time { - let run_duration = SystemTime::now() - .duration_since(start) - .expect("Time went backwards") - .as_millis(); - - if run_duration > self.execution_timeout { + pub(crate) fn assert_not_timed_out(&self) -> Result<(), EngineError> { + if let Some(execution_deadline) = self.execution_deadline { + if Instant::now() > execution_deadline { Err(EngineError::OperationTimeOutError) } else { Ok(()) @@ -108,186 +79,281 @@ impl ExecutionProfile { Ok(()) } } + + /// Assert that `max_number_of_states` is not exceeded. + /// + /// Return empty if `max_number_of_states` is not exceeded. + /// + /// Return [`EngineError::AutomatonHasTooManyStates`] otherwise. + pub(crate) fn assert_max_number_of_states( + &self, + number_of_states: usize, + ) -> Result<(), EngineError> { + if let Some(max_number_of_states) = self.max_number_of_states { + if number_of_states >= max_number_of_states { + return Err(EngineError::AutomatonHasTooManyStates); + } + } + Ok(()) + } + + pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + pub fn with_max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + pub fn set(&self) -> &Self { + self + } + + /// Run the given closure with this profile at thread level, setting its start time to now. + pub fn run(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + let mut execution_profile = self.clone(); + if let Some(execution_timeout) = execution_profile.execution_timeout { + execution_profile.execution_deadline = Some(Instant::now() + Duration::from_millis(execution_timeout)); + } + + ThreadLocalParams::set_execution_profile(&execution_profile); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } + + /// Like [`ExecutionProfile::run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. + pub fn apply(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + ThreadLocalParams::set_execution_profile(self); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } } -/// Hold [`ExecutionProfile`] on the current thread. 
-/// -/// The default [`ExecutionProfile`] is the following: -/// ``` -/// use regexsolver::execution_profile::ExecutionProfile; -/// -/// ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1500, -/// max_number_of_terms: 50, -/// }; -/// ``` -pub struct ThreadLocalParams; +pub struct ExecutionProfileBuilder { + /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. + max_number_of_states: Option, + /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. + execution_timeout: Option, +} +impl Default for ExecutionProfileBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ExecutionProfileBuilder { + pub fn new() -> Self { + Self { + max_number_of_states: None, + execution_timeout: None, + } + } + + pub fn execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + pub fn max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + pub fn build(self) -> ExecutionProfile { + ExecutionProfile { + max_number_of_states: self.max_number_of_states, + execution_timeout: self.execution_timeout, + execution_deadline: None, + } + } +} + +struct ThreadLocalParams; impl ThreadLocalParams { thread_local! { - static MAX_NUMBER_OF_STATES: RefCell = const { RefCell::new(8192) }; - static START_EXECUTION_TIME: RefCell> = const { RefCell::new(None) }; - static EXECUTION_TIMEOUT: RefCell = const { RefCell::new(1500) }; - static MAX_NUMBER_OF_TERMS: RefCell = const { RefCell::new(50) }; + static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; + static EXECUTION_DEADLINE: RefCell> = const { RefCell::new(None) }; + static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; } /// Store on the current thread [`ExecutionProfile`]. - pub fn init_profile(profile: &ExecutionProfile) { + fn set_execution_profile(profile: &ExecutionProfile) { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| { *cell.borrow_mut() = profile.max_number_of_states; }); - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| { - *cell.borrow_mut() = profile.start_execution_time; + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| { + *cell.borrow_mut() = profile.execution_deadline; }); ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { *cell.borrow_mut() = profile.execution_timeout; }); - - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| { - *cell.borrow_mut() = profile.max_number_of_terms; - }); } - pub fn get_max_number_of_states() -> usize { + fn get_max_number_of_states() -> Option { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| *cell.borrow()) } - pub fn get_start_execution_time() -> Option { - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| *cell.borrow()) + fn get_execution_deadline() -> Option { + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| *cell.borrow()) } - pub fn get_execution_timeout() -> u128 { + fn get_execution_timeout() -> Option { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } - pub fn get_max_number_of_terms() -> usize { - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| *cell.borrow()) - } - /// Return the [`ExecutionProfile`] stored on the current thread. 
- pub fn get_execution_profile() -> ExecutionProfile { + fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), - start_execution_time: Self::get_start_execution_time(), + execution_deadline: Self::get_execution_deadline(), execution_timeout: Self::get_execution_timeout(), - max_number_of_terms: Self::get_max_number_of_terms(), } } } #[cfg(test)] mod tests { - use crate::{regex::RegularExpression, Term}; + use crate::{Term, regex::RegularExpression}; use super::*; + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } + + #[test] + fn test_execution_get() -> Result<(), String> { + let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(1000) + .max_number_of_states(8192) + .build(); + + execution_profile.run(|| { + assert_eq!(execution_profile, ExecutionProfile::get()); + }); + + Ok(()) + } + #[test] fn test_execution() -> Result<(), String> { - let execution_profile = ExecutionProfile { - max_number_of_states: 1, - start_execution_time: None, - execution_timeout: 1000, - max_number_of_terms: 10, - }; - ThreadLocalParams::init_profile(&execution_profile); - - let regex = RegularExpression::new("test").unwrap(); - - assert!(regex.to_automaton().is_err()); - assert_eq!( - EngineError::AutomatonHasTooManyStates, - regex.to_automaton().unwrap_err() - ); + ExecutionProfileBuilder::new() + .max_number_of_states(1) + .build() + .run(|| { + let regex = RegularExpression::new("test").unwrap(); + + assert!(regex.to_automaton().is_err()); + assert_eq!( + EngineError::AutomatonHasTooManyStates, + regex.to_automaton().unwrap_err() + ); + }); Ok(()) } #[test] fn test_execution_timeout_generate_strings() -> Result<(), String> { - let term = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + + let execution_timeout_in_ms = 10; + let start_time = Instant::now(); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term.generate_strings(100).unwrap_err() + ); + + let run_duration = Instant::now() + .duration_since(start_time) + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= (execution_timeout_in_ms + 50) as u128); + }); - let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term.generate_strings(100).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); Ok(()) } #[test] fn test_execution_timeout_difference() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = 
Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + + let execution_timeout_in_ms = 50; + let start_time = Instant::now(); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.difference(&term2).unwrap_err() + ); + + let run_duration = Instant::now() + .duration_since(start_time) + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= (execution_timeout_in_ms + 25) as u128); + }); - let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.difference(&term2).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); Ok(()) } - #[test] + /*#[test] fn test_execution_timeout_intersection() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 100; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.intersection(&[term2]).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.intersection(&[term2]).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + 
println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 100); + }); + Ok(()) - } + }*/ } diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 04ea226..ec5f514 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -3,18 +3,18 @@ use std::hash::BuildHasherDefault; use super::*; impl FastAutomaton { - pub fn get_cardinality(&self) -> Option> { + /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). + pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { - return Some(Cardinality::Integer(0)); + return Cardinality::Integer(0); } else if self.cyclic || self.is_total() { - return Some(Cardinality::Infinite); - } else if !self.deterministic { - return None; + return Cardinality::Infinite; } + assert!(self.is_deterministic(), "The automaton should be deterministic."); let topologically_sorted_states = self.topological_sorted_states(); if topologically_sorted_states.is_none() { - return Some(Cardinality::Infinite); + return Cardinality::Infinite; } let topologically_sorted_states = topologically_sorted_states.unwrap(); @@ -40,7 +40,7 @@ impl FastAutomaton { } } - return Some(Cardinality::BigInteger); + return Cardinality::BigInteger; } } } @@ -52,10 +52,10 @@ impl FastAutomaton { temp_cardinality = add; continue; } - return Some(Cardinality::BigInteger); + return Cardinality::BigInteger; } } - Some(Cardinality::Integer(temp_cardinality)) + Cardinality::Integer(temp_cardinality) } fn topological_sorted_states(&self) -> Option> { @@ -65,9 +65,9 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.transitions_vec() { - in_degree.entry(*from_state).or_insert(0); - for to_state in self.transitions_from_state_iter(from_state) { + for &from_state in &self.states_vec() { + in_degree.entry(from_state).or_insert(0); + for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.transitions_from_state_iter(&from_state) { + for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index d81294c..3f70711 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -3,21 +3,22 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn is_equivalent_of(&self, other: &FastAutomaton) -> Result { + /// Returns `true` if both automata accept the same language. + pub fn equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); } else if self == other { return Ok(true); } - let mut other_complement = other.determinize()?; + let mut other_complement = other.determinize()?.into_owned(); other_complement.complement()?; if self.has_intersection(&other_complement)? { return Ok(false); } - let mut self_complement = self.determinize()?; + let mut self_complement = self.determinize()?.into_owned(); self_complement.complement()?; Ok(!self_complement.has_intersection(other)?) 
@@ -43,26 +44,26 @@ mod tests { false, ); - let regex_1 = RegularExpression::new("cd").unwrap(); - let regex_2 = RegularExpression::new("cd").unwrap(); + let regex_1 = RegularExpression::parse("cd", false).unwrap(); + let regex_2 = RegularExpression::parse("cd", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, true); - let regex_1 = RegularExpression::new("test.*other").unwrap(); - let regex_2 = RegularExpression::new("test.*othew").unwrap(); + let regex_1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, false); - let regex_1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex_2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex_1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, false); - let regex_1 = RegularExpression::new("[0]").unwrap(); - let regex_2 = RegularExpression::new("[01]").unwrap(); + let regex_1 = RegularExpression::parse("[0]", false).unwrap(); + let regex_2 = RegularExpression::parse("[01]", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, false); - let regex_1 = RegularExpression::new("(b+a+)*").unwrap(); - let regex_2 = RegularExpression::new("(b[a-b]*a)?").unwrap(); + let regex_1 = RegularExpression::parse("(b+a+)*", false).unwrap(); + let regex_2 = RegularExpression::parse("(b[a-b]*a)?", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, true); Ok(()) @@ -71,14 +72,14 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_equivalent_of(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_equivalent_of(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.equivalent(&automaton_2).unwrap()); assert_eq!( expected, - automaton_1.is_equivalent_of(&automaton_2).unwrap() + automaton_1.equivalent(&automaton_2).unwrap() ); } } diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 70eccbd..c753908 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -1,6 +1,7 @@ use super::*; impl FastAutomaton { + /// Returns the minimum and maximum length of matched strings. pub fn get_length(&self) -> (Option, Option) { if self.is_empty() { return (None, None); @@ -26,7 +27,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.transitions_from_state_iter(&state) { + for to_state in self.direct_states(state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -53,7 +54,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.transitions_from_state_iter(&state) { + for to_state in self.direct_states(state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 56f0884..700a2d9 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,11 +10,13 @@ mod length; mod subset; impl FastAutomaton { + /// Checks if the automaton matches the empty language. 
#[inline] pub fn is_empty(&self) -> bool { self.accept_states.is_empty() } + /// Checks if the automaton matches all possible strings. #[inline] pub fn is_total(&self) -> bool { if self.accept_states.contains(&self.start_state) { @@ -25,12 +27,21 @@ impl FastAutomaton { false } - pub fn get_reacheable_states(&self) -> IntSet { + /// Checks if the automaton only matches the empty string `""`. + #[inline] + pub fn is_empty_string(&self) -> bool { + self.accept_states.len() == 1 + && self.accept_states.contains(&self.start_state) + && self.in_degree(self.start_state) == 0 + } + + /// Returns the set of all states reachable from the start state. + pub fn get_reachable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); - for from_state in self.transitions_iter() { - for (to_state, transition) in self.transitions_from_state_enumerate_iter(&from_state) { - if transition.is_empty() { + for from_state in self.states() { + for (condition, to_state) in self.transitions_from(from_state) { + if condition.is_empty() { continue; } match states_map.entry(*to_state) { @@ -61,9 +72,43 @@ impl FastAutomaton { live } - pub fn get_ranges(&self) -> Result, EngineError> { - self.spanning_set.get_spanning_ranges().map(|range| { - Condition::from_range(range, &self.spanning_set) - }).collect() + pub(crate) fn get_spanning_bases(&self) -> Result, EngineError> { + self.spanning_set + .get_spanning_ranges() + .map(|range| Condition::from_range(range, &self.spanning_set)) + .collect() + } +} + +#[cfg(test)] +mod tests { + + use crate::fast_automaton::FastAutomaton; + + #[test] + fn test_empty() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty()); + assert!(!FastAutomaton::new_empty_string().is_empty()); + assert!(FastAutomaton::new_empty().is_empty()); + + Ok(()) + } + + #[test] + fn test_empty_string() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty_string()); + assert!(FastAutomaton::new_empty_string().is_empty_string()); + assert!(!FastAutomaton::new_empty().is_empty_string()); + + Ok(()) + } + + #[test] + fn test_total() -> Result<(), String> { + assert!(FastAutomaton::new_total().is_total()); + assert!(!FastAutomaton::new_empty_string().is_total()); + assert!(!FastAutomaton::new_empty().is_total()); + + Ok(()) } } diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 5705fc2..e4ca7d6 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -3,14 +3,15 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { + /// Returns `true` if all strings accepted by `self` are also accepted by `other`. + pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); } else if other.is_empty() || self.is_total() { return Ok(false); } - let mut other = other.determinize()?; + let mut other = other.determinize()?.into_owned(); other.complement()?; Ok(!self.has_intersection(&other)?) 
@@ -38,33 +39,33 @@ mod tests { true, ); - let regex1 = RegularExpression::new("test.*other").unwrap(); - let regex2 = RegularExpression::new("test.*othew").unwrap(); + let regex1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_subset(&regex1, &regex2, false, false); - let regex1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_subset(&regex1, &regex2, false, true); - let regex1 = RegularExpression::new("(abc|def)").unwrap(); - let regex2 = RegularExpression::new("(abc|def|xyz)").unwrap(); + let regex1 = RegularExpression::parse("(abc|def)", false).unwrap(); + let regex2 = RegularExpression::parse("(abc|def|xyz)", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("[0]").unwrap(); - let regex2 = RegularExpression::new("[01]").unwrap(); + let regex1 = RegularExpression::parse("[0]", false).unwrap(); + let regex2 = RegularExpression::parse("[01]", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("a.*b.*c.*").unwrap(); - let regex2 = RegularExpression::new("a.*b.*").unwrap(); + let regex1 = RegularExpression::parse("a.*b.*c.*", false).unwrap(); + let regex2 = RegularExpression::parse("a.*b.*", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("1..").unwrap(); - let regex2 = RegularExpression::new("...").unwrap(); + let regex1 = RegularExpression::parse("1..", false).unwrap(); + let regex2 = RegularExpression::parse("...", false).unwrap(); assert_subset(&regex1, &regex2, true, false); @@ -79,18 +80,18 @@ mod tests { ) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_subset_of(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.subset(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_subset_of(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.subset(&automaton_2).unwrap()); assert_eq!( expected_1_2, - automaton_1.is_subset_of(&automaton_2).unwrap() + automaton_1.subset(&automaton_2).unwrap() ); assert_eq!( expected_2_1, - automaton_2.is_subset_of(&automaton_1).unwrap() + automaton_2.subset(&automaton_1).unwrap() ); } } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index b6cf50b..8b8f844 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -5,6 +5,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Creates an automaton that matches the empty language. #[inline] pub fn new_empty() -> Self { Self { @@ -19,6 +20,7 @@ impl FastAutomaton { } } + /// Creates an automaton that only matches the empty string `""`. #[inline] pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); @@ -26,73 +28,33 @@ impl FastAutomaton { automaton } + /// Creates an automaton that matches all possible strings.
#[inline] pub fn new_total() -> Self { let mut automaton: FastAutomaton = Self::new_empty(); automaton.spanning_set = SpanningSet::new_total(); automaton.accept(automaton.start_state); - automaton.add_transition_to(0, 0, &Condition::total(&automaton.spanning_set)); + automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); automaton } - #[inline] - pub fn make_empty(&mut self) { - self.apply_model(&Self::new_empty()) - } - - #[inline] - pub fn make_total(&mut self) { - self.apply_model(&Self::new_total()) - } - - pub fn make_from_range(range: &Range) -> Result { + /// Creates an automaton that matches one of the characters in the given [`CharRange`]. + pub fn new_from_range(range: &CharRange) -> Self { let mut automaton = Self::new_empty(); if range.is_empty() { - return Ok(automaton); + return automaton; } let new_state = automaton.new_state(); let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); - let condition = Condition::from_range(range, &spanning_set)?; + let condition = Condition::from_range(range, &spanning_set).expect("The spanning set should be valid"); automaton.spanning_set = spanning_set; - automaton.add_transition_to(0, new_state, &condition); + automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); - Ok(automaton) - } - - pub fn apply_new_spanning_set( - &mut self, - new_spanning_set: &SpanningSet, - ) -> Result<(), EngineError> { - if new_spanning_set == &self.spanning_set { - return Ok(()); - } - let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.transitions_vec() { - for to_state in self.transitions_from_state(from_state) { - match self.transitions[*from_state].entry(to_state) { - Entry::Occupied(mut o) => { - o.insert(condition_converter.convert(o.get())?); - } - Entry::Vacant(_) => {} - }; - } - } - self.spanning_set = new_spanning_set.clone(); - Ok(()) - } - - #[inline] - pub fn apply_model(&mut self, model: &FastAutomaton) { - self.transitions = model.transitions.clone(); - self.start_state = model.start_state; - self.accept_states = model.accept_states.clone(); - self.removed_states = model.removed_states.clone(); - self.spanning_set = model.spanning_set.clone(); - self.deterministic = model.deterministic; - self.cyclic = model.cyclic; + automaton } + /// Creates a new state and returns its identifier. #[inline] pub fn new_state(&mut self) -> State { if let Some(new_state) = self.removed_states.clone().iter().next() { @@ -104,13 +66,52 @@ impl FastAutomaton { } } + /// Marks the provided state as an accepting (final) state. #[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); self.accept_states.insert(state); } - pub fn add_transition_to(&mut self, from_state: State, to_state: State, new_cond: &Condition) { + /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. + /// + /// This method accepts a [`Condition`] rather than a raw character set. To build a [`Condition`], call: + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet}; + /// # let range = CharRange::total(); + /// # let spanning_set = SpanningSet::new_total(); + /// Condition::from_range(&range, &spanning_set); + /// ``` + /// where `spanning_set` is the automaton's current [`SpanningSet`]. The [`CharRange`] you pass must be fully covered by that spanning set. 
If it isn't, you have two options: + /// + /// 1. Merge an existing spanning set with another: + /// ```rust + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let old_set = SpanningSet::new_total(); + /// # let other_set = SpanningSet::new_total(); + /// let new_set = SpanningSet::merge(&old_set, &other_set); + /// ``` + /// + /// 2. Recompute from a list of ranges: + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let range_set1 = CharRange::total(); + /// # let range_set2 = CharRange::total(); + /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2]); + /// ``` + /// + /// After constructing `new_set`, apply it to the automaton: + /// ```rust + /// # use regexsolver::fast_automaton::{FastAutomaton, spanning_set::SpanningSet}; + /// # let mut fast_automaton = FastAutomaton::new_total(); + /// # let new_set = SpanningSet::new_total(); + /// fast_automaton.apply_new_spanning_set(&new_set); + /// ``` + /// + /// This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). + pub fn add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition) { self.assert_state_exists(from_state); if from_state != to_state { self.assert_state_exists(to_state); @@ -121,7 +122,7 @@ impl FastAutomaton { if self.deterministic { let mut deterministic = true; - for (state, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, state) in self.transitions_from(from_state) { if state == &to_state { continue; } @@ -147,7 +148,8 @@ impl FastAutomaton { }; } - pub fn add_epsilon(&mut self, from_state: State, to_state: State) { + /// Creates a new epsilon transition between the two states. + pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; } @@ -157,12 +159,15 @@ impl FastAutomaton { self.accept_states.insert(from_state); } - let transitions_to: Vec<_> = self.transitions_from_state_into_iter(&to_state).collect(); + let transitions_to: Vec<_> = self + .transitions_from(to_state) + .map(|(cond, to_state)| (cond.clone(), *to_state)) + .collect(); - for (state, cond) in transitions_to { + for (cond, state) in transitions_to { if self.deterministic { let mut deterministic = true; - for (s, c) in self.transitions_from_state_enumerate_iter(&from_state) { + for (c, s) in self.transitions_from(from_state) { if state == *s { continue; } @@ -188,13 +193,25 @@ impl FastAutomaton { } } + /// Removes the transition between the two provided states if it exists. + pub fn remove_transition(&mut self, from_state: State, to_state: State) { + self.assert_state_exists(from_state); + if from_state != to_state { + self.assert_state_exists(to_state); + } + + self.transitions_in + .entry(to_state) + .or_default() + .remove(&from_state); + self.transitions[from_state].remove(&to_state); + } + + /// Removes the state and its connected transitions; panics if it's a start state. 
pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { - panic!( - "Can not remove the state {}, it is still used as start state.", - state - ); + panic!("Can not remove the state {state}, it is still used as start state."); } self.accept_states.remove(&state); self.transitions_in.remove(&state); @@ -220,6 +237,7 @@ impl FastAutomaton { } } + /// Removes the given states and their connected transitions; panics if any is a start state. pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); @@ -227,10 +245,7 @@ impl FastAutomaton { for &state in states { if self.start_state == state { - panic!( - "Can not remove the state {}, it is still used as start state.", - state - ); + panic!("Can not remove the state {state}, it is still used as start state."); } if self.transitions.len() - 1 == state { self.transitions.remove(state); @@ -261,6 +276,51 @@ impl FastAutomaton { } } } + + /// Applies the provided spanning set and projects all existing conditions onto it. + pub fn apply_new_spanning_set( + &mut self, + new_spanning_set: &SpanningSet, + ) -> Result<(), EngineError> { + if new_spanning_set == &self.spanning_set { + return Ok(()); + } + let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; + for &from_state in &self.states_vec() { + for to_state in self.direct_states_vec(from_state) { + match self.transitions[from_state].entry(to_state) { + Entry::Occupied(mut o) => { + o.insert(condition_converter.convert(o.get())?); + } + Entry::Vacant(_) => {} + }; + } + } + self.spanning_set = new_spanning_set.clone(); + Ok(()) + } + + #[inline] + pub(crate) fn make_empty(&mut self) { + self.apply_model(&Self::new_empty()) + } + + #[inline] + pub(crate) fn make_total(&mut self) { + self.apply_model(&Self::new_total()) + } + + #[inline] + pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { + self.transitions = model.transitions.clone(); + self.transitions_in = model.transitions_in.clone(); + self.start_state = model.start_state; + self.accept_states = model.accept_states.clone(); + self.removed_states = model.removed_states.clone(); + self.spanning_set = model.spanning_set.clone(); + self.deterministic = model.deterministic; + self.cyclic = model.cyclic; + } } #[cfg(test)] @@ -278,10 +338,10 @@ mod tests { } fn assert_regex_build_deterministic_automaton(regex: &str, deterministic: bool) { - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - assert_eq!(deterministic, automaton.is_determinitic()); + assert_eq!(deterministic, automaton.is_deterministic()); } } diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 89bb123..9fabd11 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -59,14 +59,10 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { pub fn convert(&self, condition: &Condition) -> Result { let mut new_condition = Condition::empty(self.to_spanning_set); for (from_index, to_indexes) in self.equivalence_map.iter().enumerate() { - if let Some(has) = condition.0.get(from_index) { - if has && !to_indexes.is_empty() { - to_indexes.iter().for_each(|&to_index| { - new_condition.0.set(to_index, true); - }); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if condition.0.get(from_index) && !to_indexes.is_empty() { + to_indexes.iter().for_each(|&to_index| 
{ + new_condition.0.set(to_index, true); + }); } } @@ -86,17 +82,16 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { #[cfg(test)] mod tests { + use crate::CharRange; use regex_charclass::{char::Char, irange::range::AnyRange}; - use crate::Range; - use super::*; fn get_from_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -104,11 +99,11 @@ mod tests { fn get_to_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -127,7 +122,7 @@ mod tests { let total = Condition::total(&from_spanning_set); assert!(converter.convert(&total).unwrap().is_total()); - let range = Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')); + let range = CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -138,7 +133,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); + let range = CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -149,7 +144,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_ranges(&[ + let range = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), AnyRange::from(Char::new('\u{9}')..=Char::new('\u{9}')), ]); diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index bbf4376..9c85a43 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -7,8 +7,8 @@ pub struct FastBitVec { impl std::fmt::Display for FastBitVec { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { for i in 0..self.n { - let bit = if self.get(i).unwrap() { 1 } else { 0 }; - write!(f, "{}", bit)?; + let bit = if self.get(i) { 1 } else { 0 }; + write!(f, "{bit}")?; } Ok(()) } @@ -48,13 +48,11 @@ impl FastBitVec { } #[inline] - pub fn get(&self, i: usize) -> Option { - if i >= self.n { - return None; - } + pub fn get(&self, i: usize) -> bool { + assert!(i < self.n, "The provided bit index is out of bound."); let w = i / 64; let b = i % 64; - self.bits.get(w).map(|&block| (block & (1 << b)) != 0) + (self.bits[w] & (1 << b)) != 0 } #[inline] @@ -124,10 +122,10 @@ impl FastBitVec { } pub fn get_bits(&self) -> Vec { - let mut 
hot_bits = Vec::with_capacity(self.n); + let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { - hot_bits.push(self.get(i).unwrap()); + bits.push(self.get(i)); } - hot_bits + bits } } diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index da9c2b8..122ccc9 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -1,10 +1,9 @@ use std::hash::Hash; -use crate::Range; use fast_bit_vec::FastBitVec; -use regex_charclass::{char::Char, CharacterClass}; +use regex_charclass::{CharacterClass, char::Char}; -use crate::error::EngineError; +use crate::{CharRange, error::EngineError}; use super::spanning_set::SpanningSet; pub mod converter; @@ -43,7 +42,7 @@ impl Condition { )) } - pub fn from_range(range: &Range, spanning_set: &SpanningSet) -> Result { + pub fn from_range(range: &CharRange, spanning_set: &SpanningSet) -> Result { if range.is_empty() { return Ok(Self::empty(spanning_set)); } else if range.is_total() { @@ -69,20 +68,16 @@ impl Condition { Ok(cond) } - pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { - let mut range = Range::empty(); + pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { + let mut range = CharRange::empty(); for (i, base) in spanning_set .get_spanning_ranges_with_rest() .iter() .enumerate() { - if let Some(has) = self.0.get(i) { - if has { - range = range.union(base); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if self.0.get(i) { + range = range.union(base); } } @@ -151,7 +146,8 @@ impl Condition { Ok(self.to_range(spanning_set)?.get_cardinality()) } - pub fn get_bits(&self) -> Vec { + #[inline] + pub fn get_binary_representation(&self) -> Vec { self.0.get_bits() } } @@ -165,25 +161,25 @@ mod tests { fn get_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) } - fn get_test_cases_range() -> Vec { + fn get_test_cases_range() -> Vec { vec![ - Range::empty(), - Range::total(), - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_ranges(&[ + CharRange::empty(), + CharRange::total(), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{2}')), AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), ]), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ] } @@ -193,22 +189,28 @@ mod tests { let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); - assert_eq!(vec![false, false, false, false], empty.get_bits()); + assert_eq!( + vec![false, false, false, false], + empty.get_binary_representation() + ); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!(vec![true, true, true, true], total.get_bits()); + assert_eq!( + vec![true, true, true, true], + 
total.get_binary_representation() + ); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); assert_eq!(empty, total.complement()); @@ -218,20 +220,20 @@ mod tests { let empty = Condition::empty(&spanning_set); let total = Condition::total(&spanning_set); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); - assert_eq!(vec![false], empty.get_bits()); + assert_eq!(vec![false], empty.get_binary_representation()); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); - assert_eq!(vec![true], total.get_bits()); + assert_eq!(vec![true], total.get_binary_representation()); assert_eq!(empty, total.complement()); assert_eq!(total, empty.complement()); @@ -251,7 +253,7 @@ mod tests { Ok(()) } - fn assert_range_convertion_to_range(range: &Range, spanning_set: &SpanningSet) { + fn assert_range_convertion_to_range(range: &CharRange, spanning_set: &SpanningSet) { let condition = Condition::from_range(range, spanning_set).unwrap(); let range_from_condition = condition.to_range(spanning_set).unwrap(); assert_eq!(range, &range_from_condition); @@ -266,11 +268,11 @@ mod tests { let current_spanning_set = get_spanning_set(); let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); let condition_converter = @@ -295,7 +297,7 @@ mod tests { } fn assert_project_to( - range: &Range, + range: &CharRange, currently_used_spanning_set: &SpanningSet, newly_used_spanning_set: &SpanningSet, condition_converter: &ConditionConverter, @@ -347,8 +349,8 @@ mod tests { } fn assert_union_intersection_complement( - range_1: &Range, - range_2: &Range, + range_1: &CharRange, + range_2: &CharRange, used_characters: &SpanningSet, ) { let condition_1 = Condition::from_range(range_1, used_characters).unwrap(); @@ -377,14 +379,14 @@ mod tests { #[test] fn test_1() -> Result<(), String> { let ranges = vec![ - 
Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), - Range::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), + CharRange::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), ]; let spanning_set = SpanningSet::compute_spanning_set(&ranges); println!("{:?}", spanning_set); - let range1 = Range::new_from_ranges(&[ + let range1 = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{9}')), AnyRange::from(Char::new('\u{B}')..=Char::new('\u{63}')), AnyRange::from(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), @@ -392,7 +394,7 @@ mod tests { let condition1 = Condition::from_range(&range1, &spanning_set).unwrap(); assert_eq!(range1, condition1.to_range(&spanning_set).unwrap()); - let range2 = Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); + let range2 = CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); let condition2 = Condition::from_range(&range2, &spanning_set).unwrap(); assert_eq!(range2, condition2.to_range(&spanning_set).unwrap()); diff --git a/src/fast_automaton/convert/to_regex/builder/scc.rs b/src/fast_automaton/convert/to_regex/builder/scc.rs deleted file mode 100644 index 815188a..0000000 --- a/src/fast_automaton/convert/to_regex/builder/scc.rs +++ /dev/null @@ -1,207 +0,0 @@ -use super::*; - -impl StateEliminationAutomaton { - pub fn identify_and_apply_components(&mut self) -> Result<(), EngineError> { - let mut index = 0; - let mut stack = Vec::new(); - let mut indices = vec![-1; self.transitions.len()]; - let mut lowlink = vec![-1; self.transitions.len()]; - let mut on_stack = vec![false; self.transitions.len()]; - let mut scc = Vec::new(); - - for state in self.states_iter() { - if self.removed_states.contains(&state) { - continue; - } - if indices[state] == -1 { - self.strongconnect( - state, - &mut index, - &mut stack, - &mut indices, - &mut lowlink, - &mut on_stack, - &mut scc, - ); - } - } - - let scc = scc - .into_iter() - .filter(|states| { - let first_state = states.iter().next().unwrap(); - let self_loop = if let Some(transitions_in) = self.transitions_in.get(first_state) { - transitions_in.contains(first_state) - } else { - false - }; - states.len() != 1 || self_loop - }) - .collect::>(); - - for component in scc { - self.build_component(&component)?; - } - - self.cyclic = false; - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - fn strongconnect( - &self, - v: usize, - index: &mut usize, - stack: &mut Vec, - indices: &mut Vec, - lowlink: &mut Vec, - on_stack: &mut Vec, - scc: &mut Vec>, - ) { - indices[v] = *index as i32; - lowlink[v] = *index as i32; - *index += 1; - stack.push(v); - on_stack[v] = true; - - if let Some(neighbors) = self.transitions.get(v) { - for &w in neighbors.keys() { - if indices[w] == -1 { - self.strongconnect(w, index, stack, indices, lowlink, on_stack, scc); - lowlink[v] = lowlink[v].min(lowlink[w]); - } else if on_stack[w] { - lowlink[v] = lowlink[v].min(indices[w]); - } - } - } - - if lowlink[v] == indices[v] { - let mut component = Vec::new(); - while let Some(w) = stack.pop() { - on_stack[w] = false; - component.push(w); - if w == v { - break; - } - } - scc.push(component); - } - } - - fn build_component(&mut self, states: &[usize]) -> Result<(), EngineError> { - let state_set = states.iter().copied().collect::>(); - let mut 
start_states = IntMap::new(); - let mut accept_states = IntMap::new(); - - let mut state_elimination_automaton = StateEliminationAutomaton { - start_state: 0, // start_state is not set yet - accept_state: 0, // accept_state is not set yet - transitions: Vec::with_capacity(states.len()), - transitions_in: IntMap::with_capacity(states.len()), - removed_states: IntSet::new(), - cyclic: true, - }; - - let mut states_map = IntMap::with_capacity(states.len()); - for from_state in states { - if *from_state == self.accept_state { - self.accept_state = self.new_state(); - self.add_transition_to(*from_state, self.accept_state, GraphTransition::Epsilon); - } - if *from_state == self.start_state { - self.start_state = self.new_state(); - self.add_transition_to(self.start_state, *from_state, GraphTransition::Epsilon); - } - let from_state_new = *states_map - .entry(*from_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if !state_set.contains(to_state) { - accept_states - .entry(*to_state) - .or_insert_with(Vec::new) - .push((from_state_new, transition.clone())); - continue; - } - - let to_state_new = *states_map - .entry(*to_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - - state_elimination_automaton.add_transition_to( - from_state_new, - to_state_new, - transition.clone(), - ); - } - - for (parent_state, transition) in self.in_transitions_vec(*from_state) { - if !state_set.contains(&parent_state) { - start_states - .entry(from_state_new) - .or_insert_with(Vec::new) - .push((parent_state, transition.clone())); - } - } - } - - for state in states { - self.remove_state(*state); - } - - for (start_state, parent_states) in &start_states { - for (parent_state, transition) in parent_states { - let new_parent_state = if !transition.is_empty_string() { - let new_parent_state = self.new_state(); - - self.add_transition_to(*parent_state, new_parent_state, transition.clone()); - new_parent_state - } else { - *parent_state - }; - for (target_state, accept_states_transition) in &accept_states { - let mut new_automaton = state_elimination_automaton.clone(); - - let target_state = if accept_states_transition.len() > 1 { - new_automaton.accept_state = new_automaton.new_state(); - for (accept_state, transition) in accept_states_transition { - new_automaton.add_transition_to( - *accept_state, - new_automaton.accept_state, - transition.clone(), - ); - } - *target_state - } else { - let (accept_state, transition) = - accept_states_transition.iter().next().unwrap(); - - new_automaton.accept_state = *accept_state; - if !transition.is_empty_string() { - let new_target_state = self.new_state(); - self.add_transition_to( - new_target_state, - *target_state, - transition.clone(), - ); - new_target_state - } else { - *target_state - } - }; - - new_automaton.start_state = *start_state; - - self.add_transition_to( - new_parent_state, - target_state, - GraphTransition::Graph(new_automaton), - ); - } - } - } - - Ok(()) - } -} diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index d9a1dd0..fbe36ce 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,288 +1,13 @@ -use std::{ - collections::{hash_map::Entry, VecDeque}, - fmt::Display, -}; +use super::*; -use ahash::{HashMapExt, HashSetExt}; -use log::warn; -use nohash_hasher::IntMap; - -use crate::{error::EngineError, 
execution_profile::ThreadLocalParams, regex::RegularExpression}; - -use super::{FastAutomaton, IntSet, Range, State}; - -mod builder; +mod state_elimination; mod transform; -#[derive(Clone, Debug)] -enum GraphTransition { - Graph(StateEliminationAutomaton), - Weight(T), - Epsilon, -} - -impl GraphTransition { - pub fn is_empty_string(&self) -> bool { - matches!(self, GraphTransition::Epsilon) - } - - pub fn get_weight(&self) -> Option<&T> { - if let GraphTransition::Weight(weight) = self { - Some(weight) - } else { - None - } - } -} - -#[derive(Clone, Debug)] -struct StateEliminationAutomaton { - start_state: usize, - accept_state: usize, - transitions: Vec>>, - transitions_in: IntMap>, - removed_states: IntSet, - cyclic: bool, -} - -impl Display for StateEliminationAutomaton { - fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.to_graph_dot(sb, None) - } -} - -impl StateEliminationAutomaton { - //#[cfg(test)] - #[allow(dead_code)] - #[inline] - pub fn to_dot(&self) { - println!("{}", self); - } - - #[inline] - fn to_graph_dot( - &self, - sb: &mut std::fmt::Formatter<'_>, - prefix: Option<&str>, - ) -> std::fmt::Result { - let is_subgraph; - let indent; - let prefix = if let Some(prefix) = prefix { - writeln!(sb, "\tsubgraph cluster_{} {{", prefix)?; - writeln!(sb, "\t\tlabel = \"{} - cyclic={}\";", prefix, self.cyclic)?; - indent = "\t"; - is_subgraph = true; - prefix - } else { - writeln!(sb, "digraph Automaton {{")?; - writeln!(sb, "\trankdir = LR;")?; - writeln!(sb, "\tlabel = \"cyclic={}\";", self.cyclic)?; - indent = ""; - is_subgraph = false; - "" - }; - - for from_state in self.states_iter() { - let from_state_with_prefix = if is_subgraph { - format!("S{prefix}_{from_state}") - } else { - format!("S{from_state}") - }; - - write!(sb, "{indent}\t{}", from_state_with_prefix)?; - if !is_subgraph && self.accept_state == from_state { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; - } else { - writeln!(sb, "{indent}\t[shape=circle,label=\"{}\"];", from_state)?; - } - - if !is_subgraph && self.start_state == from_state { - writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state_with_prefix)?; - } - for (to_state, weight) in self.transitions_from_state_enumerate_iter(&from_state) { - let to_state_with_prefix = if is_subgraph { - format!("S{prefix}_{to_state}") - } else { - format!("S{to_state}") - }; - - match weight { - GraphTransition::Graph(state_elimination_automaton) => { - let subgraph_prefix = if is_subgraph { - format!("{prefix}_{from_state}_{to_state}") - } else { - format!("{from_state}_{to_state}") - }; - state_elimination_automaton.to_graph_dot(sb, Some(&subgraph_prefix))?; - writeln!(sb)?; - let subgraph_start_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.start_state - ); - writeln!( - sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, subgraph_start_state - )?; - - let subgraph_accept_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.accept_state - ); - writeln!( - sb, - "{indent}\t{} -> {} [label=\"ε\"]", - subgraph_accept_state, to_state_with_prefix - ) - } - GraphTransition::Weight(range) => { - writeln!( - sb, - "{indent}\t{} -> {} [label=\"{}\"]", - from_state_with_prefix, - to_state_with_prefix, - RegularExpression::Character(range.clone()) - .to_string() - .replace('\\', "\\\\") - .replace('"', "\\\"") - ) - } - GraphTransition::Epsilon => writeln!( - sb, - "{indent}\t{} -> {} [label=\"ε\"]", - 
from_state_with_prefix, to_state_with_prefix - ), - }?; - } - } - write!(sb, "{indent}}}") - } - - #[inline] - pub fn states_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state_enumerate_iter( - &self, - from_state: &State, - ) -> impl Iterator)> { - self.transitions[*from_state] - .iter() - .filter(|s| !self.removed_states.contains(s.0)) - } - - #[inline] - pub fn transitions_from_state_vec(&self, from_state: &State) -> Vec { - self.transitions[*from_state] - .keys() - .filter(|s| !self.removed_states.contains(s)) - .copied() - .collect() - } - - pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { - let mut in_transitions = vec![]; - for from_state in self.transitions_in.get(&to_state).unwrap_or(&IntSet::new()) { - for (state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if to_state == *state { - in_transitions.push((*from_state, transition.clone())); - } - } - } - in_transitions - } - - pub fn states_topo_vec(&self) -> Vec { - if self.cyclic { - panic!("The graph has a cycle"); - } - - let mut in_degree: IntMap = self - .transitions_in - .iter() - .map(|(state, parents)| (*state, parents.len())) - .collect(); - - let mut worklist: VecDeque = VecDeque::new(); - for (&state, °ree) in &in_degree { - if degree == 0 { - worklist.push_back(state); - } - } - - let mut sorted_order = Vec::with_capacity(self.get_number_of_states()); - while let Some(state) = worklist.pop_front() { - sorted_order.push(state); - - if let Some(neighbors) = self.transitions.get(state) { - let neighbors = neighbors.keys(); - for &neighbor in neighbors { - if let Some(degree) = in_degree.get_mut(&neighbor) { - *degree -= 1; - if *degree == 0 { - worklist.push_back(neighbor); - } - } - } - } - } - - if sorted_order.len() == self.get_number_of_states() { - sorted_order - } else { - panic!("The graph has a cycle"); - } - } - - #[inline] - pub fn get_number_of_states(&self) -> usize { - self.transitions.len() - self.removed_states.len() - } -} - impl FastAutomaton { - /// Try to convert the current FastAutomaton to a RegularExpression. - /// If it cannot find an equivalent regex it returns None. - /// This method is still a work in progress. - pub fn to_regex(&self) -> Option { - if self.is_empty() { - return Some(RegularExpression::new_empty()); - } - let execution_profile = ThreadLocalParams::get_execution_profile(); - if let Ok(graph) = StateEliminationAutomaton::new(self) { - if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { - let regex = regex?; - match regex.to_automaton() { - Ok(automaton) => match self.is_equivalent_of(&automaton) { - Ok(result) => { - if !result { - warn!("The automaton is not equivalent to the generated regex; automaton={}, regex={}", self, regex); - None - } else { - Some(regex) - } - } - Err(err) => { - warn!("Engine error while checking for equivalence ({}); automaton={}, regex={}", err, self, regex); - None - } - }, - Err(err) => { - if let crate::error::EngineError::RegexSyntaxError(err) = err { - warn!("The generated regex cannot be converted to automaton to be checked for equivalence ({}); automaton={}, regex={}", err, self, regex); - } - None - } - } - } else { - None - } - } else { - None - } + /// Converts the term to a [`RegularExpression`]. 
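+ /// + /// Round-trip sketch (the pattern is illustrative): + /// ```rust + /// # use regexsolver::regex::RegularExpression; + /// let automaton = RegularExpression::parse("a(b|c)*", false).unwrap().to_automaton().unwrap(); + /// let regex = automaton.to_regex(); + /// assert!(automaton.equivalent(&regex.to_automaton().unwrap()).unwrap()); + /// ```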
+ pub fn to_regex(&self) -> RegularExpression { + let transformed_automaton = transform::transform(self); + state_elimination::convert_to_regex(&transformed_automaton) } } @@ -292,6 +17,15 @@ mod tests { #[test] fn test_convert() -> Result<(), String> { + + assert_convert(".*u(ab|de)"); + assert_convert(".*sf.*uif(ab|de)"); + + assert_convert("(a+|,)*"); + assert_convert("((ab)*,(cd)*)*"); + assert_convert("(a*,a*,a*)*"); + assert_convert("(a*,a*)*"); + assert_convert("(ac|ads|a)*"); assert_convert(".*sf"); assert_convert(".*sf.*uif(ab|de)"); @@ -325,47 +59,43 @@ mod tests { } fn assert_convert(regex: &str) { - let input_regex = RegularExpression::new(regex).unwrap(); + let input_regex = RegularExpression::parse(regex, false).unwrap(); println!("IN : {}", input_regex); let input_automaton = input_regex.to_automaton().unwrap(); - //input_automaton.to_dot(); - - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (non deterministic): {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); let input_automaton = input_automaton.determinize().unwrap(); - //input_automaton.to_dot(); - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (deterministic) : {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); } #[test] fn test_convert_after_operation_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(ab|cd)") + let automaton1 = RegularExpression::parse("(ab|cd)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("ab") + let automaton2 = RegularExpression::parse("ab", false) .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("cd", output_regex.to_string()); Ok(()) @@ -373,20 +103,20 @@ mod tests { #[test] fn test_convert_after_operation_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("", output_regex.to_string()); Ok(()) @@ -394,72 +124,72 @@ mod tests { #[test] fn test_convert_after_operation_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); - let result = 
automaton1.subtraction(&automaton2).unwrap(); - result.to_dot(); + let result = automaton1.difference(&automaton2).unwrap(); + result.print_dot(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert_eq!("x(x{3})*x?", result.to_string()); Ok(()) } #[test] fn test_convert_after_operation_4() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*abc.*") + let automaton1 = RegularExpression::parse(".*abc.*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + let automaton2 = RegularExpression::parse(".*def.*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); assert_eq!(".*(abc.*def|def.*abc).*", result.to_string()); Ok(()) } - /*#[test] - fn test_convert_after_operation_5() -> Result<(), String> { - if std::env::var_os("RUST_LOG").is_none() { - std::env::set_var("RUST_LOG", "regexsolver=debug"); - } - env_logger::init(); - - let automaton1 = RegularExpression::new(".*abc.*") + #[test] + fn test_automaton() -> Result<(), String> { + let automaton = RegularExpression::parse("a*ba*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + automaton.print_dot(); + + let automaton1 = RegularExpression::parse("(a*ba*)*", false) .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + automaton1.print_dot(); + + automaton1.determinize().unwrap().print_dot(); - let result = automaton1.subtraction(&automaton2).unwrap(); - result.to_dot(); + // (a*b[ab]*)? + // a*b+a+b+ - let result = result.to_regex().unwrap(); + let automaton2 = RegularExpression::parse("(a*b[ab]*)?", false) + .unwrap() + .to_automaton() + .unwrap(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert!(automaton1.equivalent(&automaton2).unwrap()); Ok(()) - }*/ + } } diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs similarity index 58% rename from src/fast_automaton/convert/to_regex/builder/mod.rs rename to src/fast_automaton/convert/to_regex/state_elimination/builder.rs index b6c8dd5..54cec00 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs @@ -1,48 +1,64 @@ -use super::*; - -mod scc; +use ahash::HashMapExt; -impl StateEliminationAutomaton { - pub fn new(automaton: &FastAutomaton) -> Result, EngineError> { - if automaton.is_empty() { - return Ok(None); - } +use super::*; - let mut state_elimination_automaton = StateEliminationAutomaton { +impl Gnfa { + pub(super) fn from_automaton(automaton: &FastAutomaton) -> Gnfa { + let mut state_elimination_automaton = Gnfa { start_state: 0, // start_state is not set yet accept_state: 0, // accept_state is not set yet transitions: Vec::with_capacity(automaton.get_number_of_states()), transitions_in: IntMap::with_capacity(automaton.get_number_of_states()), - removed_states: IntSet::new(), - cyclic: false, + removed_states: IntSet::with_capacity(automaton.get_number_of_states()), + empty: false }; + if automaton.is_empty() { + state_elimination_automaton.empty = true; + return state_elimination_automaton; + } + let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); - for from_state in automaton.transitions_iter() { + for from_state in automaton.states() { let new_from_state = 
*states_map .entry(from_state) .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, condition) in - automaton.transitions_from_state_enumerate_into_iter(&from_state) - { + for (condition, to_state) in automaton.transitions_from(from_state) { let new_to_state = *states_map - .entry(to_state) + .entry(*to_state) .or_insert_with(|| state_elimination_automaton.new_state()); - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( new_from_state, new_to_state, - GraphTransition::Weight(condition.to_range(automaton.get_spanning_set())?), + RegularExpression::Character( + condition.to_range(automaton.get_spanning_set()).unwrap(), + ), ); } } - state_elimination_automaton.start_state = - *states_map.get(&automaton.get_start_state()).unwrap(); // We finally set start_state + if automaton.in_degree(automaton.get_start_state()) == 0 { + // If the start state does not have any incoming state we just set it + state_elimination_automaton.start_state = + *states_map.get(&automaton.get_start_state()).unwrap(); + } else { + // If not we create a new state that will be the new start state + state_elimination_automaton.start_state = state_elimination_automaton.new_state(); + + let previous_start_state = *states_map.get(&automaton.get_start_state()).unwrap(); + // We add an empty string transition to the new start state + state_elimination_automaton.add_transition( + state_elimination_automaton.start_state, + previous_start_state, + RegularExpression::new_empty_string(), + ); + } - if automaton.get_accept_states().len() == 1 { - // If there is only one accept state with just set it + let accept_state = *automaton.get_accept_states().iter().next().unwrap(); + if automaton.get_accept_states().len() == 1 && automaton.out_degree(accept_state) == 0 { + // If there is only one accept state we just set it state_elimination_automaton.accept_state = *states_map .get(automaton.get_accept_states().iter().next().unwrap()) .unwrap(); @@ -52,19 +68,18 @@ impl StateEliminationAutomaton { for accept_state in automaton.get_accept_states() { let accept_state = *states_map.get(accept_state).unwrap(); // We add an empty string transition to the new accept state - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( accept_state, state_elimination_automaton.accept_state, - GraphTransition::Epsilon, + RegularExpression::new_empty_string(), ); } } - state_elimination_automaton.identify_and_apply_components()?; - //state_elimination_automaton.to_dot(); - Ok(Some(state_elimination_automaton)) + + state_elimination_automaton } - pub fn new_state(&mut self) -> usize { + fn new_state(&mut self) -> usize { if let Some(new_state) = self.removed_states.clone().iter().next() { self.removed_states.remove(new_state); self.transitions_in.insert(*new_state, IntSet::new()); @@ -78,22 +93,22 @@ impl StateEliminationAutomaton { } #[inline] - pub fn has_state(&self, state: State) -> bool { + pub(super) fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } - pub fn add_transition_to( + pub(crate) fn add_transition( &mut self, from_state: State, to_state: State, - transition: GraphTransition, + transition: RegularExpression, ) { self.assert_state_exists(from_state); if from_state != to_state { @@ -106,13 
+121,8 @@ impl StateEliminationAutomaton { .insert(from_state); match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { - if let (GraphTransition::Weight(current_regex), GraphTransition::Weight(regex)) = - (o.get(), transition) - { - o.insert(GraphTransition::Weight(current_regex.union(®ex))); - } else { - panic!("Cannot add transition"); - } + //o.insert(RegularExpression::Alternation(vec![transition, o.get().clone()])); + o.insert(transition.union(o.get())); } Entry::Vacant(v) => { v.insert(transition); @@ -120,12 +130,11 @@ impl StateEliminationAutomaton { }; } - pub fn remove_state(&mut self, state: State) { + pub(super) fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state || self.accept_state == state { panic!( - "Can not remove the state {}, it is still used as start state or accept state.", - state + "Can not remove the state {state}, it is still used as start state or accept state." ); } self.transitions_in.remove(&state); @@ -150,21 +159,4 @@ impl StateEliminationAutomaton { transitions.remove(&state); } } - - pub fn remove_transition(&mut self, from_state: State, to_state: State) { - self.assert_state_exists(from_state); - if from_state != to_state { - self.assert_state_exists(to_state); - } - - if let Some(from_states) = self.transitions_in.get_mut(&to_state) { - from_states.remove(&from_state); - } - - self.transitions[from_state].remove(&to_state); - } - - pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { - self.transitions.get(from_state)?.get(&to_state) - } } diff --git a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs new file mode 100644 index 0000000..c587a99 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -0,0 +1,109 @@ +use super::*; + +impl Gnfa { + pub(super) fn convert(&mut self) -> RegularExpression { + if self.empty { + return RegularExpression::new_empty(); + } + + while let Some(state) = self.get_next_state_to_eliminate() { + self.eliminate_state(state); + } + + self.get_transition(self.start_state, self.accept_state) + .cloned() + .unwrap_or(RegularExpression::new_empty_string()) + } + + fn get_next_state_to_eliminate(&self) -> Option { + let states: Vec = self + .all_states_iter() + .filter(|&s| s != self.start_state && s != self.accept_state) + .collect(); + + states + .into_par_iter() + .filter_map(|state| { + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() as u128; + + if in_deg == 0 || out_deg == 0 { + let score = (state as u128) & 0xFF; + return Some((score, state)); + } + + let mut score: u128 = in_deg * out_deg; + + if self.has_self_loop(state) { + score = score + (score >> 1); + } + + let mut label_cost: u128 = 0; + + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += (re.evaluate_complexity() as u128) * 2; + } + + score = score.saturating_add(label_cost); + + let tie = (state as u128) & 0xFFFF; + Some((score.saturating_add(tie), state)) + }) + .reduce_with(|a, b| if a.0 < b.0 { a } else { b }) + .map(|(_, state)| state) + } + + fn eliminate_state(&mut self, k: usize) { + if self.removed_states.contains(&k) { + 
return; + } + + let in_states = self + .transitions_in + .get(&k) + .unwrap() + .iter() + .cloned() + .filter(|&s| s != k) + .collect::>(); + let out_states = self.transitions[k] + .keys() + .cloned() + .filter(|&s| s != k) + .collect::>(); + + for p in in_states { + for &q in &out_states { + self.bridge(p, k, q); + } + } + + self.remove_state(k); + } + + fn bridge(&mut self, p: usize, k: usize, q: usize) { + let rpk = self.get_transition(p, k); + let rkk = self.get_transition(k, k); + let rkq = self.get_transition(k, q); + + if let (Some(rpk), Some(rkq)) = (rpk, rkq) { + let mut regex = rpk.clone(); + if let Some(rkk) = rkk { + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, RegularExpression::Repetition(Box::new(rkk.clone()), 0, None)])); + regex = regex.concat(&rkk.repeat(0, None), true); + } + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, rkq.clone()])); + regex = regex.concat(rkq, true); + self.add_transition(p, q, regex); + } + } +} diff --git a/src/fast_automaton/convert/to_regex/state_elimination/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs new file mode 100644 index 0000000..023d6b1 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs @@ -0,0 +1,121 @@ +use super::*; + +mod builder; +mod eliminate; + +struct Gnfa { + start_state: usize, + accept_state: usize, + transitions: Vec>, + transitions_in: IntMap>, + removed_states: IntSet, + empty: bool, +} + +impl Display for Gnfa { + fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(sb, "digraph GNFA {{")?; + writeln!(sb, "\trankdir = LR;")?; + for from_state in self.all_states_iter() { + write!(sb, "\t{from_state}")?; + if self.accept_state == from_state { + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; + } else { + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; + } + + if self.start_state == from_state { + writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; + writeln!(sb, "\tinitial -> {from_state}")?; + } + for (regex, to_state) in self.transitions_from_vec(from_state) { + writeln!(sb, "\t{from_state} -> {to_state} [label=\"{regex}\"]")?; + } + } + write!(sb, "}}") + } +} + +impl Gnfa { + fn get_transition(&self, from_state: State, to_state: State) -> Option<&RegularExpression> { + self.transitions.get(from_state)?.get(&to_state) + } + + #[inline] + fn all_states_iter(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + } + + fn transitions_to_vec(&self, state: State) -> Vec<(State, RegularExpression)> { + let mut in_transitions = vec![]; + for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { + if to_state == state { + in_transitions.push((*from_state, condition)); + break; + } + } + } + in_transitions + } + + #[inline] + fn transitions_from_vec(&self, state: State) -> Vec<(RegularExpression, State)> { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() + } + + #[inline] + fn has_self_loop(&self, state: State) -> bool { + self.get_transition(state, state).is_some() + } +} + +pub(super) fn convert_to_regex(automaton: &FastAutomaton) -> RegularExpression { + let mut gnfa = Gnfa::from_automaton(automaton); + gnfa.convert() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_state_elimination() -> Result<(), String> { + 
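+ // Each pattern below is parsed to an automaton, eliminated back to a regex via the GNFA, and the result is checked for language equivalence with the original (see `test_correct`).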
test_correct("abc"); + test_correct(".*de"); + test_correct(".*def"); + test_correct("(a*ba*)*"); + test_correct(".*u(ab|d)"); + test_correct(".*u(ab|de)"); + Ok(()) + } + + fn test_correct(pattern: &str) { + println!("Pattern: {pattern}"); + + let automaton = RegularExpression::new(pattern) + .unwrap() + .to_automaton() + .unwrap(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + + let automaton = automaton.determinize().unwrap().into_owned(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform.rs b/src/fast_automaton/convert/to_regex/transform.rs deleted file mode 100644 index aaeca76..0000000 --- a/src/fast_automaton/convert/to_regex/transform.rs +++ /dev/null @@ -1,208 +0,0 @@ -use std::hash::BuildHasherDefault; - -use crate::execution_profile::ExecutionProfile; - -use super::*; - -impl StateEliminationAutomaton { - pub fn convert_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.cyclic { - return self.convert_graph_to_regex(execution_profile); - } - execution_profile.assert_not_timed_out()?; - - let mut regex_map: IntMap = IntMap::with_capacity_and_hasher( - self.get_number_of_states(), - BuildHasherDefault::default(), - ); - regex_map.insert(self.start_state, RegularExpression::new_empty_string()); - for from_state in self.states_topo_vec() { - let current_regex = if let Some(current_regex) = regex_map.get(&from_state) { - current_regex.clone() - } else { - RegularExpression::new_empty_string() - }; - if let Some(transitions) = self.transitions.get(from_state) { - for (to_state, transition) in transitions { - let transition_regex = match transition { - GraphTransition::Graph(graph) => { - if let Some(regex) = graph.convert_graph_to_regex(execution_profile)? { - regex - } else { - return Ok(None); - } - } - GraphTransition::Weight(range) => { - RegularExpression::Character(range.clone()) - } - GraphTransition::Epsilon => RegularExpression::new_empty_string(), - }; - let new_regex = current_regex.concat(&transition_regex, true); - match regex_map.entry(*to_state) { - Entry::Occupied(mut o) => { - o.insert(new_regex.union(o.get()).simplify()); - } - Entry::Vacant(v) => { - v.insert(new_regex); - } - }; - } - } - } - - Ok(regex_map.get(&self.accept_state).cloned()) - } - - fn convert_graph_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - execution_profile.assert_not_timed_out()?; - if let Some(regex) = self.convert_shape_dot_star(execution_profile)? { - return Ok(Some(regex)); - } else if let Some(regex) = self.convert_shape_self_loop(execution_profile)? 
{ - return Ok(Some(regex)); - } - Ok(None) - } - - /// We try to idenfify the regex following the shape: - /// A*B - fn convert_shape_dot_star( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.get_number_of_states() < 2 { - return Ok(None); - } - //self.to_dot(); - let mut dot_value = - if let Some(dot_value) = self.get_transition(self.start_state, self.start_state) { - if let Some(dot_value) = dot_value.get_weight() { - dot_value.clone() - } else { - return Ok(None); - } - } else { - return Ok(None); - }; - - for state in self.states_iter() { - if state == self.start_state { - continue; - } - let weight = if let Some(weight) = self.get_transition(state, self.start_state) { - if let Some(weight) = weight.get_weight() { - weight - } else { - return Ok(None); - } - } else if state == self.accept_state { - continue; - } else { - return Ok(None); - }; - - if !dot_value.contains_all(weight) { - return Ok(None); - } - } - - let mut graph = self.clone(); - - for (from_state, transition) in graph.in_transitions_vec(graph.start_state) { - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - return Ok(None); - }; - dot_value = dot_value.union(weight); - graph.remove_transition(from_state, graph.start_state); - } - - let mut worklist = VecDeque::new(); - let mut seen = IntSet::with_capacity(graph.get_number_of_states()); - - worklist.push_back(graph.start_state); - seen.insert(self.start_state); - - while let Some(from_state) = worklist.pop_front() { - for to_state in graph.transitions_from_state_vec(&from_state) { - let transition = - if let Some(transition) = graph.get_transition(from_state, to_state) { - transition - } else { - return Ok(None); - }; - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - continue; - }; - dot_value = dot_value.union(weight); - if seen.contains(&to_state) { - if graph.accept_state != to_state || to_state == from_state { - graph.remove_transition(from_state, to_state); - } - } else { - seen.insert(to_state); - worklist.push_back(to_state); - } - } - } - - graph.add_transition_to( - self.start_state, - self.start_state, - GraphTransition::Weight(dot_value), - ); - - graph.identify_and_apply_components()?; - graph.convert_to_regex(execution_profile) - } - - /// We try to identify the regex following the shape: - /// A*B - fn convert_shape_self_loop( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - let mut graph = self.clone(); - - graph.accept_state = graph.new_state(); - - for (from_state, transition) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - - graph.add_transition_to(from_state, graph.accept_state, transition); - } - - graph.identify_and_apply_components()?; - - let a_part = if let Some(a_part) = graph.convert_to_regex(execution_profile)? { - a_part - } else { - return Ok(None); - }; - - let mut graph = self.clone(); - - for (from_state, _) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - } - - graph.identify_and_apply_components()?; - let b_part = if let Some(b_part) = graph.convert_to_regex(execution_profile)? 
{ - b_part - } else { - return Ok(None); - }; - - let regex = a_part.repeat(0, None).concat(&b_part, true); - - Ok(Some(regex)) - } -} diff --git a/src/fast_automaton/convert/to_regex/transform/mod.rs b/src/fast_automaton/convert/to_regex/transform/mod.rs new file mode 100644 index 0000000..643e6ba --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/mod.rs @@ -0,0 +1,47 @@ +use crate::fast_automaton::{ + FastAutomaton, convert::to_regex::transform::shape::dotstar::dot_star, +}; + +mod shape; + +const TRANSFORM_FUNCTION: &[fn(&FastAutomaton) -> FastAutomaton] = &[dot_star]; + +pub fn transform(automaton: &FastAutomaton) -> FastAutomaton { + let mut automaton = automaton.clone(); + for transform in TRANSFORM_FUNCTION { + automaton = transform(&automaton); + } + + automaton +} + +#[cfg(test)] +mod tests { + use crate::{ + fast_automaton::convert::to_regex::transform::transform, regex::RegularExpression, + }; + + #[test] + fn test_equivalence() -> Result<(), String> { + assert_equivalent("abc"); + assert_equivalent(".*abc"); + assert_equivalent(".*abc.*def"); + assert_equivalent(".*abc.*def(ab|fr)"); + assert_equivalent(".*abc.*def(ab|fr).*mpa"); + + Ok(()) + } + + fn assert_equivalent(pattern: &str) { + let before = RegularExpression::parse(pattern, false) + .unwrap() + .to_automaton() + .unwrap(); + + let before = before.determinize().unwrap(); + + let after = transform(&before); + + assert!(before.equivalent(&after).unwrap()); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs new file mode 100644 index 0000000..6c91106 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -0,0 +1,172 @@ +use nohash_hasher::IntSet; + +use crate::fast_automaton::{FastAutomaton, State, condition::Condition}; + +pub(crate) fn dot_star(automaton: &FastAutomaton) -> FastAutomaton { + let components = identify_and_apply_components(automaton); + + let mut automaton = automaton.clone(); + for component in components { + dot_star_component(&mut automaton, &component); + } + + automaton +} + +fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) { + let mut start_state = if component.contains(&automaton.start_state) { + Some(automaton.start_state) + } else { + None + }; + for &state in component { + for (from_state, _) in automaton.transitions_to_vec(state) { + if !component.contains(&from_state) { + if start_state.is_none() { + start_state = Some(state); + } else { + // Only one start state possible + return; + } + } + } + } + + if start_state.is_none() { + // Only one start state possible + return; + } + let start_state = start_state.unwrap(); + + let mut first_hop = automaton + .direct_states(start_state) + .filter(|&s| s != start_state) + .collect::>(); + let mut states_to_remove = vec![]; + + for state in &first_hop { + let transitions = automaton.transitions_to_vec(*state); + if !transitions.iter().all(|(_, c)| *c == transitions[0].1) { + // Some condition(s) to a given first hop state are not the same. 
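// (Illustrative aside, not part of the patch: `dot_star` only collapses a strongly
// connected component when it behaves like a single `A*`-style loop -- one entry
// state, every member state looping back to that entry, and the same union of
// outgoing conditions everywhere. Whenever one of those checks fails, as with the
// mismatched first-hop conditions detected here, the function returns early and
// leaves the component untouched; the equivalence tests above (`.*abc`,
// `.*abc.*def`, ...) guard that the rewrite never changes the recognised language.)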
+ return; + } + + if transitions.len() != component.len() { + states_to_remove.push(*state); + } + } + + states_to_remove.iter().for_each(|s| { + first_hop.remove(s); + }); + + let mut out_condition = None; + for &state in component { + let mut has_transition_to_start_state = false; + + let mut this_condition = Condition::empty(automaton.get_spanning_set()); + for (condition, &to_state) in automaton.transitions_from(state) { + if to_state == start_state { + has_transition_to_start_state = true; + } + + this_condition = this_condition.union(condition); + } + if !has_transition_to_start_state { + // Some state(s) do not have transition to the start state. + return; + } + + if let Some(condition) = &out_condition { + if &this_condition != condition { + // The union of outcoming condition for some states are not identical + return; + } + } else { + out_condition = Some(this_condition); + } + } + + automaton.add_transition(start_state, start_state, &out_condition.unwrap()); + for &state in component { + for to_state in automaton.direct_states_vec(state) { + if !component.contains(&to_state) { + continue; + } + + if state != start_state && (to_state == start_state || first_hop.contains(&to_state)) { + automaton.remove_transition(state, to_state); + } + } + } + for state in states_to_remove { + automaton.remove_state(state); + } +} + +pub fn identify_and_apply_components(automaton: &FastAutomaton) -> Vec> { + let mut index = 0; + let mut stack = Vec::new(); + let mut indices = vec![-1; automaton.transitions.len()]; + let mut lowlink = vec![-1; automaton.transitions.len()]; + let mut on_stack = vec![false; automaton.transitions.len()]; + let mut scc = Vec::new(); + + for state in automaton.states() { + if indices[state] == -1 { + strongconnect( + automaton, + state, + &mut index, + &mut stack, + &mut indices, + &mut lowlink, + &mut on_stack, + &mut scc, + ); + } + } + + scc.into_iter() + .filter(|states| states.len() != 1) + .collect::>() +} + +#[allow(clippy::too_many_arguments)] +fn strongconnect( + automaton: &FastAutomaton, + v: usize, + index: &mut usize, + stack: &mut Vec, + indices: &mut Vec, + lowlink: &mut Vec, + on_stack: &mut Vec, + scc: &mut Vec>, +) { + indices[v] = *index as i32; + lowlink[v] = *index as i32; + *index += 1; + stack.push(v); + on_stack[v] = true; + + for w in automaton.direct_states(v) { + if indices[w] == -1 { + strongconnect(automaton, w, index, stack, indices, lowlink, on_stack, scc); + lowlink[v] = lowlink[v].min(lowlink[w]); + } else if on_stack[w] { + lowlink[v] = lowlink[v].min(indices[w]); + } + } + + if lowlink[v] == indices[v] { + let mut component = IntSet::default(); + while let Some(w) = stack.pop() { + on_stack[w] = false; + component.insert(w); + if w == v { + break; + } + } + scc.push(component); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/mod.rs b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs new file mode 100644 index 0000000..5c83bf6 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs @@ -0,0 +1 @@ +pub(super) mod dotstar; \ No newline at end of file diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 638ba11..e0d5ae3 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,87 +1,125 @@ -use std::cmp; - -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use ahash::AHashSet; use super::*; impl FastAutomaton { - pub fn generate_strings(&self, 
number: usize) -> Result, EngineError> { + /// Generates `count` strings matched by the automaton. + pub fn generate_strings(&self, count: usize) -> Result, EngineError> { if self.is_empty() { - return Ok(AHashSet::new()); + return Ok(vec![]); } - let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); - - let execution_profile = ThreadLocalParams::get_execution_profile(); - - let mut ranges_cache: AHashMap<&Condition, Range> = - AHashMap::with_capacity(self.get_number_of_states()); - - let mut worklist: VecDeque<(Vec, usize)> = - VecDeque::with_capacity(cmp::min(number, 1000)); - let mut visited = AHashSet::with_capacity(cmp::min(number, 1000)); + let (min, max) = self.get_length(); + let max_len = if let Some(max) = max { + max + } else { + let min = min.expect("A non empty automaton should have a minimum length"); + min.saturating_add(100) + } as usize; + + let execution_profile = ExecutionProfile::get(); + + let mut ranges_cache = AHashMap::with_capacity(self.get_number_of_states()); + let mut strings = AHashSet::with_capacity(count); + let mut visited = AHashSet::with_capacity(self.get_number_of_states()); + let mut q = VecDeque::with_capacity(self.get_number_of_states()); + q.push_back((self.get_start_state(), vec![], 0u64)); + while let Some((state, ranges, h)) = q.pop_front() { + execution_profile.assert_not_timed_out()?; + + if ranges.len() > max_len { + continue; + } - worklist.push_back((vec![], self.start_state)); - while let Some((ranges, state)) = worklist.pop_front() { - if self.accept_states.contains(&state) { + if self.is_accepted(state) { if ranges.is_empty() { strings.insert(String::new()); } else { - let mut end = false; - let mut ranges_iter: Vec<_> = ranges.iter().map(|range| range.iter()).collect(); - while strings.len() < number { - execution_profile.assert_not_timed_out()?; - let mut string = vec![]; - for i in 0..ranges.len() { - if let Some(character) = ranges_iter[i].next() { - string.push(character); - } else { - ranges_iter[i] = ranges[i].iter(); - if i + 1 < ranges.len() { - string.push(ranges_iter[i].next().unwrap()); - } else { - end = true; - break; - } - } - } - if end { - break; - } - strings.insert(string.into_iter().map(|c| c.to_char()).collect()); - } + Self::ranges_to_strings(&mut strings, &ranges, count, &execution_profile)?; } - if strings.len() == number { + if strings.len() >= count { break; } } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&state) { + + for (cond, &to_state) in self.transitions_from(state) { + let hash = + Self::path_mix(h, Self::mix64(state as u64 ^ Self::mix64(to_state as u64))); + + if visited.insert((to_state, ranges.len() + 1, hash)) { + let mut new_ranges = ranges.clone(); + new_ranges.push( + ranges_cache + .entry(cond) + .or_insert_with(|| cond.to_range(&self.spanning_set).unwrap()) + .clone(), + ); + + q.push_back((to_state, new_ranges, hash)); + } + } + } + let mut strings: Vec = strings.into_iter().collect(); + strings.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); + Ok(strings) + } + + pub fn ranges_to_strings( + strings: &mut AHashSet, + ranges: &Vec, + count: usize, + execution_profile: &ExecutionProfile, + ) -> Result<(), EngineError> { + let n = count - strings.len(); + if n == 0 { + return Ok(()); + } + + let mut end = false; + let mut out: Vec = Vec::with_capacity(n); + out.push(String::with_capacity(ranges.len())); + for r in ranges { + let mut next = Vec::with_capacity(n); + for prefix in out.into_iter() { execution_profile.assert_not_timed_out()?; - 
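// (Illustrative aside, not part of the patch: the new `generate_strings` above does
// a breadth-first walk over states, carrying the character ranges seen so far plus
// a 64-bit path hash built from `path_mix`/`mix64` (a splitmix64-style mixer). The
// `(state, depth, hash)` triples stored in `visited` bound how often equivalent
// paths are re-expanded, and path length is capped at the automaton's maximum
// length, or at the minimum length + 100 when no finite maximum exists.
// `ranges_to_strings` then expands the collected ranges into concrete strings
// until `count` samples have been produced.)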
let range = match ranges_cache.entry(cond) { - Entry::Occupied(o) => o.get().clone(), - Entry::Vacant(v) => { - let range = cond.to_range(&self.spanning_set)?; - v.insert(range.clone()); - range + for ch in r.clone().iter() { + let mut s = prefix.clone(); + s.push(ch.to_char()); + next.push(s); + if next.len() == n { + end = true; + break; } - }; - if range.is_empty() { - continue; } - let mut new_ranges = ranges.clone(); - new_ranges.push(range); - let element = (new_ranges, *to_state); - - if !visited.contains(&element) { - visited.insert(element.clone()); - worklist.push_back(element); + if end { + end = false; + break; } } + out = next; + if out.is_empty() { + break; + } } + strings.extend(out); + Ok(()) + } - Ok(strings) + #[inline] + fn mix64(mut x: u64) -> u64 { + // splitmix64 + x = x.wrapping_add(0x9E3779B97F4A7C15); + let mut z = x; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB); + z ^ (z >> 31) + } + + #[inline] + fn path_mix(h: u64, x: u64) -> u64 { + h.wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7) ^ x } } @@ -93,12 +131,28 @@ mod tests { #[test] fn test_generate_strings() -> Result<(), String> { + assert_generate_strings("a{100}[a-z]", 100); + assert_generate_strings("(ab|cd)e", 100); + assert_generate_strings("[a-z]+", 100); + assert_generate_strings("[a-z]+@", 100); assert_generate_strings("ù", 1000); + assert_generate_strings("[0-9]+[A-Z]*", 500); + assert_generate_strings("a+(ba+)*", 200); + assert_generate_strings("((a|bc)*|d)", 200); + assert_generate_strings(".*", 50); + assert_generate_strings("(ac|ads|a)*", 200); + assert_generate_strings("((aad|ads|a)*|q)", 200); + + assert_generate_strings( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + 1000, + ); + assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); assert_generate_strings( "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", - 500 + 500, ); assert_generate_strings("[0-9]+[A-Z]*", 500); assert_generate_strings("a+(ba+)*", 200); @@ -108,32 +162,32 @@ mod tests { assert_generate_strings("((aad|ads|a)*|q)", 200); assert_generate_strings("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", 1000); //((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5} + Ok(()) } fn assert_generate_strings(regex: &str, number: usize) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - println!("{}", automaton.get_number_of_states()); + //println!("{}", automaton.get_number_of_states()); //automaton.to_dot(); let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); let strings = automaton.generate_strings(number).unwrap(); - let mut strings: Vec<_> = strings.iter().collect(); - strings.sort_unstable(); println!("nb of strings: {}/{}", strings.len(), number); assert!(number >= strings.len()); for string in strings { - if !re.is_match(string) { + // println!("{string}"); + if !re.is_match(&string) { for byte in string.as_bytes() { print!("{:02x} ", byte); } panic!("'{string}'") } - assert!(re.is_match(string), "'{string}'"); + assert!(re.is_match(&string), "'{string}'"); } } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 6d6fcbc..10b269c 100644 --- a/src/fast_automaton/mod.rs +++ 
b/src/fast_automaton/mod.rs @@ -1,25 +1,27 @@ -use crate::Range; +use crate::error::EngineError; use ahash::{AHashMap, HashSetExt}; use condition::Condition; use regex_charclass::CharacterClass; use spanning_set::SpanningSet; -use std::collections::hash_map::Entry; use std::collections::VecDeque; +use std::collections::hash_map::Entry; use std::fmt::Display; -use crate::{IntMap, IntSet}; +use super::*; -pub(crate) type State = usize; pub(crate) type Transitions = IntMap; +/// The identifier of state in an [`FastAutomaton`] +pub type State = usize; + mod analyze; mod builder; pub mod condition; mod convert; mod generate; mod operation; -#[cfg(feature = "serde")] -mod serializer; +#[cfg(feature = "serializable")] +pub mod serializer; pub mod spanning_set; /// Represent a finite state automaton. @@ -39,24 +41,22 @@ impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; - for from_state in self.transitions_iter() { - write!(sb, "\t{}", from_state)?; + for from_state in self.states() { + write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; } else { - writeln!(sb, "\t[shape=circle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; } if self.start_state == from_state { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state)?; + writeln!(sb, "\tinitial -> {from_state}")?; } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&from_state) { + for (cond, to_state) in self.transitions_from(from_state) { writeln!( sb, - "\t{} -> {} [label=\"{}\"]", - from_state, - to_state, + "\t{from_state} -> {to_state} [label=\"{}\"]", cond.to_range(&self.spanning_set) .expect("Cannot convert condition to range.") .to_regex() @@ -73,10 +73,11 @@ impl FastAutomaton { #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } + /// Returns the number of transitions to the provided state. #[inline] pub fn in_degree(&self, state: State) -> usize { self.transitions_in @@ -85,74 +86,78 @@ impl FastAutomaton { .len() } + /// Returns the number of transitions from the provided state. #[inline] pub fn out_degree(&self, state: State) -> usize { self.transitions[state].len() } - pub fn in_transitions(&self, state: State) -> Vec<(usize, Condition)> { - let mut in_transitions = vec![]; - for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { - for (to_state, condition) in self.transitions_from_state_enumerate_vec(from_state) { - if to_state == state { - in_transitions.push((*from_state, condition)); - } - } - } - in_transitions + /// Returns an iterator over the automaton’s states. + #[inline] + pub fn states(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) } - pub fn in_states(&self, state: State) -> IntSet { - self.transitions_in - .get(&state) - .unwrap_or(&IntSet::new()) - .clone() + /// Returns a vector containing the automaton’s states. + #[inline] + pub fn states_vec(&self) -> Vec { + self.states().collect() } + /// Returns an iterator over states directly reachable from the given state in one transition. 
#[inline] - pub fn transitions_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + pub fn direct_states(&self, state: State) -> impl Iterator + '_ { + self.transitions[state] + .keys() + .cloned() + .filter(|s| !self.removed_states.contains(s)) } + /// Returns a vector of states directly reachable from the given state in one transition. #[inline] - pub fn transitions_vec(&self) -> Vec { - self.transitions_iter().collect() + pub fn direct_states_vec(&self, state: State) -> Vec { + self.direct_states(state).collect() } - #[inline] - pub fn transitions_from_state_enumerate_iter( - &self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] - .iter() - .filter(|s| !self.removed_states.contains(s.0)) + /// Returns a vector of transitions to the given state. + pub fn transitions_to_vec(&self, state: State) -> Vec<(State, Condition)> { + let mut in_transitions = vec![]; + for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { + if to_state == state { + in_transitions.push((*from_state, condition)); + break; + } + } + } + in_transitions } + /// Returns a vector of transitions from the given state. #[inline] - pub fn transitions_from_state_enumerate_iter_mut( - &mut self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] - .iter_mut() - .filter(|s| !self.removed_states.contains(s.0)) + pub fn transitions_from_vec(&self, state: State) -> Vec<(Condition, State)> { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() } + /// Returns an iterator over transitions from the given state. #[inline] - pub fn transitions_from_state_enumerate_vec( + pub fn transitions_from( &self, - from_state: &State, - ) -> Vec<(State, Condition)> { - self.transitions[*from_state] + state: State, + ) -> impl Iterator { + self.transitions[state] .iter() - .map(|(s, c)| (*s, c.clone())) - .filter(|s| !self.removed_states.contains(&s.0)) - .collect() + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) } + /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] - pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { + pub fn has_transition(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { return false; } @@ -173,111 +178,76 @@ impl FastAutomaton { .collect() } - #[inline] - pub fn transitions_from_state_enumerate_into_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions - .get(*from_state) // Assume transitions is a map; adjust accordingly. 
- .into_iter() // Creates an iterator over Option<&V> - .flat_map(|transitions| transitions.iter()) // Flattens into Iterator - .filter(move |(state, _)| !self.removed_states.contains(state)) // Filters out removed states - .map(|(state, condition)| (*state, condition.clone())) // Creates owned data; adjust if cloning is expensive - } - - #[inline] - pub fn transitions_from_state_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions[*from_state] - .keys() - .cloned() - .filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state(&self, from_state: &State) -> Vec { - self.transitions_from_state_iter(from_state).collect() - } - - #[inline] - pub fn transitions_from_state_into_iter<'a>( - &'a self, - from_state: &State, - ) -> impl Iterator + 'a { - self.transitions[*from_state] - .clone() - .into_iter() - .filter(|s| !self.removed_states.contains(&s.0)) - } - + /// Returns the number of states in the automaton. #[inline] pub fn get_number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } + /// Returns a reference to the condition of the directed transition between the two states, if any. #[inline] - pub fn get_condition(&self, from_state: &State, to_state: &State) -> Option<&Condition> { - self.transitions[*from_state].get(to_state) + pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { + self.transitions[from_state].get(&to_state) } + /// Returns the start state. #[inline] pub fn get_start_state(&self) -> State { self.start_state } - #[inline] - pub fn get_removed_states(&self) -> &IntSet { - &self.removed_states - } - + /// Returns a reference to the set of accept (final) states. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states } + /// Returns a reference to the automaton's spanning set. #[inline] pub fn get_spanning_set(&self) -> &SpanningSet { &self.spanning_set } + /// Returns `true` if the given state is one of the accept states. #[inline] - pub fn is_accepted(&self, state: &State) -> bool { - self.accept_states.contains(state) + pub fn is_accepted(&self, state: State) -> bool { + self.accept_states.contains(&state) } + /// Returns `true` if the automaton is deterministic. #[inline] - pub fn is_determinitic(&self) -> bool { + pub fn is_deterministic(&self) -> bool { self.deterministic } + /// Returns `true` if the automaton contains at least one cycle. #[inline] pub fn is_cyclic(&self) -> bool { self.cyclic } + /// Returns `true` if the automaton contains the given state. #[inline] pub fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } - pub fn match_string(&self, input: &str) -> bool { + /// Returns `true` if the automaton matches the given string. 
+ pub fn is_match(&self, string: &str) -> bool { let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); worklist.push_back((0, &self.start_state)); while let Some((position, current_state)) = worklist.pop_back() { - if input.len() == position { + if string.len() == position { if self.accept_states.contains(current_state) { return true; } continue; } - let curr_char = input.chars().nth(position).unwrap() as u32; - for (to_state, cond) in self.transitions_from_state_enumerate_iter(current_state) { + let curr_char = string.chars().nth(position).unwrap() as u32; + for (cond, to_state) in self.transitions_from(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { - if position + 1 == input.len() { + if position + 1 == string.len() { if self.accept_states.contains(to_state) { return true; } @@ -290,9 +260,16 @@ impl FastAutomaton { false } + /// Returns the automaton's DOT representation. + #[inline] + pub fn as_dot(&self) -> String { + format!("{self}") + } + + /// Prints the automaton's DOT representation. #[inline] - pub fn to_dot(&self) { - println!("{}", self); + pub fn print_dot(&self) { + println!("{self}"); } } @@ -315,4 +292,15 @@ mod tests { assert!(automaton.is_total()); Ok(()) } + + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } } diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs deleted file mode 100644 index 06c386e..0000000 --- a/src/fast_automaton/operation/alternation.rs +++ /dev/null @@ -1,274 +0,0 @@ -use std::hash::BuildHasherDefault; - -use condition::converter::ConditionConverter; - -use crate::error::EngineError; - -use super::*; - -impl FastAutomaton { - pub fn union(&self, that: &FastAutomaton) -> Result { - let mut union = self.clone(); - union.alternate(that)?; - Ok(union) - } - - pub fn alternation(automatons: Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } - let mut new_automaton = FastAutomaton::new_empty(); - if automatons.is_empty() { - return Ok(new_automaton); - } - for automaton in automatons { - new_automaton.alternate(&automaton)?; - } - Ok(new_automaton) - } - - fn prepare_start_states( - &mut self, - other: &FastAutomaton, - new_states: &mut IntMap, - condition_converter: &ConditionConverter, - ) -> Result, EngineError> { - let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - let self_start_state_in_degree = self.in_degree(self.start_state); - let other_start_state_in_degree = other.in_degree(other.start_state); - if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { - // The start states can be the same state without any consequence - new_states.insert(other.start_state, self.start_state); - imcomplete_states.insert(self.start_state); - } else { - if self_start_state_in_degree != 0 { - let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - - for (to_state, cond) in self.transitions_from_state_enumerate_vec(&self.start_state) - { - self.add_transition_to(new_state, to_state, &cond); - } - self.start_state = new_state; - } - if other_start_state_in_degree != 0 { - let new_state = self.new_state(); - if other.is_accepted(&other.start_state) { - self.accept(new_state); - self.accept(self.start_state); - } - - new_states.insert(other.start_state, new_state); - imcomplete_states.insert(new_state); - - for 
(other_to_state, cond) in - other.transitions_from_state_enumerate_vec(&other.start_state) - { - let cond = condition_converter.convert(&cond)?; - let to_state = match new_states.entry(other_to_state) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let new_state = self.new_state(); - imcomplete_states.insert(new_state); - v.insert(new_state); - new_state - } - }; - self.add_transition_to(self.start_state, to_state, &cond); - } - } - } - Ok(imcomplete_states) - } - - fn prepare_accept_states( - &mut self, - other: &FastAutomaton, - new_states: &mut IntMap, - imcomplete_states: &IntSet, - ) { - let mut self_accept_states_without_outgoing_edges = vec![]; - for &state in &self.accept_states { - if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { - self_accept_states_without_outgoing_edges.push(state); - } - } - let accept_state_without_outgoing_edges = - match self_accept_states_without_outgoing_edges.len() { - 1 => self_accept_states_without_outgoing_edges[0], - n if n > 1 => { - let new_state = self.new_state(); - self.accept(new_state); - - for &accept_state in &self_accept_states_without_outgoing_edges { - for (from_state, condition) in self.in_transitions(accept_state) { - self.add_transition_to(from_state, new_state, &condition); - } - self.remove_state(accept_state); - } - new_state - } - _ => { - let new_state = self.new_state(); - self.accept(new_state); - new_state - } - }; - - for &state in &other.accept_states { - if other.out_degree(state) == 0 { - new_states - .entry(state) - .or_insert(accept_state_without_outgoing_edges); - } else if new_states.get(&state).is_none() { - let new_accept_state = self.new_state(); - self.accept(new_accept_state); - new_states.insert(state, new_accept_state); - } - } - } - - /* Important things to remember before modifying this method: - * - the start states can't be merged if they have incoming edges - * - the accept states can't be merged if they have outgoing edges - */ - fn alternate(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { - if other.is_empty() || self.is_total() { - return Ok(()); - } else if other.is_total() { - self.make_total(); - return Ok(()); - } else if self.is_empty() { - self.apply_model(other); - return Ok(()); - } - - let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); - self.apply_new_spanning_set(new_spanning_set)?; - let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; - - let mut new_states: IntMap = IntMap::with_capacity_and_hasher( - other.get_number_of_states(), - BuildHasherDefault::default(), - ); - - let imcomplete_states = - self.prepare_start_states(other, &mut new_states, &condition_converter)?; - self.prepare_accept_states(other, &mut new_states, &imcomplete_states); - - for from_state in other.transitions_iter() { - let new_from_state = match new_states.entry(from_state) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let new_state = self.new_state(); - v.insert(new_state); - new_state - } - }; - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { - let new_condition = condition_converter.convert(condition)?; - let new_to_state = match new_states.entry(*to_state) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let new_state = self.new_state(); - v.insert(new_state); - new_state - } - }; - self.add_transition_to(new_from_state, new_to_state, &new_condition); - } - } - self.cyclic = self.cyclic || other.cyclic; - Ok(()) - } -} - -#[cfg(test)] -mod 
tests { - use crate::regex::RegularExpression; - - #[test] - fn test_simple_alternation_regex_1() -> Result<(), String> { - let automaton = RegularExpression::new("(abc|ac|aaa)") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("abc")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_2() -> Result<(), String> { - let automaton = RegularExpression::new("(b?|b{2})") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("b")); - assert!(automaton.match_string("bb")); - assert!(!automaton.match_string("bbb")); - assert!(!automaton.match_string("bbbb")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_3() -> Result<(), String> { - let automaton = RegularExpression::new("((a|bc)*|d)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_4() -> Result<(), String> { - let automaton = RegularExpression::new("(a+(ba+)*|ca*c)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("cc")); - assert!(automaton.match_string("caaac")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aababa")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_5() -> Result<(), String> { - let automaton = RegularExpression::new("((aad|ads|a)*|q)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("q")); - assert!(automaton.match_string("aad")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aadadsaaa")); - assert!(!automaton.match_string("aaaas")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("adsq")); - assert!(!automaton.match_string("qq")); - Ok(()) - } -} diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs new file mode 100644 index 0000000..7eac73f --- /dev/null +++ b/src/fast_automaton/operation/concat.rs @@ -0,0 +1,426 @@ +use std::hash::BuildHasherDefault; + +use condition::converter::ConditionConverter; + +use crate::error::EngineError; + +use super::*; + +impl FastAutomaton { + /// Computes the concatenation between `self` and `other`. + pub fn concat(&self, other: &FastAutomaton) -> Result { + Self::concat_all([self, other]) + } + + /// Computes the concatenation of all automata in the given iterator. 
+ pub fn concat_all<'a, I: IntoIterator>(automata: I) -> Result + { + let mut new_automaton = FastAutomaton::new_empty_string(); + for automaton in automata { + new_automaton.concat_mut(automaton)?; + } + + Ok(new_automaton) + } + + pub(crate) fn concat_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + if other.is_empty() { + return Ok(()); + } + if self.is_empty() { + self.apply_model(other); + return Ok(()); + } + + let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); + self.apply_new_spanning_set(new_spanning_set)?; + let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; + + let mut new_states: IntMap = IntMap::with_capacity_and_hasher( + other.get_number_of_states(), + BuildHasherDefault::default(), + ); + + let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 + && self + .accept_states + .iter() + .cloned() + .any(|s| self.out_degree(s) > 0); + + let accept_states = self.accept_states.iter().cloned().collect::>(); + + self.accept_states.clear(); + + if other.accept_states.contains(&other.start_state) { + for &accept_state in accept_states.iter() { + self.accept(accept_state); + } + } + + if start_state_and_accept_states_not_mergeable { + let new_start_state = new_states + .entry(other.start_state) + .or_insert(self.new_state()); + if other.accept_states.contains(&other.start_state) { + self.accept(*new_start_state); + } + } + + for from_state in other.states() { + let new_from_states = match new_states.entry(from_state) { + Entry::Occupied(o) => { + vec![*o.get()] + } + Entry::Vacant(v) => { + if from_state == other.start_state { + accept_states.clone() + } else { + let new_state = self.new_state(); + if other.accept_states.contains(&from_state) { + self.accept(new_state); + } + v.insert(new_state); + vec![new_state] + } + } + }; + + for (condition, to_state) in other.transitions_from(from_state) { + let new_to_states = match new_states.entry(*to_state) { + Entry::Occupied(o) => { + vec![*o.get()] + } + Entry::Vacant(v) => { + if *to_state == other.start_state { + accept_states.clone() + } else { + let new_state = self.new_state(); + if other.accept_states.contains(to_state) { + self.accept(new_state); + } + v.insert(new_state); + vec![new_state] + } + } + }; + let projected_condition = condition_converter.convert(condition)?; + for new_from_state in new_from_states.iter() { + for new_to_state in new_to_states.iter() { + self.add_transition( + *new_from_state, + *new_to_state, + &projected_condition, + ); + } + } + } + } + + if start_state_and_accept_states_not_mergeable { + if let Some(&other_start_state) = new_states.get(&other.start_state) { + for accept_state in &accept_states { + self.add_epsilon_transition(*accept_state, other_start_state); + } + } + } + self.cyclic = self.cyclic || other.cyclic; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_simple_concatenation_regex() -> Result<(), String> { + let automaton = RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(); + + automaton.print_dot(); + assert!(automaton.is_match("abc")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_concat_alternation_regex() -> Result<(), String> { + let automaton = RegularExpression::parse("0101(abc|ac|aaa)", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("0101abc")); + 
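// (Illustrative aside, not part of the patch: `concat_mut` above glues `other` onto
// `self` by re-mapping `other`'s start state onto every accept state of `self`
// whenever that merge is safe. When `other`'s start state has incoming edges and
// some accept state of `self` has outgoing edges, the two cannot be merged, so a
// fresh state is created and reached from each old accept state through an epsilon
// transition instead. Conditions copied from `other` are first re-expressed over
// the merged spanning set via `ConditionConverter`.)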
assert!(automaton.is_match("0101ac")); + assert!(automaton.is_match("0101aaa")); + assert!(!automaton.is_match("abc")); + assert!(!automaton.is_match("0101abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_concat_repeat_regex() -> Result<(), String> { + let automaton = RegularExpression::parse("A+B*", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("AAABBB")); + assert!(automaton.is_match("AA")); + assert!(automaton.is_match("AB")); + assert!(!automaton.is_match("B")); + assert!(!automaton.is_match("ABA")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_01() -> Result<(), String> { + let automaton = RegularExpression::parse("a+", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaaaaaa")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); + + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_02() -> Result<(), String> { + let automaton = RegularExpression::parse("a*c", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("c")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aac")); + assert!(automaton.is_match("aaaaaaac")); + assert!(!automaton.is_match("abc")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_03() -> Result<(), String> { + let automaton = RegularExpression::parse("(ab){3,4}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("ababab")); + assert!(automaton.is_match("abababab")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("abab")); + assert!(!automaton.is_match("ababababab")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_04() -> Result<(), String> { + let automaton = RegularExpression::parse("a{3,}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aaaaa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_05() -> Result<(), String> { + let automaton = RegularExpression::parse("a?", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_06() -> Result<(), String> { + let automaton = RegularExpression::parse("a{0,2}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_07() -> Result<(), String> { + let automaton = RegularExpression::parse("a{1,3}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); + Ok(()) + } + + #[test] + fn 
test_simple_repeat_regex_08() -> Result<(), String> { + let automaton = RegularExpression::parse("a+(ba+)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(!automaton.is_match("")); + assert!(!automaton.is_match("aab")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("aaba")); + assert!(automaton.is_match("aabaaa")); + assert!(automaton.is_match("aaabaaabaaba")); + assert!(!automaton.is_match("aaabbaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_09() -> Result<(), String> { + let automaton = RegularExpression::parse("(ac|ads|a)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("acaadsac")); + assert!(automaton.is_match("adsaaaaaaaacaa")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("c")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_10() -> Result<(), String> { + let automaton = RegularExpression::parse("(ef|ads|a)+", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("ef")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("efadsa")); + assert!(automaton.is_match("aaadsefef")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("e")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_11() -> Result<(), String> { + let automaton = RegularExpression::parse("(a|bc)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("bc")); + assert!(automaton.is_match("abcbca")); + assert!(automaton.is_match("bcabcbcaaaa")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("c")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_12() -> Result<(), String> { + let automaton = RegularExpression::parse("([ab]*a)?", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_13() -> Result<(), String> { + let automaton = RegularExpression::parse("([ab]*a)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); + Ok(()) + } + + #[test] + fn test_simple_repeat_right_number_of_states_1() -> Result<(), String> { + let automaton = 
RegularExpression::parse("a*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert_eq!(1, automaton.get_number_of_states()); + Ok(()) + } + + #[test] + fn test_simple_concat_right_number_of_states_2() -> Result<(), String> { + let automaton = RegularExpression::parse("(a*bc)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert_eq!(3, automaton.get_number_of_states()); + Ok(()) + } +} +//(a|bc)* diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs deleted file mode 100644 index 3741e01..0000000 --- a/src/fast_automaton/operation/concatenate.rs +++ /dev/null @@ -1,520 +0,0 @@ -use std::hash::BuildHasherDefault; - -use condition::converter::ConditionConverter; - -use crate::error::EngineError; - -use super::*; - -impl FastAutomaton { - pub fn concatenate(automatons: Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } - let mut new_automaton = FastAutomaton::new_empty_string(); - if automatons.is_empty() { - return Ok(new_automaton); - } - for automaton in automatons { - new_automaton.concat(&automaton)?; - } - - Ok(new_automaton) - } - - pub fn repeat(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { - if let Some(max) = max_opt { - if min > max { - self.make_empty(); - return Ok(()); - } - } - - let automaton_to_repeat = self.clone(); - - if min == 0 && self.in_degree(self.start_state) != 0 { - let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - - for to_state in self.transitions_from_state(&self.start_state) { - self.add_epsilon(new_state, to_state); - } - self.start_state = new_state; - - if max_opt.is_none() { - for accept_state in self.accept_states.clone() { - self.add_epsilon(accept_state, self.start_state); - } - self.accept(self.start_state); - return Ok(()); - } - } - - if let Some(max) = max_opt { - if min <= 1 && max == 1 { - if min == 0 { - self.accept_states.insert(self.start_state); - } - return Ok(()); - } - } - - let iter = if min == 0 { 0..0 } else { 0..min - 1 }; - for _ in iter { - self.concat(&automaton_to_repeat)?; - } - - if max_opt.is_none() { - let mut automaton_to_repeat = automaton_to_repeat.clone(); - - let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); - if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 - { - automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); - let old_start_state = automaton_to_repeat.start_state; - automaton_to_repeat.start_state = accept_state; - automaton_to_repeat.remove_state(old_start_state); - } else { - let t = Self::transitions_from_state_set( - &automaton_to_repeat.transitions, - automaton_to_repeat.start_state, - ); - let transitions = - Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); - - for state in automaton_to_repeat.accept_states.clone() { - for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition_to(state, *to_state, condition); - } - } - - automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); - } - automaton_to_repeat.cyclic = true; - - if min == 0 { - self.apply_model(&automaton_to_repeat); - } else { - self.concat(&automaton_to_repeat)?; - } - - return Ok(()); - } - - let mut end_states = self.accept_states.iter().cloned().collect::>(); - for _ in 
cmp::max(min, 1)..max_opt.unwrap() { - self.concat(&automaton_to_repeat)?; - end_states.extend(self.accept_states.iter()); - } - self.accept_states.extend(end_states); - if min == 0 { - self.accept(self.start_state); - } - Ok(()) - } - - fn concat(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { - if other.is_empty() { - return Ok(()); - } - if self.is_empty() { - self.apply_model(other); - return Ok(()); - } - - let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); - self.apply_new_spanning_set(new_spanning_set)?; - let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; - - let mut new_states: IntMap = IntMap::with_capacity_and_hasher( - other.get_number_of_states(), - BuildHasherDefault::default(), - ); - - let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 - && self - .accept_states - .iter() - .cloned() - .any(|s| self.out_degree(s) > 0); - - let accept_states = self.accept_states.iter().cloned().collect::>(); - - self.accept_states.clear(); - - if other.accept_states.contains(&other.start_state) { - for &accept_state in accept_states.iter() { - self.accept(accept_state); - } - } - - if start_state_and_accept_states_not_mergeable { - let new_start_state = new_states - .entry(other.start_state) - .or_insert(self.new_state()); - if other.accept_states.contains(&other.start_state) { - self.accept(*new_start_state); - } - } - - for from_state in other.transitions_iter() { - let new_from_states = match new_states.entry(from_state) { - Entry::Occupied(o) => { - vec![*o.get()] - } - Entry::Vacant(v) => { - if from_state == other.start_state { - accept_states.clone() - } else { - let new_state = self.new_state(); - if other.accept_states.contains(&from_state) { - self.accept(new_state); - } - v.insert(new_state); - vec![new_state] - } - } - }; - - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { - let new_to_states = match new_states.entry(*to_state) { - Entry::Occupied(o) => { - vec![*o.get()] - } - Entry::Vacant(v) => { - if *to_state == other.start_state { - accept_states.clone() - } else { - let new_state = self.new_state(); - if other.accept_states.contains(to_state) { - self.accept(new_state); - } - v.insert(new_state); - vec![new_state] - } - } - }; - let projected_condition = condition_converter.convert(condition)?; - for new_from_state in new_from_states.iter() { - for new_to_state in new_to_states.iter() { - self.add_transition_to( - *new_from_state, - *new_to_state, - &projected_condition, - ); - } - } - } - } - - if start_state_and_accept_states_not_mergeable { - if let Some(&other_start_state) = new_states.get(&other.start_state) { - for accept_state in &accept_states { - self.add_epsilon(*accept_state, other_start_state); - } - } - } - self.cyclic = self.cyclic || other.cyclic; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - #[test] - fn test_simple_concatenation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("abc") - .unwrap() - .to_automaton() - .unwrap(); - - automaton.to_dot(); - assert!(automaton.match_string("abc")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_concat_alternation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("0101(abc|ac|aaa)") - .unwrap() - .to_automaton() - .unwrap(); - 
assert!(automaton.match_string("0101abc")); - assert!(automaton.match_string("0101ac")); - assert!(automaton.match_string("0101aaa")); - assert!(!automaton.match_string("abc")); - assert!(!automaton.match_string("0101abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_concat_repeat_regex() -> Result<(), String> { - let automaton = RegularExpression::new("A+B*") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("AAABBB")); - assert!(automaton.match_string("AA")); - assert!(automaton.match_string("AB")); - assert!(!automaton.match_string("B")); - assert!(!automaton.match_string("ABA")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_01() -> Result<(), String> { - let automaton = RegularExpression::new("a+") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaaaaaa")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); - - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_02() -> Result<(), String> { - let automaton = RegularExpression::new("a*c") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("c")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aac")); - assert!(automaton.match_string("aaaaaaac")); - assert!(!automaton.match_string("abc")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_03() -> Result<(), String> { - let automaton = RegularExpression::new("(ab){3,4}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("ababab")); - assert!(automaton.match_string("abababab")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("abab")); - assert!(!automaton.match_string("ababababab")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_04() -> Result<(), String> { - let automaton = RegularExpression::new("a{3,}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aaaaa")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_05() -> Result<(), String> { - let automaton = RegularExpression::new("a?") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_06() -> Result<(), String> { - let automaton = RegularExpression::new("a{0,2}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_07() -> Result<(), String> { - let automaton = RegularExpression::new("a{1,3}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("a")); - 
assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_08() -> Result<(), String> { - let automaton = RegularExpression::new("a+(ba+)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(!automaton.match_string("")); - assert!(!automaton.match_string("aab")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("aaba")); - assert!(automaton.match_string("aabaaa")); - assert!(automaton.match_string("aaabaaabaaba")); - assert!(!automaton.match_string("aaabbaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_09() -> Result<(), String> { - let automaton = RegularExpression::new("(ac|ads|a)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("acaadsac")); - assert!(automaton.match_string("adsaaaaaaaacaa")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("c")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_10() -> Result<(), String> { - let automaton = RegularExpression::new("(ef|ads|a)+") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("ef")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("efadsa")); - assert!(automaton.match_string("aaadsefef")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("e")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_11() -> Result<(), String> { - let automaton = RegularExpression::new("(a|bc)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("bc")); - assert!(automaton.match_string("abcbca")); - assert!(automaton.match_string("bcabcbcaaaa")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("c")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_12() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)?") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_13() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - 
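The quantifier handling exercised by the removed tests above carries over unchanged to the renamed API (`RegularExpression::parse` plus `is_match`) used in the rest of this patch. A minimal sketch in the style of the new in-crate tests; the boolean flag passed to `parse` is copied from those tests and its exact meaning is not defined in this hunk, so treat it as an assumption:

```rust
use crate::regex::RegularExpression;

#[test]
fn sketch_bounded_repetition() -> Result<(), String> {
    // "(ab){3,4}" accepts exactly three or four copies of "ab".
    let automaton = RegularExpression::parse("(ab){3,4}", false)
        .unwrap()
        .to_automaton()
        .unwrap();
    assert!(automaton.is_match("ababab"));
    assert!(automaton.is_match("abababab"));
    assert!(!automaton.is_match("abab"));
    assert!(!automaton.is_match("ababababab"));

    // "a{0,2}" accepts only the empty string, "a" and "aa".
    let automaton = RegularExpression::parse("a{0,2}", false)
        .unwrap()
        .to_automaton()
        .unwrap();
    assert!(automaton.is_match("") && automaton.is_match("a") && automaton.is_match("aa"));
    assert!(!automaton.is_match("aaa"));
    Ok(())
}
```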
assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); - Ok(()) - } - - #[test] - fn test_simple_repeat_right_number_of_states_1() -> Result<(), String> { - let automaton = RegularExpression::new("a*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert_eq!(1, automaton.get_number_of_states()); - Ok(()) - } - - #[test] - fn test_simple_concat_right_number_of_states_2() -> Result<(), String> { - let automaton = RegularExpression::new("(a*bc)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert_eq!(3, automaton.get_number_of_states()); - Ok(()) - } -} -//(a|bc)* diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 3d4057b..6a578eb 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -1,79 +1,77 @@ -use ahash::HashMapExt; +use bit_set::BitSet; -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - pub fn determinize(&self) -> Result { + /// Determinizes the automaton and returns the result. + pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { - return Ok(self.clone()); + return Ok(Cow::Borrowed(self)); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); - let ranges = self.get_ranges()?; - - let initial_vec = VecDeque::from(vec![self.start_state]); + let bases = self.get_spanning_bases()?; let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); let map_capacity = (self.get_number_of_states() as f64 / 0.75).ceil() as usize; - let mut new_states = IntMap::with_capacity(map_capacity); + let mut new_states = AHashMap::with_capacity(map_capacity); + + let mut accept_states = BitSet::new(); + for &state in &self.accept_states { + accept_states.insert(state); + } let mut new_automaton = FastAutomaton::new_empty(); new_automaton.spanning_set = self.spanning_set.clone(); - worklist.push_back((vec![self.start_state], new_automaton.start_state)); - new_states.insert(Self::simple_hash(&initial_vec), new_automaton.start_state); + let mut initial_state = BitSet::new(); + initial_state.insert(self.start_state); + + worklist.push_back((initial_state.clone(), new_automaton.start_state)); + new_states.insert(initial_state, new_automaton.start_state); - let mut new_states_to_add = VecDeque::with_capacity(self.get_number_of_states()); + let mut new_states_to_add = BitSet::new(); while let Some((states, r)) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; - for state in &states { - if self.accept_states.contains(state) { - new_automaton.accept_states.insert(r); - break; - } + if !states.is_disjoint(&accept_states) { + new_automaton.accept_states.insert(r); } - for base in &ranges { + for base in &bases { for from_state in &states { - for (to_state, cond) in self.transitions_from_state_enumerate_iter(from_state) { + for (cond, to_state) in self.transitions_from(from_state) { if cond.has_intersection(base) { - match new_states_to_add.binary_search(to_state) { - Ok(_) => {} // element already in vector @ `pos` - Err(pos) => new_states_to_add.insert(pos, *to_state), - }; + new_states_to_add.insert(*to_state); } } } if !new_states_to_add.is_empty() { - let q = match 
new_states.entry(Self::simple_hash(&new_states_to_add)) { - Entry::Occupied(o) => *o.get(), + match new_states.entry(new_states_to_add.clone()) { + Entry::Occupied(o) => { + let q = *o.get(); + + new_states_to_add.clear(); + + new_automaton.add_transition(r, q, base); + } Entry::Vacant(v) => { let new_q = new_automaton.new_state(); - worklist - .push_back((new_states_to_add.iter().cloned().collect(), new_q)); v.insert(new_q); - new_q + + let new_states = std::mem::take(&mut new_states_to_add); + worklist.push_back((new_states, new_q)); + + new_automaton.add_transition(r, new_q, base); } }; - - new_automaton.add_transition_to(r, q, base); } - new_states_to_add.clear(); } } - Ok(new_automaton) - } - - fn simple_hash(list: &VecDeque) -> u64 { - let mut hasher = AHasher::default(); - for &item in list { - hasher.write_usize(item); - } - hasher.finish() + Ok(Cow::Owned(new_automaton)) } } @@ -81,20 +79,6 @@ impl FastAutomaton { mod tests { use crate::regex::RegularExpression; - #[test] - fn test_determinize_1() -> Result<(), String> { - let automaton = RegularExpression::new(".*ab") - .unwrap() - .to_automaton() - .unwrap(); - - let deterministic_automaton = automaton.determinize().unwrap(); - - assert!(deterministic_automaton.is_determinitic()); - - Ok(()) - } - #[test] fn test_determinize_regex() -> Result<(), String> { assert_determinization("(aad|ads|a)"); @@ -112,22 +96,23 @@ mod tests { fn assert_determinization(regex: &str) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - //automaton.compute_determinization_cost(); - //println!("Determinization Cost: {:?}", automaton.determinisation_cost); println!("States Before: {}", automaton.get_number_of_states()); let deterministic_automaton = automaton.determinize().unwrap(); println!( "States After: {}", deterministic_automaton.get_number_of_states() ); - assert!(deterministic_automaton.is_determinitic()); - assert!(automaton - .subtraction(&deterministic_automaton) - .unwrap() - .is_empty()); + assert!(deterministic_automaton.is_deterministic()); + //deterministic_automaton.print_dot(); + assert!( + automaton + .difference(&deterministic_automaton) + .unwrap() + .is_empty() + ); } } diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/difference.rs similarity index 78% rename from src/fast_automaton/operation/subtraction.rs rename to src/fast_automaton/operation/difference.rs index d513fbb..59ca6ed 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/difference.rs @@ -6,9 +6,8 @@ use super::*; impl FastAutomaton { fn totalize(&mut self) -> Result<(), EngineError> { - if !self.is_determinitic() { - return Err(EngineError::AutomatonShouldBeDeterministic); - } + assert!(self.is_deterministic(), "The automaton should be deterministic."); + let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = IntMap::with_capacity_and_hasher( @@ -17,9 +16,9 @@ impl FastAutomaton { ); let mut ranges = Vec::with_capacity(self.get_number_of_states()); - for from_state in self.transitions_iter() { + for from_state in self.states() { let mut new_condition = Condition::empty(&self.spanning_set); - for (_, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, _) in self.transitions_from(from_state) { new_condition = new_condition.union(condition); ranges.push(condition.to_range(self.get_spanning_set())?); } @@ -30,7 
+29,7 @@ impl FastAutomaton { } for (from_state, condition) in &transitions_to_crash_state { - self.add_transition_to(*from_state, crash_state, condition); + self.add_transition(*from_state, crash_state, condition); ranges.push(condition.to_range(self.get_spanning_set())?); } @@ -43,11 +42,12 @@ impl FastAutomaton { Ok(()) } + /// Complements the automaton; it must be deterministic. pub fn complement(&mut self) -> Result<(), EngineError> { self.totalize()?; let mut new_accept_states = IntSet::default(); - for state in self.transitions_iter() { + for state in self.states() { if self.accept_states.contains(&state) { continue; } @@ -58,7 +58,8 @@ impl FastAutomaton { Ok(()) } - pub fn subtraction(&self, other: &FastAutomaton) -> Result { + /// Computes the difference between `self` and `other`. + pub fn difference(&self, other: &FastAutomaton) -> Result { let mut complement = other.clone(); match complement.complement() { Ok(()) => self.intersection(&complement), diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 96007e6..e373c55 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,19 +1,72 @@ +use std::borrow::Cow; + +use rayon::prelude::*; + use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams}; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - pub fn intersection(&self, other: &FastAutomaton) -> Result { + /// Computes the intersection between `self` and `other`. + pub fn intersection(&self, other: &FastAutomaton) -> Result { + FastAutomaton::intersection_all([self, other]) + } + + /// Computes the intersection of all automata in the given iterator. + pub fn intersection_all<'a, I: IntoIterator>( + automata: I, + ) -> Result { + let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); + + for automaton in automata { + result = result.intersection_internal(automaton)?; + + if result.is_empty() { + break; + } + } + + Ok(result.into_owned()) + } + + /// Computes in parallel the intersection of all automata in the given iterator. 
+ pub fn intersection_all_par<'a, I: IntoParallelIterator>( + automata: I, + ) -> Result { + let execution_profile = ExecutionProfile::get(); + + let total = FastAutomaton::new_total(); + + automata + .into_par_iter() + .try_fold( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) + }, + ) + .try_reduce( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) + }, + ) + } + + fn intersection_internal<'a>( + &self, + other: &'a FastAutomaton, + ) -> Result, EngineError> { if self.is_empty() || other.is_empty() { - return Ok(Self::new_empty()); + return Ok(Cow::Owned(Self::new_empty())); } else if self.is_total() { - return Ok(other.clone()); + return Ok(Cow::Borrowed(other)); } else if other.is_total() { - return Ok(self.clone()); + return Ok(Cow::Owned(self.clone())); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); @@ -48,8 +101,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -64,22 +117,23 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } new_automaton.spanning_set = new_spanning_set; new_automaton.remove_dead_transitions(); - Ok(new_automaton) + Ok(Cow::Owned(new_automaton)) } + /// Returns `true` if the two automata have a non-empty intersection. 
pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); } else if self.is_total() || other.is_total() { return Ok(true); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); @@ -114,8 +168,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -130,7 +184,7 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } @@ -141,11 +195,11 @@ impl FastAutomaton { &self, state: State, condition_converter: &ConditionConverter, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self - .transitions_from_state_enumerate_iter(&state) - .map(|(&s, c)| match condition_converter.convert(c) { - Ok(condition) => Ok((s, condition)), + .transitions_from(state) + .map(|(c, &s)| match condition_converter.convert(c) { + Ok(condition) => Ok((condition, s)), Err(err) => Err(err), }) .collect(); @@ -160,100 +214,100 @@ mod tests { #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } #[test] fn test_simple_intersection_regex_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(!intersection.match_string("a")); - assert!(!intersection.match_string("b")); + assert!(intersection.is_match("")); + assert!(!intersection.is_match("a")); + assert!(!intersection.is_match("b")); Ok(()) } #[test] fn test_simple_intersection_regex_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - 
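Taken together, the new boolean operations compose as shown below. This is a usage sketch in the style of the in-crate tests, assuming the module paths used elsewhere in this patch (`crate::fast_automaton::FastAutomaton`, `crate::regex::RegularExpression`) and that a `Vec` satisfies the parallel fold's iterator bound; note that `determinize` now returns a `Cow` (borrowed when the input is already deterministic) and that `difference` complements its right-hand operand, which must therefore be deterministic:

```rust
use crate::fast_automaton::FastAutomaton;
use crate::regex::RegularExpression;

#[test]
fn sketch_boolean_operations() -> Result<(), String> {
    let a = RegularExpression::parse("x*", false).unwrap().to_automaton().unwrap();
    let b = RegularExpression::parse("(xxx)*", false).unwrap().to_automaton().unwrap();

    // Pairwise intersection: x* intersected with (xxx)* is (xxx)*.
    let ab = a.intersection(&b).unwrap();
    assert!(ab.is_match("xxxxxx") && !ab.is_match("xx"));

    // Boolean check for whether the two languages share at least one string.
    assert!(a.has_intersection(&b).unwrap());

    // `determinize` hands back a Cow: already-deterministic inputs are borrowed.
    let ab_det = ab.determinize().unwrap();
    assert!(ab_det.is_deterministic());

    // `difference` complements its (deterministic) argument, then intersects;
    // (xxx)* minus (xxx)* is therefore empty.
    assert!(ab.difference(&ab_det).unwrap().is_empty());

    // The n-ary fold and the rayon-based parallel fold describe the same language.
    let all = FastAutomaton::intersection_all([&a, &b]).unwrap();
    let all_par = FastAutomaton::intersection_all_par(vec![&a, &b]).unwrap();
    assert_eq!(all.is_match("xxx"), all_par.is_match("xxx"));
    Ok(())
}
```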
assert!(intersection.match_string("")); - assert!(intersection.match_string("xxx")); - assert!(intersection.match_string("xxxxxx")); - assert!(!intersection.match_string("xx")); - assert!(!intersection.match_string("xxxx")); + assert!(intersection.is_match("")); + assert!(intersection.is_match("xxx")); + assert!(intersection.is_match("xxxxxx")); + assert!(!intersection.is_match("xx")); + assert!(!intersection.is_match("xxxx")); Ok(()) } #[test] fn test_complex_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*(abc|ac|aaa)") + let automaton1 = RegularExpression::parse(".*(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("aaac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("aaac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } #[test] fn test_complex_intersection_regex_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])") + let automaton1 = RegularExpression::parse("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", false) .unwrap() .to_automaton().unwrap(); - let automaton2 = RegularExpression::new("avb@.*") + let automaton2 = RegularExpression::parse("avb@.*", false) .unwrap() .to_automaton() .unwrap(); - automaton1.to_dot(); - automaton2.to_dot(); + automaton1.print_dot(); + automaton2.print_dot(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert!(!intersection.is_empty()); - assert!(intersection.match_string("avb@gmail.com")); + assert!(intersection.is_match("avb@gmail.com")); Ok(()) } } diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 7c7c0f1..37241a8 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -1,22 +1,21 @@ -use std::{cmp, hash::Hasher}; - -use ahash::AHasher; +use std::cmp; use super::*; -mod alternation; -mod concatenate; +mod concat; mod determinize; +mod difference; mod intersection; -mod subtraction; +mod repeat; +mod union; impl FastAutomaton { - pub fn remove_dead_transitions(&mut self) { + pub(crate) fn 
remove_dead_transitions(&mut self) { if !self.is_empty() { - let reacheable_states = self.get_reacheable_states(); + let reacheable_states = self.get_reachable_states(); let mut dead_states = IntSet::default(); - for from_state in self.transitions_iter() { + for from_state in self.states() { if !reacheable_states.contains(&from_state) { dead_states.insert(from_state); } @@ -34,17 +33,17 @@ mod tests { #[test] fn test_remove_dead_states() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert_eq!(3, intersection.get_number_of_states()); - assert_eq!(3, intersection.get_reacheable_states().len()); + assert_eq!(3, intersection.get_reachable_states().len()); Ok(()) } } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs new file mode 100644 index 0000000..f9e256d --- /dev/null +++ b/src/fast_automaton/operation/repeat.rs @@ -0,0 +1,127 @@ +use super::*; + +impl FastAutomaton { + /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + let mut automaton = self.clone(); + if let Err(error) = automaton.repeat_mut(min, max_opt) { + Err(error) + } else { + Ok(automaton) + } + } + + pub(crate) fn repeat_mut(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { + if let Some(max) = max_opt { + if min > max { + self.make_empty(); + return Ok(()); + } + } + + let automaton_to_repeat = self.clone(); + + if min == 0 && self.in_degree(self.start_state) != 0 { + let new_state = self.new_state(); + if self.is_accepted(self.start_state) { + self.accept(new_state); + } + + self.add_epsilon_transition(new_state, self.start_state); + self.start_state = new_state; + + if max_opt.is_none() { + for accept_state in self.accept_states.clone() { + self.add_epsilon_transition(accept_state, self.start_state); + } + self.accept(self.start_state); + return Ok(()); + } + } + + if let Some(max) = max_opt { + if min <= 1 && max == 1 { + if min == 0 { + self.accept_states.insert(self.start_state); + } + return Ok(()); + } + } + + let iter = if min == 0 { 0..0 } else { 0..min - 1 }; + for _ in iter { + self.concat_mut(&automaton_to_repeat)?; + } + + if max_opt.is_none() { + let mut automaton_to_repeat = automaton_to_repeat.clone(); + + let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); + if automaton_to_repeat.accept_states.len() == 1 + && automaton_to_repeat.out_degree(accept_state) == 0 + && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 + { + automaton_to_repeat + .add_epsilon_transition(accept_state, automaton_to_repeat.start_state); + let old_start_state = automaton_to_repeat.start_state; + automaton_to_repeat.start_state = accept_state; + automaton_to_repeat.remove_state(old_start_state); + } else { + let t = Self::transitions_from_state_set( + &automaton_to_repeat.transitions, + automaton_to_repeat.start_state, + ); + let transitions = + Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); + + for state in automaton_to_repeat.accept_states.clone() { + for &(to_state, condition) in &transitions 
{ + automaton_to_repeat.add_transition(state, *to_state, condition); + } + } + + automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); + } + automaton_to_repeat.cyclic = true; + + if min == 0 { + self.apply_model(&automaton_to_repeat); + } else { + self.concat_mut(&automaton_to_repeat)?; + } + + return Ok(()); + } + + let mut end_states = self.accept_states.iter().cloned().collect::>(); + for _ in cmp::max(min, 1)..max_opt.unwrap() { + self.concat_mut(&automaton_to_repeat)?; + end_states.extend(self.accept_states.iter()); + } + self.accept_states.extend(end_states); + if min == 0 { + self.accept(self.start_state); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_repeat_1() -> Result<(), String> { + let automaton = RegularExpression::parse("(a*,a*)?", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("")); + assert!(automaton.is_match(",")); + assert!(automaton.is_match("aaa,")); + assert!(automaton.is_match("aaaa,aa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + Ok(()) + } +} diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs new file mode 100644 index 0000000..96480ea --- /dev/null +++ b/src/fast_automaton/operation/union.rs @@ -0,0 +1,374 @@ +use std::hash::BuildHasherDefault; + +use condition::converter::ConditionConverter; +use rayon::prelude::*; + +use crate::{error::EngineError, execution_profile::ExecutionProfile}; + +use super::*; + +impl FastAutomaton { + /// Computes the union between `self` and `other`. + pub fn union(&self, other: &FastAutomaton) -> Result { + Self::union_all([self, other]) + } + + /// Computes the union of all automata in the given iterator. + pub fn union_all<'a, I: IntoIterator>( + automata: I, + ) -> Result { + let mut new_automaton = FastAutomaton::new_empty(); + for automaton in automata { + new_automaton.union_mut(automaton)?; + } + Ok(new_automaton) + } + + /// Computes in parallel the union of all automata in the given iterator. 
+ pub fn union_all_par<'a, I: IntoParallelIterator>( + automata: I, + ) -> Result { + let execution_profile = ExecutionProfile::get(); + + let empty = FastAutomaton::new_empty(); + + automata + .into_par_iter() + .try_fold( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(next)?; + Ok(acc) + }) + }, + ) + .try_reduce( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(&next)?; + Ok(acc) + }) + }, + ) + } + + fn prepare_start_states( + &mut self, + other: &FastAutomaton, + new_states: &mut IntMap, + condition_converter: &ConditionConverter, + ) -> Result, EngineError> { + let mut imcomplete_states = + IntSet::with_capacity(other.out_degree(other.start_state) + 1); + if other.is_accepted(other.start_state) { + self.accept(self.start_state); + } + let self_start_state_in_degree = self.in_degree(self.start_state); + let other_start_state_in_degree = other.in_degree(other.start_state); + if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { + // The start states can be the same state without any consequence + new_states.insert(other.start_state, self.start_state); + imcomplete_states.insert(self.start_state); + } else { + if self_start_state_in_degree != 0 { + let new_state = self.new_state(); + + self.add_epsilon_transition(new_state, self.start_state); + self.start_state = new_state; + new_states.insert(other.start_state, self.start_state); + imcomplete_states.insert(self.start_state); + } + if other_start_state_in_degree != 0 { + let new_state = self.new_state(); + if other.is_accepted(other.start_state) { + self.accept(new_state); + } + + new_states.insert(other.start_state, new_state); + imcomplete_states.insert(new_state); + + for (cond, other_to_state) in other.transitions_from_vec(other.start_state) { + let cond = condition_converter.convert(&cond)?; + let to_state = match new_states.entry(other_to_state) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let new_state = self.new_state(); + imcomplete_states.insert(new_state); + v.insert(new_state); + new_state + } + }; + self.add_transition(self.start_state, to_state, &cond); + } + } + } + Ok(imcomplete_states) + } + + fn prepare_accept_states( + &mut self, + other: &FastAutomaton, + new_states: &mut IntMap, + imcomplete_states: &IntSet, + ) { + let mut self_accept_states_without_outgoing_edges = vec![]; + for &state in &self.accept_states { + if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { + self_accept_states_without_outgoing_edges.push(state); + } + } + let accept_state_without_outgoing_edges = + match self_accept_states_without_outgoing_edges.len() { + 1 => Some(self_accept_states_without_outgoing_edges[0]), + n if n > 1 => { + let new_state = self.new_state(); + self.accept(new_state); + + for &accept_state in &self_accept_states_without_outgoing_edges { + for (from_state, condition) in self.transitions_to_vec(accept_state) { + self.add_transition(from_state, new_state, &condition); + } + self.remove_state(accept_state); + } + Some(new_state) + } + _ => None, + }; + + for &state in &other.accept_states { + match accept_state_without_outgoing_edges { + Some(accept_state) if other.out_degree(state) == 0 => { + new_states.entry(state).or_insert(accept_state); + } + _ => { + if new_states.get(&state).is_none() { + let new_accept_state = self.new_state(); + self.accept(new_accept_state); + new_states.insert(state, new_accept_state); + } + } + } + } + } + + /* Important things to remember before modifying this 
method: + * - the start states can't be merged if they have incoming edges + * - the accept states can't be merged if they have outgoing edges + */ + pub(crate) fn union_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + if other.is_empty() || self.is_total() { + return Ok(()); + } else if other.is_total() { + self.make_total(); + return Ok(()); + } else if self.is_empty() { + self.apply_model(other); + return Ok(()); + } + + let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); + self.apply_new_spanning_set(new_spanning_set)?; + let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; + + let mut new_states: IntMap = IntMap::with_capacity_and_hasher( + other.get_number_of_states(), + BuildHasherDefault::default(), + ); + + let imcomplete_states = + self.prepare_start_states(other, &mut new_states, &condition_converter)?; + self.prepare_accept_states(other, &mut new_states, &imcomplete_states); + + for from_state in other.states() { + let new_from_state = match new_states.entry(from_state) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let new_state = self.new_state(); + v.insert(new_state); + new_state + } + }; + for (condition, to_state) in other.transitions_from(from_state) { + let new_condition = condition_converter.convert(condition)?; + let new_to_state = match new_states.entry(*to_state) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let new_state = self.new_state(); + v.insert(new_state); + new_state + } + }; + self.add_transition(new_from_state, new_to_state, &new_condition); + } + } + self.cyclic = self.cyclic || other.cyclic; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_simple_alternation_regex_1() -> Result<(), String> { + let automaton = RegularExpression::parse("(abc|ac|aaa)", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("abc")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_2() -> Result<(), String> { + let automaton = RegularExpression::parse("(b?|b{2})", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("b")); + assert!(automaton.is_match("bb")); + assert!(!automaton.is_match("bbb")); + assert!(!automaton.is_match("bbbb")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3() -> Result<(), String> { + let automaton = RegularExpression::parse("((a|bc)*|d)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3b() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|(a|bc)*)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + 
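The union tests in this file drive `union_mut` indirectly through the regex parser; the public entry points can also be called directly and combined with the new `repeat` operation from the previous file. A rough sketch under the same assumptions as the earlier ones (in-crate test, module paths as in this patch); `None` as the upper bound means unbounded repetition and `min > max` yields the empty language, as in `repeat_mut` above:

```rust
use crate::fast_automaton::FastAutomaton;
use crate::regex::RegularExpression;

#[test]
fn sketch_repeat_and_union() -> Result<(), String> {
    let ab = RegularExpression::parse("ab", false).unwrap().to_automaton().unwrap();
    let cd = RegularExpression::parse("cd", false).unwrap().to_automaton().unwrap();

    // (ab){2,3}: bounded repetition; min > max collapses to the empty language.
    let repeated = ab.repeat(2, Some(3)).unwrap();
    assert!(repeated.is_match("abab") && repeated.is_match("ababab"));
    assert!(!repeated.is_match("ab") && !repeated.is_match("abababab"));
    assert!(ab.repeat(3, Some(2)).unwrap().is_empty());

    // (ab)*: unbounded repetition.
    let star = ab.repeat(0, None).unwrap();
    assert!(star.is_match("") && star.is_match("ababab"));

    // Binary, n-ary and rayon-parallel unions.
    let either = ab.union(&cd).unwrap();
    assert!(either.is_match("ab") && either.is_match("cd") && !either.is_match("abcd"));
    let all = FastAutomaton::union_all([&ab, &cd, &star]).unwrap();
    let all_par = FastAutomaton::union_all_par(vec![&ab, &cd, &star]).unwrap();
    assert_eq!(all.is_match("abab"), all_par.is_match("abab"));
    Ok(())
}
```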
assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3t() -> Result<(), String> { + let automaton = RegularExpression::parse("(d*|(a|bc)*)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ddd")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_4() -> Result<(), String> { + let automaton = RegularExpression::parse("(a+(ba+)*|ca*c)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("cc")); + assert!(automaton.is_match("caaac")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aababa")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_5() -> Result<(), String> { + let automaton = RegularExpression::parse("((aad|ads|a)*|q)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("q")); + assert!(automaton.is_match("aad")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aadadsaaa")); + assert!(!automaton.is_match("aaaas")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("adsq")); + assert!(!automaton.is_match("qq")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_6() -> Result<(), String> { + let automaton = RegularExpression::parse("(ab|)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("aab")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_7() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|a?|ab)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_8() -> Result<(), String> { + let automaton = RegularExpression::parse("((d|a?|ab)u)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("au")); + assert!(automaton.is_match("du")); + assert!(automaton.is_match("abu")); + assert!(automaton.is_match("u")); + assert!(automaton.is_match("")); + Ok(()) + } +} diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer.rs deleted file mode 100644 index 017341b..0000000 --- a/src/fast_automaton/serializer.rs +++ /dev/null @@ -1,225 +0,0 @@ -use super::*; -use lazy_static::lazy_static; -use rand::Rng; -use serde::{de, ser, Deserializer, Serializer}; -use serde::{Deserialize, Serialize}; -use std::env; -use z85::{decode, encode}; -use crate::tokenizer::Tokenizer; - -use sha2::{Digest, Sha256}; - -use aes_gcm_siv::{ - aead::{Aead, KeyInit}, - Aes256GcmSiv, Nonce, -}; -use flate2::read::ZlibDecoder; -use flate2::write::ZlibEncoder; -use flate2::Compression; -use std::io::prelude::*; - -use crate::tokenizer::token::{automaton_token::AutomatonToken, Token}; - -pub struct FastAutomatonReader { - cipher: Aes256GcmSiv, -} - -impl FastAutomatonReader { - pub fn new() -> Self { - let 
env_var = env::var("RS_FAIR_SECRET_KEY").unwrap_or("DEFAULT PASSKEY".to_string()); - let key = Sha256::digest(env_var.as_bytes()); - FastAutomatonReader { - cipher: Aes256GcmSiv::new(&key), - } - } - - pub fn random_nonce() -> [u8; 12] { - let mut nonce = [0u8; 12]; - rand::thread_rng().fill(&mut nonce); - nonce - } -} - -lazy_static! { - static ref SINGLETON_INSTANCE: FastAutomatonReader = FastAutomatonReader::new(); -} - -fn get_fast_automaton_reader() -> &'static FastAutomatonReader { - &SINGLETON_INSTANCE -} - -#[derive(Serialize, Deserialize, Debug)] -struct SerializedAutomaton(Vec, SpanningSet); - -impl serde::Serialize for FastAutomaton { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let tokenizer = Tokenizer::new(self); - match AutomatonToken::to_fair_tokens(&tokenizer.to_embedding()) { - Ok(tokens) => { - let serialized_automaton = - SerializedAutomaton(tokens, self.get_spanning_set().clone()); - - let mut serialized = Vec::with_capacity(self.get_number_of_states() * 8); - if let Err(err) = ciborium::into_writer(&serialized_automaton, &mut serialized) { - return Err(ser::Error::custom(err.to_string())); - } - - serialized = compress_data(&serialized); - - let nonce = FastAutomatonReader::random_nonce(); - - match get_fast_automaton_reader() - .cipher - .encrypt(Nonce::from_slice(&nonce), serialized.as_ref()) - { - Ok(ciphertext) => { - let mut encrypted = Vec::from_iter(nonce); - encrypted.extend(ciphertext); - - serializer.serialize_str(&encode(&encrypted)) - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } - } -} - -impl<'de> serde::Deserialize<'de> for FastAutomaton { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - match String::deserialize(deserializer) { - Ok(decoded) => match decode(decoded) { - Ok(encrypted) => { - let nonce = &encrypted[0..12]; - let payload = encrypted[12..].to_vec(); - let cipher_result = get_fast_automaton_reader() - .cipher - .decrypt(Nonce::from_slice(nonce), payload.as_ref()); - - match cipher_result { - Ok(cipher_result) => { - let decrypted = decompress_data(&cipher_result); - - let automaton: Result< - SerializedAutomaton, - ciborium::de::Error, - > = ciborium::from_reader(&decrypted[..]); - match automaton { - Ok(automaton) => { - let mut temp_automaton = FastAutomaton::new_empty(); - temp_automaton.spanning_set = automaton.1; - let tokenizer = Tokenizer::new(&temp_automaton); - - match tokenizer.from_embedding( - &automaton - .0 - .into_iter() - .map(AutomatonToken::from_fair_token) - .collect::>(), - ) { - Ok(res) => Ok(res), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - }, - Err(err) => Err(err), - } - } -} - -fn compress_data(data: &[u8]) -> Vec { - let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default()); - encoder.write_all(data).expect("Failed to write data"); - encoder.finish().expect("Failed to finish compression") -} - -fn decompress_data(data: &[u8]) -> Vec { - let mut decoder = ZlibDecoder::new(data); - let mut decompressed_data = Vec::new(); - decoder - .read_to_end(&mut decompressed_data) - .expect("Failed to read data"); - decompressed_data -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_serialization() -> Result<(), 
String> { - assert_serialization("..."); - assert_serialization(".*abc"); - assert_serialization(".*"); - assert_serialization(".*abcdef.*dsqd"); - assert_serialization( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,2}", - ); - assert_serialization("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); - - Ok(()) - } - - fn assert_serialization(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{regex}"); - - let automaton = regex.to_automaton().unwrap(); - - let serialized = serde_json::to_string(&automaton).unwrap(); - println!("{serialized}"); - - let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); - - let unserialized = unserialized.determinize().unwrap(); - let automaton = automaton.determinize().unwrap(); - - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); - } - - #[test] - fn test_serialization_case_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*") - .unwrap() - .to_automaton() - .unwrap(); - let automaton2 = RegularExpression::new("\\d+") - .unwrap() - .to_automaton() - .unwrap() - .determinize() - .unwrap(); - - let subtraction = automaton1.subtraction(&automaton2).unwrap(); - - let serialized = serde_json::to_string(&subtraction).unwrap(); - println!("{serialized}"); - - let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); - - let unserialized = unserialized.determinize().unwrap(); - let automaton = subtraction.determinize().unwrap(); - - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); - - Ok(()) - } -} diff --git a/src/fast_automaton/serializer/mod.rs b/src/fast_automaton/serializer/mod.rs new file mode 100644 index 0000000..7a40bae --- /dev/null +++ b/src/fast_automaton/serializer/mod.rs @@ -0,0 +1,181 @@ +use crate::fast_automaton::serializer::tokenizer::token::automaton_token::AutomatonToken; +use crate::fast_automaton::serializer::tokenizer::Tokenizer; + +use super::*; +use serde::{Deserialize, Serialize}; +use serde::{Deserializer, Serializer, de, ser}; + +use z85::{decode, encode}; + +use flate2::Compression; +use flate2::read::ZlibDecoder; +use flate2::write::ZlibEncoder; +use std::io::prelude::*; + +#[cfg(feature = "serializable")] +pub mod tokenizer; + +#[derive(Serialize, Deserialize, Debug)] +struct SerializedAutomaton(Vec, SpanningSet, usize); + +impl serde::Serialize for FastAutomaton { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let tokenizer = Tokenizer::new(self); + let number_of_states = self.get_number_of_states(); + match AutomatonToken::to_tokens( + &tokenizer.to_embedding(), + self.get_spanning_set().get_number_of_spanning_ranges(), + number_of_states, + ) { + Ok(tokens) => { + let serialized_automaton = + SerializedAutomaton(tokens, self.get_spanning_set().clone(), number_of_states); + + let mut serialized = Vec::with_capacity(number_of_states * 8); + if let Err(err) = 
ciborium::into_writer(&serialized_automaton, &mut serialized) { + return Err(ser::Error::custom(err.to_string())); + } + + serializer.serialize_str(&encode(compress_data(&serialized))) + } + Err(err) => Err(ser::Error::custom(err.to_string())), + } + } +} + +impl<'de> serde::Deserialize<'de> for FastAutomaton { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + match String::deserialize(deserializer) { + Ok(decoded) => match decode(decoded) { + Ok(compressed) => { + let payload = decompress_data(&compressed); + + let automaton: Result< + SerializedAutomaton, + ciborium::de::Error, + > = ciborium::from_reader(&payload[..]); + match automaton { + Ok(automaton) => { + let mut temp_automaton = FastAutomaton::new_empty(); + temp_automaton.spanning_set = automaton.1; + let number_of_states = automaton.2; + let number_of_bases = + temp_automaton.spanning_set.get_number_of_spanning_ranges(); + let tokenizer = Tokenizer::new(&temp_automaton); + + match tokenizer.from_embedding( + &automaton + .0 + .into_iter() + .map(|t| { + AutomatonToken::from_token( + t, + number_of_bases, + number_of_states, + ) + }) + .collect::>(), + ) { + Ok(res) => Ok(res), + Err(err) => Err(de::Error::custom(err.to_string())), + } + } + Err(err) => Err(de::Error::custom(err.to_string())), + } + } + Err(err) => Err(de::Error::custom(err.to_string())), + }, + Err(err) => Err(err), + } + } +} + +fn compress_data(data: &[u8]) -> Vec { + let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default()); + encoder.write_all(data).expect("Failed to write data"); + encoder.finish().expect("Failed to finish compression") +} + +fn decompress_data(data: &[u8]) -> Vec { + let mut decoder = ZlibDecoder::new(data); + let mut decompressed_data = Vec::new(); + decoder + .read_to_end(&mut decompressed_data) + .expect("Failed to read data"); + decompressed_data +} + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + use super::*; + + #[test] + fn test_serialization() -> Result<(), String> { + assert_serialization("..."); + assert_serialization(".*abc"); + assert_serialization(".*"); + assert_serialization(".*abcdef.*dsqd"); + assert_serialization( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,2}", + ); + assert_serialization( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", + ); + + Ok(()) + } + + fn assert_serialization(regex: &str) { + let regex = RegularExpression::parse(regex, false).unwrap(); + println!("{regex}"); + + let automaton = regex.to_automaton().unwrap(); + + let serialized = serde_json::to_string(&automaton).unwrap(); + println!("{serialized}"); + + let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); + + let unserialized = unserialized.determinize().unwrap(); + let automaton = automaton.determinize().unwrap(); + + assert!(automaton.difference(&unserialized).unwrap().is_empty()); + assert!(unserialized.difference(&automaton).unwrap().is_empty()); + } + + #[test] + fn test_serialization_case_1() -> Result<(), String> { + let automaton1 = RegularExpression::parse(".*", 
false) + .unwrap() + .to_automaton() + .unwrap(); + let automaton2 = RegularExpression::parse("\\d+", false) + .unwrap() + .to_automaton() + .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); + + let difference = automaton1.difference(&automaton2).unwrap(); + + let serialized = serde_json::to_string(&difference).unwrap(); + println!("{serialized}"); + + let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); + + let unserialized = unserialized.determinize().unwrap(); + let automaton = difference.determinize().unwrap(); + + assert!(automaton.difference(&unserialized).unwrap().is_empty()); + assert!(unserialized.difference(&automaton).unwrap().is_empty()); + + Ok(()) + } +} diff --git a/src/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs similarity index 57% rename from src/tokenizer/embed_automaton.rs rename to src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 79697dd..bfe8197 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{error::EngineError, fast_automaton::condition::Condition}; +use crate::{error::EngineError, fast_automaton::{condition::Condition, serializer::tokenizer::token::automaton_token::AutomatonToken}, CharRange}; use self::token::range_token::RangeToken; @@ -16,26 +16,25 @@ impl Tokenizer<'_> { worklist.push_front(self.automaton.get_start_state()); while let Some(current_state) = worklist.pop_back() { + if !seen.insert(current_state) { + continue; + } if !vec.is_empty() { // separator vec.push(AutomatonToken::SeparatorState) } - seen.insert(current_state); // state let embedded_state = AutomatonToken::State(*self.state_to_token.get(¤t_state).unwrap()); vec.push(embedded_state); - if self.automaton.is_accepted(¤t_state) { + if self.automaton.is_accepted(current_state) { // accept state vec.push(AutomatonToken::AcceptState) } - for (to_state, condition) in self - .automaton - .transitions_from_state_enumerate_iter(¤t_state) - { + for (condition, to_state) in self.automaton.transitions_from(current_state) { if condition.is_empty() { continue; } @@ -73,24 +72,24 @@ impl Tokenizer<'_> { let mut from_state = None; let mut to_state = None; - let mut range = Range::empty(); + let mut range = CharRange::empty(); for token in vec { match token { AutomatonToken::Range(r) => { range = range.union(self.range_tokenizer.token_to_range(r).unwrap()); } AutomatonToken::State(s) => { - while !automaton.has_state((*s).into()) { + while !automaton.has_state(*s) { automaton.new_state(); } if let Some(fs) = from_state { if let Some(ts) = to_state { Self::apply_transition(&mut automaton, fs, ts, &range)?; - range = Range::empty(); + range = CharRange::empty(); } - to_state = Some((*s).into()); + to_state = Some(*s); } else { - from_state = Some((*s).into()); + from_state = Some(*s); } } AutomatonToken::AcceptState => { @@ -107,7 +106,7 @@ impl Tokenizer<'_> { } from_state = None; to_state = None; - range = Range::empty(); + range = CharRange::empty(); } _ => return Err(EngineError::TokenError(TokenError::UnknownToken)), }; @@ -122,92 +121,72 @@ impl Tokenizer<'_> { automaton: &mut FastAutomaton, from_state: State, to_state: State, - range: &Range, + range: &CharRange, ) -> Result<(), EngineError> { let condition = Condition::from_range(range, automaton.get_spanning_set())?; - automaton.add_transition_to(from_state, to_state, &condition); + automaton.add_transition(from_state, to_state, 
&condition); Ok(()) } } #[cfg(test)] mod tests { - use embed_automaton::token::Token; - use crate::regex::RegularExpression; use super::*; #[test] fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion_for_fair_and_ai("(a|b)"); - assert_embedding_convertion_for_fair_and_ai("(|a)"); - assert_embedding_convertion_for_fair_and_ai(".*ab"); - assert_embedding_convertion_for_fair_and_ai("toto"); - assert_embedding_convertion_for_fair_and_ai(".{2,3}"); - assert_embedding_convertion_for_fair_and_ai("q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair_and_ai(".*q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair( + assert_embedding_convertion("(a|b)"); + assert_embedding_convertion("(|a)"); + assert_embedding_convertion(".*ab"); + assert_embedding_convertion("toto"); + assert_embedding_convertion(".{2,3}"); + assert_embedding_convertion("q(ab|ca|ab|abc)x"); + assert_embedding_convertion(".*q(ab|ca|ab|abc)x"); + assert_embedding_convertion( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); - assert_embedding_convertion_for_fair("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); + assert_embedding_convertion( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", + ); Ok(()) } - fn assert_embedding_convertion_for_fair(regex: &str) { - assert_embedding_convertion(regex, true); - } - - fn assert_embedding_convertion_for_fair_and_ai(regex: &str) { - assert_embedding_convertion(regex, false); - } - - fn assert_embedding_convertion(regex: &str, ignore_ai: bool) { - let regex = RegularExpression::new(regex).unwrap(); + fn assert_embedding_convertion(regex: &str) { + let regex = RegularExpression::parse(regex, false).unwrap(); println!("{}", regex); - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); + let automaton = regex.to_automaton().unwrap(); + let automaton = automaton.determinize().unwrap(); let tokenizer = Tokenizer::new(&automaton); let embedding = tokenizer.to_embedding(); - // FAIR - let embedding_u16 = AutomatonToken::to_fair_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u16 + let number_of_bases = automaton.get_spanning_set().get_number_of_spanning_ranges(); + let number_of_states = automaton.get_number_of_states(); + + let embedding_usize = + AutomatonToken::to_tokens(&embedding, number_of_bases, number_of_states).unwrap(); + let embedding: Vec = embedding_usize .iter() - .map(|&t| AutomatonToken::from_fair_token(t)) + .map(|&t| AutomatonToken::from_token(t, number_of_bases, number_of_states)) .collect(); let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - assert!(automaton - 
.subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); - - if !ignore_ai { - // AI - let embedding_u8 = AutomatonToken::to_ai_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u8 - .iter() - .map(|&t| AutomatonToken::from_ai_token(t)) - .collect(); - - let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - - assert!(automaton - .subtraction(&unembedded_automaton) + assert!( + automaton + .difference(&unembedded_automaton) .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) + .is_empty() + ); + assert!( + unembedded_automaton + .difference(&automaton) .unwrap() - .is_empty()); - } + .is_empty() + ); } } diff --git a/src/tokenizer/mod.rs b/src/fast_automaton/serializer/tokenizer/mod.rs similarity index 80% rename from src/tokenizer/mod.rs rename to src/fast_automaton/serializer/tokenizer/mod.rs index 2e3e4ed..8dccca2 100644 --- a/src/tokenizer/mod.rs +++ b/src/fast_automaton/serializer/tokenizer/mod.rs @@ -1,19 +1,15 @@ use std::{cmp::Ordering, collections::VecDeque, vec}; -use ahash::HashMapExt; +use crate::fast_automaton::serializer::tokenizer::range_tokenizer::RangeTokenizer; use crate::fast_automaton::spanning_set::SpanningSet; -use crate::Range; - use crate::{ - fast_automaton::{FastAutomaton, State}, IntMap, IntSet, + fast_automaton::{FastAutomaton, State}, }; +use ahash::HashMapExt; -use self::{range_tokenizer::RangeTokenizer, token::automaton_token::AutomatonToken}; mod embed_automaton; -mod embed_regex; -mod embed_regex_operations; pub mod range_tokenizer; pub mod token; @@ -21,7 +17,7 @@ pub mod token; pub struct Tokenizer<'a> { range_tokenizer: RangeTokenizer<'a>, automaton: &'a FastAutomaton, - state_to_token: IntMap, + state_to_token: IntMap, } impl Tokenizer<'_> { @@ -31,7 +27,7 @@ impl Tokenizer<'_> { worklist.push_front(automaton.get_start_state()); - let mut state_counter: u16 = 0; + let mut state_counter = 0; let mut state_to_token = IntMap::with_capacity(automaton.get_number_of_states()); while let Some(current_state) = worklist.pop_back() { @@ -43,9 +39,9 @@ impl Tokenizer<'_> { state_counter += 1; automaton - .transitions_from_state_enumerate_iter(¤t_state) - .filter(|(_, c)| !c.is_empty()) - .for_each(|(to_state, _)| { + .transitions_from(current_state) + .filter(|(c, _)| !c.is_empty()) + .for_each(|(_, to_state)| { if !seen.contains(to_state) { worklist.push_front(*to_state); } diff --git a/src/tokenizer/range_tokenizer.rs b/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs similarity index 87% rename from src/tokenizer/range_tokenizer.rs rename to src/fast_automaton/serializer/tokenizer/range_tokenizer.rs index 3950033..e3b3c9c 100644 --- a/src/tokenizer/range_tokenizer.rs +++ b/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs @@ -1,3 +1,5 @@ +use crate::CharRange; + use self::token::range_token::RangeToken; use super::*; @@ -5,7 +7,7 @@ use super::*; #[derive(Debug)] pub struct RangeTokenizer<'a> { spanning_set: &'a SpanningSet, - total: Range, + total: CharRange, } impl RangeTokenizer<'_> { @@ -21,7 +23,7 @@ impl RangeTokenizer<'_> { } } - pub fn range_to_embedding(&self, range: &Range) -> Option> { + pub fn range_to_embedding(&self, range: &CharRange) -> Option> { if range == &self.total { return Some(vec![RangeToken::Total]); } else if !range.difference(&self.total).is_empty() { @@ -39,12 +41,12 @@ impl RangeTokenizer<'_> { Some(vec) } - pub fn embedding_to_range(&self, vec: 
&[RangeToken]) -> Option { + pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { if vec.is_empty() { - return Some(Range::empty()); + return Some(CharRange::empty()); } - let mut range = Range::empty(); + let mut range = CharRange::empty(); if vec[0] == RangeToken::Total { return Some(self.total.clone()); } @@ -60,7 +62,7 @@ impl RangeTokenizer<'_> { Some(range) } - pub fn token_to_range(&self, token: &RangeToken) -> Option<&Range> { + pub fn token_to_range(&self, token: &RangeToken) -> Option<&CharRange> { match token { RangeToken::Total => Some(&self.total), RangeToken::Base(b) => self.spanning_set.get_spanning_range(*b), diff --git a/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs new file mode 100644 index 0000000..2e68ded --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs @@ -0,0 +1,94 @@ +use self::range_token::RangeToken; + +use super::*; + +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum AutomatonToken { + Range(RangeToken), + State(usize), + AcceptState, + SeparatorState, + Error, +} + +impl Ord for AutomatonToken { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (AutomatonToken::Range(a), AutomatonToken::Range(b)) => a.cmp(b), + (AutomatonToken::Range(_), _) => Ordering::Less, + (_, AutomatonToken::Range(_)) => Ordering::Greater, + + (AutomatonToken::State(a), AutomatonToken::State(b)) => a.cmp(b), + (AutomatonToken::State(_), _) => Ordering::Less, + (_, AutomatonToken::State(_)) => Ordering::Greater, + + (AutomatonToken::AcceptState, AutomatonToken::AcceptState) => Ordering::Equal, + (AutomatonToken::AcceptState, _) => Ordering::Less, + (_, AutomatonToken::AcceptState) => Ordering::Greater, + + (AutomatonToken::SeparatorState, AutomatonToken::SeparatorState) => Ordering::Equal, + (AutomatonToken::SeparatorState, _) => Ordering::Less, + (_, AutomatonToken::SeparatorState) => Ordering::Greater, + + (AutomatonToken::Error, AutomatonToken::Error) => Ordering::Equal, + } + } +} + +impl PartialOrd for AutomatonToken { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl AutomatonToken { + pub fn from_token( + token: usize, + number_of_bases: usize, + number_of_states: usize, + ) -> AutomatonToken { + let states = number_of_bases + 1; + let accept_state = states + number_of_states; + let separator_state = accept_state + 1; + if (0..states).contains(&token) { + AutomatonToken::Range(RangeToken::from_token(token, number_of_bases)) + } else if (states..accept_state).contains(&token) { + AutomatonToken::State(token - states) + } else if token == accept_state { + AutomatonToken::AcceptState + } else if token == separator_state { + AutomatonToken::SeparatorState + } else { + AutomatonToken::Error + } + } + + pub fn to_token( + &self, + number_of_bases: usize, + number_of_states: usize, + ) -> Result { + let states = number_of_bases + 1; + let accept_state = states + number_of_states; + let separator_state = accept_state + 1; + Ok(match self { + AutomatonToken::Range(r) => r.to_token(number_of_bases)?, + AutomatonToken::State(s) => s + states, + AutomatonToken::AcceptState => accept_state, + AutomatonToken::SeparatorState => separator_state, + AutomatonToken::Error => return Err(TokenError::UnknownToken), + }) + } + + pub fn to_tokens( + tokens: &[Self], + number_of_bases: usize, + number_of_states: usize, + ) -> Result, TokenError> { + let mut vec = Vec::with_capacity(tokens.len()); + for token in 
tokens { + vec.push(token.to_token(number_of_bases, number_of_states)?); + } + Ok(vec) + } +} diff --git a/src/fast_automaton/serializer/tokenizer/token/mod.rs b/src/fast_automaton/serializer/tokenizer/token/mod.rs new file mode 100644 index 0000000..c510dd4 --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/mod.rs @@ -0,0 +1,26 @@ +use std::fmt::Display; + +use super::*; + +pub mod automaton_token; +pub mod range_token; + +#[derive(Debug, PartialEq, Eq)] +pub enum TokenError { + TokenOutOfBound(&'static str, usize, usize), + UnknownToken, + SyntaxError, +} + +impl Display for TokenError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TokenError::TokenOutOfBound(token, expected, got) => write!( + f, + "TokenOutOfBound: {token}, expected: {expected}, got: {got}." + ), + TokenError::UnknownToken => write!(f, "UnknownToken"), + TokenError::SyntaxError => write!(f, "SyntaxError"), + } + } +} \ No newline at end of file diff --git a/src/fast_automaton/serializer/tokenizer/token/range_token.rs b/src/fast_automaton/serializer/tokenizer/token/range_token.rs new file mode 100644 index 0000000..20ed515 --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/range_token.rs @@ -0,0 +1,55 @@ +use super::*; + +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum RangeToken { + Total, + Base(usize), + Error, +} + +impl Ord for RangeToken { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (RangeToken::Total, RangeToken::Total) => Ordering::Equal, + (RangeToken::Total, _) => Ordering::Less, + (_, RangeToken::Total) => Ordering::Greater, + (RangeToken::Base(a), RangeToken::Base(b)) => a.cmp(b), + (RangeToken::Base(_), RangeToken::Error) => Ordering::Less, + (RangeToken::Error, RangeToken::Base(_)) => Ordering::Greater, + (RangeToken::Error, RangeToken::Error) => Ordering::Equal, + } + } +} + +impl PartialOrd for RangeToken { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl RangeToken { + pub fn from_token(token: usize, number_of_bases: usize) -> RangeToken { + let max_number_of_bases = number_of_bases + 1; + if token == 0 { + RangeToken::Total + } else if (1..max_number_of_bases).contains(&token) { + RangeToken::Base(token - 1) + } else { + RangeToken::Error + } + } + + pub fn to_token(&self, number_of_bases: usize) -> Result { + let max_number_of_bases = number_of_bases + 1; + Ok(match self { + RangeToken::Total => 0, + RangeToken::Base(b) => { + if *b > max_number_of_bases { + return Err(TokenError::TokenOutOfBound("Base", max_number_of_bases, *b)); + } + b + 1 + } + RangeToken::Error => return Err(TokenError::UnknownToken), + }) + } +} diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 2aa2780..bdb9d9c 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -1,22 +1,24 @@ use std::slice::Iter; use ahash::AHashSet; -use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serde")] + +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; -/// Contains a set of [`RangeSet`] that span all the transition of a [`crate::FastAutomaton`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +use crate::CharRange; + +/// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. 
+#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, Debug, PartialEq, Eq)] -pub struct SpanningSet(Vec>, RangeSet); +pub struct SpanningSet(Vec, CharRange); impl SpanningSet { pub fn new_empty() -> Self { - SpanningSet(vec![], RangeSet::total()) + SpanningSet(vec![], CharRange::total()) } pub fn new_total() -> Self { - SpanningSet(vec![RangeSet::total()], RangeSet::empty()) + SpanningSet(vec![CharRange::total()], CharRange::empty()) } pub fn is_empty(&self) -> bool { @@ -35,7 +37,7 @@ impl SpanningSet { } } - pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec> { + pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec { if self.1.is_empty() { self.0.clone() } else { @@ -45,7 +47,7 @@ impl SpanningSet { } } - pub fn get_spanning_ranges(&self) -> Iter> { + pub fn get_spanning_ranges(&self) -> Iter { self.0.iter() } @@ -53,14 +55,15 @@ impl SpanningSet { self.0.len() } - pub fn get_spanning_range(&self, i: usize) -> Option<&RangeSet> { + pub fn get_spanning_range(&self, i: usize) -> Option<&CharRange> { self.0.get(i) } - pub fn get_rest(&self) -> &RangeSet { + pub fn get_rest(&self) -> &CharRange { &self.1 } + /// Compute a new minimal spanning set by merging the provided spanning set. pub fn merge(&self, other: &Self) -> Self { let mut ranges = Vec::with_capacity(self.0.len() + other.0.len()); ranges.extend_from_slice(&self.0); @@ -69,8 +72,9 @@ impl SpanningSet { Self::compute_spanning_set(&ranges) } - pub fn compute_spanning_set(ranges: &[RangeSet]) -> Self { - let mut spanning_ranges: Vec> = ranges.to_vec(); + /// Compute a new minimal spanning set for the provided ranges. + pub fn compute_spanning_set(ranges: &[CharRange]) -> Self { + let mut spanning_ranges: Vec = ranges.to_vec(); spanning_ranges.sort_unstable(); spanning_ranges.dedup(); @@ -87,13 +91,13 @@ impl SpanningSet { let other_set = spanning_ranges.swap_remove(index); let intersection_set = set.intersection(&other_set); new_spanning_ranges.insert(intersection_set); - let subtraction_set = set.difference(&other_set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = set.difference(&other_set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } - let subtraction_set = other_set.difference(&set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = other_set.difference(&set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } changed = true; } else if !set.is_empty() { @@ -105,7 +109,7 @@ impl SpanningSet { spanning_ranges.sort_unstable(); - let mut total = RangeSet::empty(); + let mut total = CharRange::empty(); for base in &spanning_ranges { total = total.union(base); } diff --git a/src/lib.rs b/src/lib.rs index 91493c7..de45599 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,131 +1,271 @@ use std::{ borrow::Cow, collections::{HashMap, HashSet}, + fmt::Display, hash::BuildHasherDefault, }; use cardinality::Cardinality; use error::EngineError; -use execution_profile::ThreadLocalParams; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; +use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; +use crate::execution_profile::ExecutionProfile; + pub mod cardinality; pub mod error; pub mod execution_profile; pub mod fast_automaton; pub mod regex; -pub mod 
tokenizer; -type IntMap = HashMap>>; -type IntSet = HashSet>>; -type Range = RangeSet; +pub type IntMap = HashMap>>; +pub type IntSet = HashSet>>; +pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. /// -/// To put constraint and limitation on the execution of operations please refer to [`execution_profile::ExecutionProfile`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +/// # Example +/// ```rust +/// use regexsolver::Term; +/// use regexsolver::error::EngineError; +/// +/// fn main() -> Result<(), EngineError> { +/// // Create terms from regex +/// let t1 = Term::from_pattern("abc.*")?; +/// let t2 = Term::from_pattern(".*xyz")?; +/// +/// // Concatenate +/// let concat = t1.concat(&[t2])?; +/// assert_eq!(concat.to_pattern(), "abc.*xyz"); +/// +/// // Union +/// let union = t1.union(&[Term::from_pattern("fgh")?])?; +/// assert_eq!(union.to_pattern(), "(abc.*|fgh)"); +/// +/// // Intersection +/// let inter = Term::from_pattern("(ab|xy){2}")? +/// .intersection(&[Term::from_pattern(".*xy")?])?; +/// assert_eq!(inter.to_pattern(), "(ab|xy)xy"); +/// +/// // Difference +/// let diff = Term::from_pattern("a*")? +/// .difference(&Term::from_pattern("")?)?; +/// assert_eq!(diff.to_pattern(), "a+"); +/// +/// // Repetition +/// let rep = Term::from_pattern("abc")? +/// .repeat(2, Some(4))?; +/// assert_eq!(rep.to_pattern(), "(abc){2,4}"); +/// +/// // Analyze +/// assert_eq!(rep.get_length(), (Some(6), Some(12))); +/// assert!(!rep.is_empty()); +/// +/// // Generate examples +/// let samples = Term::from_pattern("(x|y){1,3}")? +/// .generate_strings(5)?; +/// println!("Some matches: {:?}", samples); +/// +/// // Equivalence & subset +/// let a = Term::from_pattern("a+")?; +/// let b = Term::from_pattern("a*")?; +/// assert!(!a.equivalent(&b)?); +/// assert!(a.subset(&b)?); +/// +/// Ok(()) +/// } +/// # main(); +/// ``` +/// +/// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] pub enum Term { - #[cfg_attr(feature = "serde", serde(rename = "regex"))] + #[cfg_attr(feature = "serializable", serde(rename = "regex"))] RegularExpression(RegularExpression), - #[cfg_attr(feature = "serde", serde(rename = "fair"))] + #[cfg_attr(feature = "serializable", serde(rename = "fair"))] Automaton(FastAutomaton), } +impl Display for Term { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Term::RegularExpression(regular_expression) => write!(f, "{regular_expression}"), + Term::Automaton(fast_automaton) => write!(f, "{fast_automaton}"), + } + } +} + impl Term { - /// Create a term based on the given pattern. + /// Creates a term that matches the empty language. + pub fn new_empty() -> Self { + Term::RegularExpression(RegularExpression::new_empty()) + } + + /// Creates a term that matches all possible strings. + pub fn new_total() -> Self { + Term::RegularExpression(RegularExpression::new_total()) + } + + /// Creates a term that only matches the empty string `""`. 
+ pub fn new_empty_string() -> Self { + Term::RegularExpression(RegularExpression::new_empty_string()) + } + + /// Parses and simplifies the provided pattern and returns a new [`Term`] holding the resulting [`RegularExpression`]. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex(".*abc.*").unwrap(); + /// let term = Term::from_pattern(".*abc.*").unwrap(); /// ``` - pub fn from_regex(regex: &str) -> Result { - Ok(Term::RegularExpression(RegularExpression::new(regex)?)) + pub fn from_pattern(pattern: &str) -> Result { + Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) } - /// Compute the union of the given collection of terms. - /// Returns the resulting term. + /// Creates a new `Term` holding the provided [`RegularExpression`]. + pub fn from_regex(regex: RegularExpression) -> Self { + Term::RegularExpression(regex) + } + + /// Creates a new `Term` holding the provided [`FastAutomaton`]. + pub fn from_automaton(automaton: FastAutomaton) -> Self { + Term::Automaton(automaton) + } + + /// Computes the concatenation of the given terms. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("abc").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); - /// let term3 = Term::from_regex("fghi").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("d.").unwrap(); + /// let term3 = Term::from_pattern(".*").unwrap(); /// - /// let union = term1.union(&[term2, term3]).unwrap(); + /// let concat = term1.concat(&[term2, term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = union { - /// assert_eq!("(abc|de|fghi)", regex.to_string()); + /// if let Term::RegularExpression(regex) = concat { + /// assert_eq!("abcd.+", regex.to_string()); /// } /// ``` - pub fn union(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - + pub fn concat(&self, terms: &[Term]) -> Result { let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); + let mut has_automaton = false; match self { Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.clone(); + return_regex = regular_expression.clone() } Term::Automaton(fast_automaton) => { + has_automaton = true; return_automaton = fast_automaton.clone(); } } - for operand in terms { - match operand { - Term::RegularExpression(regex) => { - return_regex = return_regex.union(regex); - if return_regex.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + for term in terms { + if has_automaton { + return_automaton = return_automaton.concat(term.to_automaton()?.as_ref())?; + } else { + match term { + Term::RegularExpression(regular_expression) => { + return_regex = return_regex.concat(regular_expression, true); } - } - Term::Automaton(automaton) => { - return_automaton = return_automaton.union(automaton)?; - if return_automaton.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = return_regex.to_automaton()?.concat(fast_automaton)?; } } } } - if return_automaton.is_empty() { + if !has_automaton { Ok(Term::RegularExpression(return_regex)) } else { - if !return_regex.is_empty() { - return_automaton = return_automaton.union(&return_regex.to_automaton()?)?; + Ok(Term::Automaton(return_automaton)) + } + } + + /// Computes the union of the given terms. 
+ /// + /// # Example: + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); + /// let term3 = Term::from_pattern("fghi").unwrap(); + /// + /// let union = term1.union(&[term2, term3]).unwrap(); + /// + /// if let Term::RegularExpression(regex) = union { + /// assert_eq!("(abc|de|fghi)", regex.to_string()); + /// } + /// ``` + pub fn union(&self, terms: &[Term]) -> Result { + if self.is_total() { + return Ok(Term::new_total()); + } + + let mut has_automaton = matches!(self, Term::Automaton(_)); + if !has_automaton { + for term in terms { + if term.is_total() { + return Ok(Term::new_total()); + } + if matches!(term, Term::Automaton(_)) { + has_automaton = true; + break; + } } + } + + if has_automaton { + let parallel = terms.len() > 3; + + let automaton_list = self.get_automata(terms, parallel)?; - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + let return_automaton = if parallel { + FastAutomaton::union_all_par(automaton_list) } else { - Ok(Term::Automaton(return_automaton)) - } + FastAutomaton::union_all(automaton_list) + }?; + + Ok(Term::Automaton(return_automaton)) + } else { + let regexes_list = self + .get_regexes(terms) + .expect("No automaton should be here so this operation is not supposed to fail."); + + let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); + + Ok(Term::RegularExpression(RegularExpression::union_all( + regexes_list, + ))) } } - /// Compute the intersection of the given collection of terms. - /// Returns the resulting term. + /// Computes the intersection of the given terms. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de){2}").unwrap(); - /// let term2 = Term::from_regex("de.*").unwrap(); - /// let term3 = Term::from_regex(".*abc").unwrap(); + /// let term1 = Term::from_pattern("(abc|de){2}").unwrap(); + /// let term2 = Term::from_pattern("de.*").unwrap(); + /// let term3 = Term::from_pattern(".*abc").unwrap(); /// /// let intersection = term1.intersection(&[term2, term3]).unwrap(); /// @@ -134,222 +274,264 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - let mut return_automaton = self.get_automaton()?; - for term in terms { - let automaton = term.get_automaton()?; - return_automaton = Cow::Owned(return_automaton.intersection(&automaton)?); - if return_automaton.is_empty() { - return Ok(Term::RegularExpression(RegularExpression::new_empty())); - } + if self.is_empty() || terms.iter().any(|t| t.is_empty()) { + return Ok(Term::new_empty()); } - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + let parallel = terms.len() > 3; + + let automaton_list = self.get_automata(terms, parallel)?; + + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + let return_automaton = if parallel { + FastAutomaton::intersection_all_par(automaton_list) } else { - Ok(Term::Automaton(return_automaton.into_owned())) - } + FastAutomaton::intersection_all(automaton_list) + }?; + + Ok(Term::Automaton(return_automaton)) } - /// Compute the subtraction/difference of the two given terms. - /// Returns the resulting term. + /// Computes the difference between `self` and `other`. 
/// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); /// - /// let subtraction = term1.subtraction(&term2).unwrap(); + /// let difference = term1.difference(&term2).unwrap(); /// - /// if let Term::RegularExpression(regex) = subtraction { + /// if let Term::RegularExpression(regex) = difference { /// assert_eq!("abc", regex.to_string()); /// } /// ``` - pub fn subtraction(&self, subtrahend: &Term) -> Result { - let minuend_automaton = self.get_automaton()?; - let subtrahend_automaton = subtrahend.get_automaton()?; + pub fn difference(&self, other: &Term) -> Result { + let minuend_automaton = self.to_automaton()?; + let subtrahend_automaton = other.to_automaton()?; let subtrahend_automaton = Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; - let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; - - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } - } + let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; - /// See [`Self::subtraction`]. - #[inline] - pub fn difference(&self, subtrahend: &Term) -> Result { - self.subtraction(subtrahend) + Ok(Term::Automaton(return_automaton)) } - /// Returns the Details of the given term. + /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. /// /// # Example: /// /// ``` - /// use regexsolver::{Term, cardinality::Cardinality}; + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("abc").unwrap(); /// - /// let term = Term::from_regex("(abc|de)").unwrap(); + /// let repeat = term.repeat(1, None).unwrap(); + /// + /// if let Term::RegularExpression(regex) = repeat { + /// assert_eq!("(abc)+", regex.to_string()); + /// } /// - /// let details = term.get_details().unwrap(); + /// let repeat = term.repeat(3, Some(5)).unwrap(); /// - /// assert_eq!(Some(Cardinality::Integer(2)), *details.get_cardinality()); - /// assert_eq!((Some(2), Some(3)), *details.get_length()); - /// assert!(!details.is_empty()); - /// assert!(!details.is_total()); + /// if let Term::RegularExpression(regex) = repeat { + /// assert_eq!("(abc){3,5}", regex.to_string()); + /// } /// ``` - pub fn get_details(&self) -> Result { + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { match self { - Term::RegularExpression(regex) => Ok(Details { - cardinality: Some(regex.get_cardinality()), - length: regex.get_length(), - empty: regex.is_empty(), - total: regex.is_total(), - }), - Term::Automaton(automaton) => Ok(Details { - cardinality: automaton.get_cardinality(), - length: automaton.get_length(), - empty: automaton.is_empty(), - total: automaton.is_total(), - }), + Term::RegularExpression(regular_expression) => Ok(Term::RegularExpression( + regular_expression.repeat(min, max_opt), + )), + Term::Automaton(fast_automaton) => { + let repeat_automaton = fast_automaton.repeat(min, max_opt)?; + Ok(Term::Automaton(repeat_automaton)) + } } } - /// Generate strings matched by the given term. + /// Generates `count` strings matched by the term. 
/// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex("(abc|de){2}").unwrap(); + /// let term = Term::from_pattern("(abc|de){2}").unwrap(); /// /// let strings = term.generate_strings(3).unwrap(); /// /// assert_eq!(3, strings.len()); // ex: ["deabc", "dede", "abcde"] /// ``` pub fn generate_strings(&self, count: usize) -> Result, EngineError> { - Ok(self - .get_automaton()? - .generate_strings(count)? - .into_iter() - .collect()) + self.to_automaton()?.generate_strings(count) } - /// Compute if the two given terms are equivalent. + /// Returns `true` if both terms accept the same language. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("(abc|de)*").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.are_equivalent(&term2).unwrap()); + /// assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn are_equivalent(&self, that: &Term) -> Result { - if self == that { + pub fn equivalent(&self, term: &Term) -> Result { + if self == term { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; - automaton_1.is_equivalent_of(&automaton_2) + let automaton_1 = self.to_automaton()?; + let automaton_2 = term.to_automaton()?; + automaton_1.equivalent(&automaton_2) } - /// Compute if the first term is a subset of the second one. + /// Returns `true` if all strings matched by the current term are also matched by the given term. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("de").unwrap(); - /// let term2 = Term::from_regex("(abc|de)").unwrap(); + /// let term1 = Term::from_pattern("de").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)").unwrap(); /// - /// assert!(term1.is_subset_of(&term2).unwrap()); + /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn is_subset_of(&self, that: &Term) -> Result { - if self == that { + pub fn subset(&self, term: &Term) -> Result { + if self == term { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; - automaton_1.is_subset_of(&automaton_2) + let automaton_1 = self.to_automaton()?; + let automaton_2 = term.to_automaton()?; + automaton_1.subset(&automaton_2) } - fn check_number_of_terms(terms: &[Term]) -> Result<(), EngineError> { - let number_of_terms = terms.len() + 1; - let max_number_of_terms = ThreadLocalParams::get_max_number_of_terms(); - if number_of_terms > max_number_of_terms { - Err(EngineError::TooMuchTerms( - max_number_of_terms, - number_of_terms, - )) - } else { - Ok(()) + /// Checks if the term matches the empty language. + pub fn is_empty(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty(), } } - fn determinize_subtrahend<'a>( - minuend: &FastAutomaton, - subtrahend: &'a FastAutomaton, - ) -> Result, EngineError> { - if subtrahend.is_determinitic() { - Ok(Cow::Borrowed(subtrahend)) - } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?)) - } else { - Ok(Cow::Owned(subtrahend.determinize()?)) + /// Checks if the term matches all possible strings. 
+ pub fn is_total(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_total(), + Term::Automaton(fast_automaton) => fast_automaton.is_total(), } } - fn get_automaton(&self) -> Result, EngineError> { + /// Checks if the term matches only the empty string `""`. + pub fn is_empty_string(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty_string(), + } + } + + /// Returns the minimum and maximum length of matched strings. + pub fn get_length(&self) -> (Option, Option) { + match self { + Term::RegularExpression(regex) => regex.get_length(), + Term::Automaton(automaton) => automaton.get_length(), + } + } + + /// Returns the cardinality of the term (i.e., the number of possible matched strings). + pub fn get_cardinality(&self) -> Result, EngineError> { + match self { + Term::RegularExpression(regex) => Ok(regex.get_cardinality()), + Term::Automaton(automaton) => Ok(if !automaton.is_deterministic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }), + } + } + + /// Converts the term to a [`FastAutomaton`]. + pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), Term::Automaton(automaton) => Cow::Borrowed(automaton), }) } -} -/// Represents details about a [Term]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", rename = "details"))] -pub struct Details { - cardinality: Option>, - length: (Option, Option), - empty: bool, - total: bool, -} + /// Converts the term to a [`RegularExpression`]. + pub fn to_regex(&self) -> Cow { + match self { + Term::RegularExpression(regex) => Cow::Borrowed(regex), + Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()), + } + } -impl Details { - /// Return the number of unique strings matched. - pub fn get_cardinality(&self) -> &Option> { - &self.cardinality + /// Converts the term to a regular expression pattern. + pub fn to_pattern(&self) -> String { + self.to_regex().to_string() } - /// Return the minimum and the maximum length of matched strings. - pub fn get_length(&self) -> &(Option, Option) { - &self.length + fn determinize_subtrahend<'a>( + minuend: &FastAutomaton, + subtrahend: &'a FastAutomaton, + ) -> Result, EngineError> { + if subtrahend.is_deterministic() { + Ok(Cow::Borrowed(subtrahend)) + } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { + Ok(Cow::Owned( + minuend + .intersection(subtrahend)? + .determinize()? + .into_owned(), + )) + } else { + Ok(subtrahend.determinize()?) + } } - /// Return `true` if it does not match any string. - pub fn is_empty(&self) -> bool { - self.empty + fn get_automata<'a>( + &'a self, + terms: &'a [Term], + parallel: bool, + ) -> Result>, EngineError> { + let mut automaton_list = Vec::with_capacity(terms.len() + 1); + automaton_list.push(self.to_automaton()?); + + let mut terms_automata = if parallel { + let execution_profile = ExecutionProfile::get(); + terms + .par_iter() + .map(|a| execution_profile.apply(|| a.to_automaton())) + .collect::, _>>() + } else { + terms + .iter() + .map(Term::to_automaton) + .collect::, _>>() + }?; + automaton_list.append(&mut terms_automata); + + Ok(automaton_list) } - /// Return `true` if it match all possible strings. 
- pub fn is_total(&self) -> bool { - self.total + fn get_regexes<'a>(&'a self, terms: &'a [Term]) -> Option>> { + let mut regex_list = Vec::with_capacity(terms.len() + 1); + regex_list.push(self.to_regex()); + + let mut terms_regexes = terms + .iter() + .map(Term::to_regex) + .collect::>(); + regex_list.append(&mut terms_regexes); + + Some(regex_list) } } @@ -360,43 +542,41 @@ mod tests { use super::*; #[test] - fn test_details() -> Result<(), String> { - let regex1 = Term::from_regex("a").unwrap(); - let regex2 = Term::from_regex("b").unwrap(); + fn test_intersection() -> Result<(), String> { + let regex1 = Term::from_pattern("a").unwrap(); + let regex2 = Term::from_pattern("b").unwrap(); - let details = regex1.intersection(&vec![regex2]); - assert!(details.is_ok()); + let intersection = regex1.intersection(&vec![regex2]).unwrap(); + assert!(intersection.is_empty()); + assert_eq!("[]", intersection.to_pattern()); Ok(()) } #[test] - fn test_subtraction_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("").unwrap(); + fn test_difference_1() -> Result<(), String> { + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("").unwrap(); - let result = regex1.subtraction(®ex2); + let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!( - Term::RegularExpression(RegularExpression::new("a+").unwrap()), - result - ); + let result = result.unwrap().to_pattern(); + assert_eq!("a+", result); Ok(()) } #[test] - fn test_subtraction_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + fn test_difference_2() -> Result<(), String> { + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); - let result = regex1.subtraction(®ex2); + let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_regex().into_owned(); assert_eq!( - Term::RegularExpression(RegularExpression::new("(xxx)*(x|xx)").unwrap()), - result + Term::RegularExpression(RegularExpression::new("x(x{3})*x?").unwrap()), + Term::RegularExpression(result) ); Ok(()) @@ -404,40 +584,26 @@ mod tests { #[test] fn test_intersection_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("b*").unwrap(); + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("b*").unwrap(); let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!(Term::from_regex("").unwrap(), result); + let result = result.unwrap().to_pattern(); + assert_eq!("", result); Ok(()) } #[test] fn test_intersection_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!( - Term::RegularExpression(RegularExpression::new("(x{3})*").unwrap()), - result - ); - - Ok(()) - } - - #[test] - fn test__() -> Result<(), String> { - let term = Term::from_regex("(abc|de){2}").unwrap(); - - let strings = term.generate_strings(3).unwrap(); - - println!("strings={:?}", strings); + let result = result.unwrap().to_pattern(); + assert_eq!("(x{3})*", result); 
Ok(()) } diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 4213e3f..34aa401 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -3,7 +3,7 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { - pub fn get_common_affixes( + pub(crate) fn get_common_affixes( &self, other: &RegularExpression, ) -> ( @@ -21,7 +21,7 @@ impl RegularExpression { (common_prefix, (self_regex, other_regex), common_suffix) } - pub fn get_common_affix( + pub(crate) fn get_common_affix( &self, other: &RegularExpression, is_prefix: bool, @@ -285,6 +285,7 @@ mod tests { assert_regex_affix(true, "(ab|cd)x", "(ab|cd)y", "(ab|cd)", "x", "y"); assert_regex_affix(true, "a+", "a+b", "a+", "", "b"); + assert_regex_affix(true, "(ab|cd)", "(ab|cd)", "(ab|cd)", "", ""); Ok(()) } diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index ae08148..f5d1975 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -6,6 +6,7 @@ mod affixes; mod number_of_states; impl RegularExpression { + /// Returns the minimum and maximum length of possible matched strings. pub fn get_length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { @@ -84,6 +85,7 @@ impl RegularExpression { } } + /// Returns the cardinality of the regular expression (i.e., the number of possible matched strings). pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); @@ -225,12 +227,12 @@ mod tests { let mut automaton = regex.to_automaton().unwrap(); if !automaton.is_cyclic() { - automaton = automaton.determinize().unwrap(); + automaton = automaton.determinize().unwrap().into_owned(); } //automaton.to_dot(); - let expected = automaton.get_cardinality().unwrap(); + let expected = automaton.get_cardinality(); assert_eq!(expected, cardinality); } diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index 90c1897..8325456 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -9,7 +9,7 @@ struct AbstractStateMetadata { } impl AbstractStateMetadata { - pub fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { + pub(crate) fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { AbstractStateMetadata { has_incoming_edges, has_outgoing_edges, @@ -25,7 +25,7 @@ struct AbstractNFAMetadata { } impl AbstractNFAMetadata { - pub fn new() -> Self { + pub(crate) fn new() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, true), accepted: vec![AbstractStateMetadata::new(true, false)], @@ -33,7 +33,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty_string() -> Self { + pub(crate) fn new_empty_string() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![AbstractStateMetadata::new(false, false)], @@ -41,7 +41,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty() -> Self { + pub(crate) fn new_empty() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![], @@ -49,7 +49,7 @@ impl AbstractNFAMetadata { } } - pub fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { let start_state_and_accept_states_not_mergeable = nfa.start.has_incoming_edges && self.accepted.iter().any(|s| s.has_outgoing_edges); @@ -68,7 +68,7 @@ impl AbstractNFAMetadata { } } - pub fn repeat(&self, min: u32, max_opt: &Option) -> Self { + pub(crate) fn repeat(&self, 
min: u32, max_opt: &Option) -> Self { let start_state_not_mergeable = self.start.has_incoming_edges; let accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); let start_state_or_accept_states_not_mergeable = @@ -129,7 +129,7 @@ impl AbstractNFAMetadata { } } - pub fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { let self_start_state_not_mergeable = self.start.has_incoming_edges; let self_accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); @@ -162,7 +162,7 @@ impl AbstractNFAMetadata { } impl RegularExpression { - pub fn get_number_of_states_in_nfa(&self) -> usize { + pub(crate) fn get_number_of_states_in_nfa(&self) -> usize { self.evaluate_number_of_states_in_nfa().number_of_states } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index e8a354f..727bcfe 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,19 +11,25 @@ lazy_static! { } impl RegularExpression { - pub fn new(regex: &str) -> Result { - if regex.is_empty() { + /// Parses and simplifies the provided pattern and returns the resulting [`RegularExpression`]. + pub fn new(pattern: &str) -> Result { + Self::parse(pattern, true) + } + + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If `simplify` is `true`, the expression is simplified during parsing. + pub fn parse(pattern: &str, simplify: bool) -> Result { + if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); } - if regex == "[]" { + if pattern == "[]" { return Ok(RegularExpression::new_empty()); } match ParserBuilder::new() .dot_matches_new_line(true) .build() - .parse(&Self::remove_flags(regex)) + .parse(&Self::remove_flags(pattern)) { - Ok(hir) => Self::convert_to_regex(&hir), + Ok(hir) => Self::convert_to_regex(&hir, simplify), Err(err) => Err(EngineError::RegexSyntaxError(err.to_string())), } } @@ -32,23 +38,26 @@ impl RegularExpression { RE_FLAG_DETECTION.replace_all(regex, "").to_string() } + /// Creates a regular expression that matches all possible strings. pub fn new_total() -> Self { RegularExpression::Repetition( - Box::new(RegularExpression::Character(Range::total())), + Box::new(RegularExpression::Character(CharRange::total())), 0, None, ) } + /// Creates a regular expression that matches the empty language. pub fn new_empty() -> Self { - RegularExpression::Character(Range::empty()) + RegularExpression::Character(CharRange::empty()) } + /// Creates a regular expression that matches only the empty string `""`. 
pub fn new_empty_string() -> Self { RegularExpression::Concat(VecDeque::new()) } - fn convert_to_regex(hir: &Hir) -> Result { + fn convert_to_regex(hir: &Hir, simplify: bool) -> Result { match hir.kind() { HirKind::Empty => Ok(RegularExpression::new_empty_string()), HirKind::Literal(literal) => { @@ -56,7 +65,7 @@ impl RegularExpression { if let Ok(string) = String::from_utf8(literal.0.clone().into_vec()) { for char in string.chars() { regex_concat = regex_concat.concat( - &RegularExpression::Character(Range::new_from_range( + &RegularExpression::Character(CharRange::new_from_range( Char::new(char)..=Char::new(char), )), true, @@ -80,15 +89,26 @@ impl RegularExpression { HirKind::Look(_) => Ok(RegularExpression::new_empty_string()), HirKind::Repetition(repetition) => { let (min, max) = (repetition.min, repetition.max); - Self::convert_to_regex(&repetition.sub).map(|v| v.repeat(min, max)) + let regex = Self::convert_to_regex(&repetition.sub, simplify)?; + Ok(if simplify { + regex.repeat(min, max) + } else { + RegularExpression::Repetition(Box::new(regex), min, max) + }) } - HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub), + HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub, simplify), HirKind::Concat(concat) => { let mut concat_regex = RegularExpression::Concat(VecDeque::with_capacity(concat.len())); for c in concat { - let concat_value = Self::convert_to_regex(c)?; - concat_regex = concat_regex.concat(&concat_value, true); + let concat_value = Self::convert_to_regex(c, simplify)?; + if simplify { + concat_regex = concat_regex.concat(&concat_value, true); + } else if let RegularExpression::Concat(values) = concat_regex { + let mut values = values.clone(); + values.push_back(concat_value); + concat_regex = RegularExpression::Concat(values); + } } Ok(concat_regex) } @@ -96,32 +116,38 @@ impl RegularExpression { let mut alternation_regex = RegularExpression::Alternation(Vec::with_capacity(alternation.len())); for a in alternation { - let alternation_value = Self::convert_to_regex(a)?; - alternation_regex = alternation_regex.union(&alternation_value); + let alternation_value = Self::convert_to_regex(a, simplify)?; + if simplify { + alternation_regex = alternation_regex.union(&alternation_value); + } else if let RegularExpression::Alternation(values) = alternation_regex { + let mut values = values.clone(); + values.push(alternation_value); + alternation_regex = RegularExpression::Alternation(values); + } } Ok(alternation_regex) } } } - fn to_range_unicode(class_unicode: &ClassUnicode) -> Range { + fn to_range_unicode(class_unicode: &ClassUnicode) -> CharRange { let mut new_range = Vec::with_capacity(class_unicode.ranges().len()); for range in class_unicode.ranges() { new_range.push(AnyRange::from( Char::new(range.start())..=Char::new(range.end()), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } - fn to_range_bytes(class_bytes: &ClassBytes) -> Range { + fn to_range_bytes(class_bytes: &ClassBytes) -> CharRange { let mut new_range = Vec::with_capacity(class_bytes.ranges().len()); for range in class_bytes.ranges() { new_range.push(AnyRange::from( Char::new(range.start() as char)..=Char::new(range.end() as char), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } } @@ -246,22 +272,22 @@ mod tests { let regex_parsed = RegularExpression::new(".").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("\t")); - 
assert!(automaton.match_string("\n")); - assert!(automaton.match_string("\r")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("\t")); + assert!(automaton.is_match("\n")); + assert!(automaton.is_match("\r")); let regex_parsed = RegularExpression::new("(?i)a").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("A")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("A")); let regex_parsed = RegularExpression::new("a(?i)a(?-s).").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("aa\n")); - assert!(!automaton.match_string("aAb")); + assert!(automaton.is_match("aa\n")); + assert!(!automaton.is_match("aAb")); assert!(RegularExpression::new("\\1").is_err()); Ok(()) diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 176612f..a7682f6 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,7 +1,6 @@ use std::{cmp, collections::VecDeque, fmt::Display}; -use crate::Range; -use execution_profile::ThreadLocalParams; +use crate::execution_profile::ExecutionProfile; use regex_charclass::CharacterClass; use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind}; @@ -12,13 +11,13 @@ use super::*; mod analyze; mod builder; mod operation; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] mod serializer; /// Represent a regular expression. #[derive(Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] pub enum RegularExpression { - Character(Range), + Character(CharRange), Repetition(Box, u32, Option), Concat(VecDeque), Alternation(Vec), @@ -44,21 +43,21 @@ impl Display for RegularExpression { multiplicator_part = String::from("?"); } else if let Some(max) = max_opt { if max == min { - multiplicator_part = format!("{{{}}}", max); + multiplicator_part = format!("{{{max}}}"); } else { - multiplicator_part = format!("{{{},{}}}", min, max); + multiplicator_part = format!("{{{min},{max}}}"); } } else { - multiplicator_part = format!("{{{},}}", min); + multiplicator_part = format!("{{{min},}}"); } match **regular_expression { RegularExpression::Repetition(_, _, _) => { - format!("({}){}", regex_part, multiplicator_part) + format!("({regex_part}){multiplicator_part}") } RegularExpression::Concat(_) => { - format!("({}){}", regex_part, multiplicator_part) + format!("({regex_part}){multiplicator_part}") } - _ => format!("{}{}", regex_part, multiplicator_part), + _ => format!("{regex_part}{multiplicator_part}"), } } RegularExpression::Concat(concat) => { @@ -82,15 +81,16 @@ impl Display for RegularExpression { if alternation.len() == 1 { sb } else { - format!("({})", sb) + format!("({sb})") } } }; - write!(f, "{}", str) + write!(f, "{str}") } } impl RegularExpression { + /// Checks if the regular expression matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -99,6 +99,7 @@ impl RegularExpression { } } + /// Checks if the regular expression only matches the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -106,6 +107,7 @@ impl RegularExpression { } } + /// Checks if the regular expression matches all possible strings. 
pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { @@ -122,15 +124,15 @@ impl RegularExpression { } } + /// Converts the regular expression to an equivalent [`FastAutomaton`]. pub fn to_automaton(&self) -> Result { - if self.get_number_of_states_in_nfa() >= ThreadLocalParams::get_max_number_of_states() { - return Err(EngineError::AutomatonHasTooManyStates); - } + ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; + match self { - RegularExpression::Character(range) => FastAutomaton::make_from_range(range), + RegularExpression::Character(range) => Ok(FastAutomaton::new_from_range(range)), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; - automaton.repeat(*min, *max_opt)?; + automaton.repeat_mut(*min, *max_opt)?; Ok(automaton) } RegularExpression::Concat(concat) => { @@ -138,15 +140,106 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::concatenate(concats) + FastAutomaton::concat_all(&concats) } RegularExpression::Alternation(alternation) => { - let mut concats = Vec::with_capacity(alternation.len()); + let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { - concats.push(c.to_automaton()?); + alternates.push(c.to_automaton()?); + } + FastAutomaton::union_all(&alternates) + } + } + } + + /// Returns a heuristic score for the readability of the pattern. + pub fn evaluate_complexity(&self) -> f64 { + let (score, depth, _) = self.eval_inner(); + score + Self::depth_penalty(depth) + } + + /// Returns: (score, max_depth, contains_repetition) + fn eval_inner(&self) -> (f64, usize, bool) { + match self { + RegularExpression::Character(range) => { + let len = range.to_regex().len() as f64; + // small, capped cost for raw length + let base = 1.0 + 0.05 * len.min(40.0); + (base, 1, false) + } + + RegularExpression::Repetition(inner, min, max_opt) => { + let (inner_score, inner_depth, inner_has_rep) = inner.eval_inner(); + + // multipliers tuned for readability impact + let mut m = match max_opt { + None => 1.6, + Some(max) if max > min => 1.3, + Some(max) if max == min && *min > 1 => 1.1, + _ => 1.0, + }; + + // nested quantifiers like (...+)+ are harder + if inner_has_rep { + m *= 1.5; } - FastAutomaton::alternation(concats) + + (inner_score * m, inner_depth + 1, true) } + + RegularExpression::Concat(items) => { + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for (i, it) in items.iter().enumerate() { + let (s, d, h) = it.eval_inner(); + sum += s; + if i > 0 { + // tiny discount: linear sequences are relatively easy to read + sum *= 0.98; + } + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + (sum, max_depth + 1, has_rep) + } + + RegularExpression::Alternation(branches) => { + if branches.is_empty() { + return (0.0, 1, false); + } + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for b in branches { + let (s, d, h) = b.eval_inner(); + sum += s; + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + // branching cost: more alternatives = harder to scan + let k = branches.len() as f64; + let multiplier = 1.0 + 0.15 * (k - 1.0); + + (sum * multiplier, max_depth + 1, has_rep) + } + } + } + + fn depth_penalty(depth: usize) -> f64 { + // no penalty up to depth 2, then quadratic growth + if depth <= 2 { + 0.0 + } else { + ((depth - 2) as f64).powi(2) * 
0.8 } } } diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 6907d9b..9cefb01 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,6 +1,20 @@ use super::*; impl RegularExpression { + /// Returns a regular expression that is the concatenation of all expressions in `patterns`. + pub fn concat_all<'a, I: IntoIterator>( + patterns: I, + ) -> RegularExpression { + let mut result = RegularExpression::new_empty_string(); + + for other in patterns { + result = result.concat(other, true); + } + + result + } + + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); @@ -10,35 +24,19 @@ impl RegularExpression { return self.clone(); } - match (self, other) { + let (front, back) = if append_back { + (self, other) + } else { + (other, self) + }; + + match (front, back) { (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_concat_and_concat(self, other) - } else { - Self::opconcat_concat_and_concat(other, self) - } - } - (RegularExpression::Concat(_), _) => { - if append_back { - Self::opconcat_concat_and_other(self, other) - } else { - Self::opconcat_other_and_concat(other, self) - } - } - (_, RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_other_and_concat(self, other) - } else { - Self::opconcat_concat_and_other(other, self) - } - } - (_, _) => { - if append_back { - Self::opconcat_other_and_other(self, other) - } else { - Self::opconcat_other_and_other(other, self) - } + Self::opconcat_concat_and_concat(front, back) } + (RegularExpression::Concat(_), _) => Self::opconcat_concat_and_other(front, back), + (_, RegularExpression::Concat(_)) => Self::opconcat_other_and_concat(front, back), + (_, _) => Self::opconcat_other_and_other(front, back), } } @@ -178,17 +176,9 @@ impl RegularExpression { } else { None }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + Some(this_regex.repeat(new_min, new_max_opt)) } else { - Some(RegularExpression::Repetition( - Box::new(this.clone()), - 2, - Some(2), - )) + Some(this.repeat(2, Some(2))) } } else if let ( RegularExpression::Repetition(this_regex, this_min, this_max_opt), @@ -203,11 +193,8 @@ impl RegularExpression { } else { None }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + + Some(this_regex.repeat(new_min, new_max_opt)) } else if let ( RegularExpression::Character(this_range), RegularExpression::Character(that_range), @@ -226,11 +213,7 @@ impl RegularExpression { if **this_regex == *that { let new_min = this_min + 1; let new_max_opt = this_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + Some(this_regex.repeat(new_min, new_max_opt)) } else { None } @@ -238,11 +221,7 @@ impl RegularExpression { if **that_regex == *this { let new_min = that_min + 1; let new_max_opt = that_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - that_regex.clone(), - new_min, - new_max_opt, - )) + Some(that_regex.repeat(new_min, new_max_opt)) } else { None } diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 2baa587..7364d65 100644 --- a/src/regex/operation/mod.rs +++ 
b/src/regex/operation/mod.rs @@ -1,236 +1,6 @@ use super::*; mod concat; +mod repeat; mod simplify; -mod union; - -impl RegularExpression { - pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { - if self.is_total() { - return RegularExpression::new_total(); - } else if self.is_empty() { - return RegularExpression::new_empty(); - } else if self.is_empty_string() { - return Self::new_empty_string(); - } else if let Some(max) = max_opt { - if max < min || max == 0 { - return RegularExpression::new_empty_string(); - } else if min == 1 && max == 1 { - return self.clone(); - } - } - - match self { - RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { - let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { - Some(max * o_max) - } else { - None - }; - - let o_min = *o_min; - if let Some(o_max) = o_max_opt { - let o_max = *o_max; - if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } else if o_min == o_max && o_min > 1 { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } else { - let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); - if r > cmp::max(2, min) as f64 { - return RegularExpression::Repetition( - Box::new(self.clone()), - min, - max_opt, - ); - } - - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } - } else if o_max_opt.is_none() - || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) - || o_max_opt.is_some() && o_max_opt.unwrap() == 1 - || max_opt.is_none() && o_min == 0 - { - RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) - } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } - } - _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), - } - } -} - -#[cfg(test)] -mod tests { - use regex_charclass::{char::Char, irange::RangeSet}; - - use crate::regex::RegularExpression; - - #[test] - fn test_parse_and_simplify() -> Result<(), String> { - assert_parse_and_simplify("(xxx)*", "(x{3})*"); - assert_parse_and_simplify("(x*){3}", "x*"); - assert_parse_and_simplify("(x+)?", "x*"); - assert_parse_and_simplify("(x?)+", "x*"); - assert_parse_and_simplify("(x{0,3})+", "x*"); - assert_parse_and_simplify("(x{2,3})+", "x{2,}"); - assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); - assert_parse_and_simplify("(x+)*", "x*"); - assert_parse_and_simplify(".*abc", ".*abc"); - assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); - assert_parse_and_simplify( - "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", - "a(bc(dg|fe)|mkv)*(abc){4,5}", - ); - assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); - assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); - assert_parse_and_simplify("(a|b)", "[ab]"); - assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); - assert_parse_and_simplify("(ab|ab)", "ab"); - assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); - assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); - assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); - assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); - assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); - assert_parse_and_simplify("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}"); - Ok(()) - } - - 
fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { - let regex_parsed = RegularExpression::new(regex).unwrap(); - assert_eq!(regex_simplified, regex_parsed.to_string()); - } - - #[test] - fn test_repeat_simplify() -> Result<(), String> { - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 3, - Some(3), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 2, - Some(4), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(3), - 0, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 1, - Some(2), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(4), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 7, - Some(8), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 0, - None, - 3, - Some(3), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 1, - None, - 0, - Some(1), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(1), - 1, - None, - ); - - Ok(()) - } - - fn assert_repeat_simplify( - range: &RangeSet, - min1: u32, - max1: Option, - min2: u32, - max2: Option, - ) { - let repeat = RegularExpression::Repetition( - Box::new(RegularExpression::Repetition( - Box::new(RegularExpression::Character(range.clone())), - min1, - max1, - )), - min2, - max2, - ); - - let got = RegularExpression::new(&repeat.to_string()).unwrap(); - - println!("{} -> {}", repeat, got); - - let repeat = repeat.to_automaton().unwrap(); - - //repeat.to_dot(); - - let result = got.to_automaton().unwrap(); - - assert!(repeat.is_equivalent_of(&result).unwrap()); - } -} +mod union; \ No newline at end of file diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs new file mode 100644 index 0000000..235f4f1 --- /dev/null +++ b/src/regex/operation/repeat.rs @@ -0,0 +1,253 @@ +use super::*; + +impl RegularExpression { + /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
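The `repeat` implementation that this doc comment describes follows immediately below. Its first half is a set of early returns for degenerate bounds: a total or empty language is returned as-is, `{1,1}` is the identity, and an empty range (`max < min`, or `max == 0`) collapses to the empty string. A hypothetical usage sketch, assuming `RegularExpression::new` is publicly reachable and the `{min,max}` printed form behaves as in the unit tests elsewhere in this diff:

```rust
// Sketch only: the import path follows the integration tests in this diff, and the
// exact printed form in the last assertion is an assumption.
use regexsolver::regex::RegularExpression;

fn main() {
    let abc = RegularExpression::new("abc").unwrap();

    // {1,1} is the identity: the expression comes back unchanged.
    assert_eq!(abc.repeat(1, Some(1)), abc);

    // max < min (or max == 0) is an empty range; both collapse to the empty-string expression.
    assert_eq!(abc.repeat(3, Some(2)), abc.repeat(0, Some(0)));

    // Ordinary bounds wrap the expression in a repetition node.
    assert_eq!(abc.repeat(2, Some(4)).to_string(), "(abc){2,4}");
}
```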
+ pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { + if self.is_total() { + return RegularExpression::new_total(); + } else if self.is_empty() { + return RegularExpression::new_empty(); + } else if self.is_empty_string() { + return Self::new_empty_string(); + } else if let Some(max) = max_opt { + if max < min || max == 0 { + return RegularExpression::new_empty_string(); + } else if min == 1 && max == 1 { + return self.clone(); + } + } + + match self { + RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { + let new_max = if let (Some(o_max), Some(i_max)) = (max_opt, i_max_opt) { + Some(o_max * i_max) + } else { + None + }; + + if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, min, max_opt) { + RegularExpression::Repetition( + regular_expression.clone(), + min * i_min, + new_max, + ) + } else { + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + } + } + _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), + } + } + + /// Evaluate if the repetition `(r{i_min,i_max_opt}){o_min,o_max_opt}` can be simplified to `r{i_min*o_min,i_max_opt*o_max_opt}`. + fn can_simplify_nested_repetition( + i_min: u32, + i_max_opt: Option, + o_min: u32, + o_max_opt: Option, + ) -> bool { + if let Some(o_max) = o_max_opt { + if o_min == o_max { + return true; + } + } + + if let Some(i_max) = i_max_opt { + // We check if there is any gap by resolving: + // o_min * i_max >= (o_min + 1) * i_min - 1 + // <=> o_min * (i_max - i_min) >= i_min - 1 + o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1) + } else if o_min > 0 { + true + } else { + i_min <= 1 + } + } +} + +#[cfg(test)] +mod tests { + + use regex_charclass::char::Char; + + use crate::{CharRange, regex::RegularExpression}; + + #[test] + fn test_parse_and_simplify() -> Result<(), String> { + assert_parse_and_simplify("(xxx)*", "(x{3})*"); + assert_parse_and_simplify("(x*){3}", "x*"); + assert_parse_and_simplify("(x+)?", "x*"); + assert_parse_and_simplify("(x?)+", "x*"); + assert_parse_and_simplify("(x{0,3})+", "x*"); + assert_parse_and_simplify("(x{2,3})+", "x{2,}"); + assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); + assert_parse_and_simplify("(x+)*", "x*"); + assert_parse_and_simplify(".*abc", ".*abc"); + assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); + assert_parse_and_simplify( + "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", + "a(bc(dg|fe)|mkv)*(abc){4,5}", + ); + assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); + assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); + assert_parse_and_simplify("(a|b)", "[ab]"); + assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); + assert_parse_and_simplify("(ab|ab)", "ab"); + assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); + assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); + assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); + assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); + assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); + assert_parse_and_simplify( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", + ); + + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); + Ok(()) + } + + fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { + let regex_parsed = RegularExpression::new(regex).unwrap(); + assert_eq!(regex_simplified, regex_parsed.to_string()); + } 
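The new `can_simplify_nested_repetition` check above replaces the old ratio heuristic. The idea: with `k` outer copies of `r{i_min,i_max}`, the reachable repetition counts of `r` form the interval `[k*i_min, k*i_max]`; the nested form collapses to `r{o_min*i_min, o_max*i_max}` exactly when consecutive intervals leave no gap, and it suffices to test the smallest outer count, where the gap is widest. That is the inequality in the comment, `o_min * i_max >= (o_min + 1) * i_min - 1`. The `o_min == o_max` case is always safe (a fixed number of copies sums to one contiguous interval), and an unbounded inner repetition collapses whenever the outer minimum is positive (for an outer `*`, only when `i_min <= 1`, since e.g. `(r{2,})*` matches zero copies but not a single `r`). The following standalone copy of the predicate runs the cases from the test suite above:

```rust
// Standalone copy of the predicate above, with worked cases taken from the tests in this file.
fn can_simplify(i_min: u32, i_max_opt: Option<u32>, o_min: u32, o_max_opt: Option<u32>) -> bool {
    if let Some(o_max) = o_max_opt {
        if o_min == o_max {
            return true; // exact outer count: the sums always form one contiguous interval
        }
    }
    if let Some(i_max) = i_max_opt {
        // No gap between o_min and o_min + 1 outer copies:
        // o_min * i_max >= (o_min + 1) * i_min - 1  <=>  o_min * (i_max - i_min) >= i_min - 1
        o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1)
    } else if o_min > 0 {
        true // (r{i_min,}){o_min,...} reaches every count >= o_min * i_min
    } else {
        i_min <= 1 // (r{2,})* matches zero copies but not one, so it cannot become r*
    }
}

fn main() {
    assert!(can_simplify(2, Some(3), 1, None));    // (x{2,3})+     -> x{2,}
    assert!(!can_simplify(7, Some(9), 1, None));   // (x{7,9})+     stays nested: counts 10..=13 are unreachable
    assert!(can_simplify(2, Some(4), 2, Some(4))); // (a{2,4}){2,4} -> a{4,16}
    assert!(can_simplify(0, None, 3, Some(3)));    // (x*){3}       -> x*
}
```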
+ + #[test] + fn test_repeat_simplify() -> Result<(), String> { + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(3), + 0, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + Some(2), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(4), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 7, + Some(8), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + None, + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + None, + 0, + Some(1), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(1), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(4), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 2, + Some(2), + ); + + Ok(()) + } + + fn assert_repeat_simplify( + range: &CharRange, + min1: u32, + max1: Option, + min2: u32, + max2: Option, + ) { + let repeat = RegularExpression::Repetition( + Box::new(RegularExpression::Repetition( + Box::new(RegularExpression::Character(range.clone())), + min1, + max1, + )), + min2, + max2, + ); + + let got = RegularExpression::new(&repeat.to_string()).unwrap(); + + println!("{} -> {}", repeat, got); + + let repeat = repeat.to_automaton().unwrap(); + + //repeat.to_dot(); + + let result = got.to_automaton().unwrap(); + + assert!(repeat.equivalent(&result).unwrap()); + } +} diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index ae87087..5156ce8 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns a simplified version by eliminating redundant constructs and applying canonical reductions. pub fn simplify(&self) -> Self { match self { RegularExpression::Character(_) => self.clone(), diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8f5c1ae..ee6abee 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -3,19 +3,41 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { + /// Returns a regular expression matching the union of `self` and `other`. pub fn union(&self, other: &RegularExpression) -> RegularExpression { + Self::union_all([self, other]) + } + + /// Returns a regular expression that is the union of all expressions in `patterns`. 
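The `union_all` implementation documented here appears in the next hunk. It folds the operands into an accumulator kept as a `Cow<RegularExpression>`, so the borrow-only outcomes of `union_` (for instance, when the accumulator is still empty and the result is simply the other operand) do not force a clone; it also breaks out of the loop as soon as the accumulated union `is_total()`, since `.*` absorbs every further operand, and only materializes the result once via `into_owned()`. A hypothetical usage sketch follows; the constructor and printed forms are assumptions carried over from the unit tests elsewhere in this diff:

```rust
// Hypothetical usage sketch; `RegularExpression::new` and the printed forms are
// assumptions based on the unit tests in this diff, not a verified example.
use regexsolver::regex::RegularExpression;

fn main() {
    let a = RegularExpression::new("abc").unwrap();
    let b = RegularExpression::new("de").unwrap();
    let c = RegularExpression::new(".*").unwrap();

    // Pairwise union, unchanged public behaviour.
    println!("{}", a.union(&b));

    // n-ary union: folds borrowed expressions without cloning the accumulator on
    // the borrow-only paths, and stops early once the union becomes total
    // (here `.*` makes everything after it irrelevant).
    println!("{}", RegularExpression::union_all([&a, &b, &c]));
}
```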
+ pub fn union_all<'a, I: IntoIterator>( + patterns: I, + ) -> RegularExpression { + let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); + + for other in patterns { + result = result.union_(other); + + if result.is_total() { + break; + } + } + + result.into_owned() + } + + fn union_<'a>(&self, other: &'a RegularExpression) -> Cow<'a, RegularExpression> { if self.is_total() || other.is_total() { - return RegularExpression::new_total(); + return Cow::Owned(RegularExpression::new_total()); } else if self.is_empty() { - return other.clone(); + return Cow::Borrowed(other); } else if other.is_empty() || self == other { - return self.clone(); + return Cow::Owned(self.clone()); } else if other.is_empty_string() { - return self.clone().repeat(0, Some(1)); + return Cow::Owned(self.repeat(0, Some(1))); } else if self.is_empty_string() { - return other.clone().repeat(0, Some(1)); + return Cow::Owned(other.repeat(0, Some(1))); } - match (self, other) { + Cow::Owned(match (self, other) { ( RegularExpression::Character(self_range), RegularExpression::Character(other_range), @@ -63,14 +85,14 @@ impl RegularExpression { Self::opunion_concat_and_alternation(other, self) } (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(_)) => { - let mut new_alternation = other.clone(); + let mut new_alternation = Cow::Borrowed(other); for self_element in self_elements { - new_alternation = new_alternation.union(self_element); + new_alternation = new_alternation.union_(self_element); } - new_alternation + new_alternation.into_owned() } - } + }) } fn opunion_character_and_repetition( @@ -83,21 +105,14 @@ impl RegularExpression { ) = (this_character, that_repetition) { if this_character == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { let mut alternate = vec![this_character.clone(), that_repetition.clone()]; alternate.sort_unstable(); RegularExpression::Alternation(alternate) } } else { - panic!( - "Not character and repetition {:?} {:?}", - this_character, that_repetition - ) + panic!("Not character and repetition {this_character:?} {that_repetition:?}") } } @@ -116,17 +131,17 @@ impl RegularExpression { if prefix.is_none() && suffix.is_none() { let mut alternate_elements = vec![self_regex, other_regex]; alternate_elements.sort_unstable(); - RegularExpression::Alternation(alternate_elements) + Cow::Owned(RegularExpression::Alternation(alternate_elements)) } else { - self_regex.union(&other_regex) + self_regex.union_(&other_regex) } } else { - RegularExpression::Repetition(Box::new(self_regex), 0, Some(1)) + Cow::Owned(self_regex.repeat(0, Some(1))) } } else if !other_regex.is_empty_string() { - RegularExpression::Repetition(Box::new(other_regex), 0, Some(1)) + Cow::Owned(other_regex.repeat(0, Some(1))) } else { - RegularExpression::new_empty_string() + Cow::Owned(RegularExpression::new_empty_string()) }; regex = regex.concat(®ex_from_alternate, true); @@ -202,11 +217,7 @@ impl RegularExpression { ) = (this_concat, that_repetition) { if this_concat == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { Self::opunion_common_affixes(this_concat, that_repetition) } @@ -262,18 +273,13 @@ impl RegularExpression { || this_max + 1 == *that_min || that_max + 1 == 
*this_min { - return RegularExpression::Repetition( - this_regex.clone(), + return this_regex.repeat( cmp::min(*this_min, *that_min), Some(cmp::max(*this_max, *that_max)), ); } } else { - return RegularExpression::Repetition( - this_regex.clone(), - cmp::min(*this_min, *that_min), - None, - ); + return this_regex.repeat(cmp::min(*this_min, *that_min), None); } } @@ -295,11 +301,7 @@ impl RegularExpression { ) = (this_repetition, that_alternation) { if that_alternation == &**this_regex && *this_min <= 2 { - RegularExpression::Repetition( - this_regex.clone(), - cmp::min(1, *this_min), - *this_max_opt, - ) + this_regex.repeat(cmp::min(1, *this_min), *this_max_opt) } else { let mut set = BTreeSet::new(); @@ -354,6 +356,11 @@ mod tests { #[test] fn test_union() -> Result<(), String> { assert_union("(a+|a+b)", "a+b?"); + assert_union("(a+|a*)", "a*"); + assert_union("(a?|a{0,2})", "a{0,2}"); + assert_union("(a{2,4}|a{1,3})", "a{1,4}"); + assert_union("(a{1,2}|a{3,4})", "a{1,4}"); + assert_union("(a{3,4}|a{1,2})", "a{1,4}"); Ok(()) } diff --git a/src/regex/serializer.rs b/src/regex/serializer.rs index 83fd99f..0832756 100644 --- a/src/regex/serializer.rs +++ b/src/regex/serializer.rs @@ -16,10 +16,7 @@ impl<'de> serde::Deserialize<'de> for RegularExpression { where D: Deserializer<'de>, { - let regex_string = match String::deserialize(deserializer) { - Ok(str) => str, - Err(err) => return Err(err), - }; + let regex_string = String::deserialize(deserializer)?; match RegularExpression::new(®ex_string) { Ok(regex) => Ok(regex), Err(err) => Err(de::Error::custom(err.to_string())), diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs deleted file mode 100644 index cb581e6..0000000 --- a/src/tokenizer/embed_regex.rs +++ /dev/null @@ -1,307 +0,0 @@ -use token::TokenError; - -use crate::regex::RegularExpression; - -use self::token::regex_token::RegexToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_embedding(&self, regex: &RegularExpression) -> Vec { - let mut vec = self.to_regex_embedding_vec(regex); - - Self::append_counter_if_necessary(&mut vec); - - vec - } - - fn append_counter_if_necessary(vec: &mut Vec) { - if let Some(last) = vec.last() { - match last { - RegexToken::RepetitionNone => {} - RegexToken::Repetition(_) => {} - RegexToken::EndGroup => {} - RegexToken::StartGroup => {} - RegexToken::Alternation => {} - RegexToken::Error => todo!(), - _ => { - vec.push(RegexToken::Repetition(1)); - } - }; - } - } - - fn to_regex_embedding_vec(&self, regex: &RegularExpression) -> Vec { - let mut vec = vec![]; - - match regex { - RegularExpression::Character(range) => { - self.range_tokenizer - .range_to_embedding(range) - .unwrap() - .into_iter() - .for_each(|t| vec.push(RegexToken::Range(t))); - } - RegularExpression::Repetition(regex, min, max_opt) => { - if matches!( - **regex, - RegularExpression::Repetition(_, _, _) | RegularExpression::Concat(_) - ) { - vec.push(RegexToken::StartGroup); - vec.extend(self.to_regex_embedding_vec(regex)); - vec.push(RegexToken::EndGroup); - } else { - vec.extend(self.to_regex_embedding_vec(regex)); - } - - vec.push(RegexToken::Repetition(*min as u16)); - - if let Some(max) = max_opt { - if max != min { - vec.push(RegexToken::Repetition(*max as u16)); - } - } else { - vec.push(RegexToken::RepetitionNone); - } - } - RegularExpression::Concat(elements) => { - for element in elements { - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - } - } - RegularExpression::Alternation(elements) => 
{ - vec.push(RegexToken::StartGroup); - - for i in 0..elements.len() { - let element = &elements[i]; - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - if i < elements.len() - 1 { - vec.push(RegexToken::Alternation); - } - } - - vec.push(RegexToken::EndGroup); - } - } - - vec - } - - pub fn from_regex_embedding( - &self, - vec: &[RegexToken], - ) -> Result { - let mut regex_groups = vec![(RegularExpression::new_empty_string(), false)]; - let mut current_range: Option = None; - let mut current_min = None; - for i in 0..vec.len() { - let token = vec[i]; - let current_group = regex_groups.len() - 1; - match token { - RegexToken::Range(range_token) => { - let range = self.range_tokenizer.token_to_range(&range_token).unwrap(); - if let Some(curr_range) = ¤t_range { - current_range = Some(curr_range.union(range)); - } else { - current_range = Some(range.clone()); - } - } - RegexToken::StartGroup => { - regex_groups.push((RegularExpression::new_empty_string(), false)); - } - RegexToken::EndGroup => { - if current_group == 0 { - return Err(TokenError::SyntaxError); - } - if i == vec.len() - 1 || !matches!(vec[i + 1], RegexToken::Repetition(_)) { - let alternation: bool = regex_groups[current_group].1; - Self::pop_regex_group(&mut regex_groups, &None, &None); - if alternation { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - } - } - RegexToken::Alternation => { - if regex_groups[current_group].1 { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - regex_groups.push((RegularExpression::new_empty_string(), true)); - } - RegexToken::RepetitionNone => { - if current_min.is_some() { - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - ¤t_min, - &None, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, ¤t_min, &None); - } - current_min = None; - } else { - return Err(TokenError::SyntaxError); - } - } - RegexToken::Repetition(count) => { - if current_min.is_some() - || i == vec.len() - 1 - || !matches!(vec[i + 1], RegexToken::Repetition(_)) - && !matches!(vec[i + 1], RegexToken::RepetitionNone) - { - let min; - let max; - if current_min.is_some() { - min = current_min; - max = Some(count as u32); - } else { - min = Some(count as u32); - max = Some(count as u32); - } - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - &min, - &max, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, &min, &max); - } - current_min = None; - } else { - current_min = Some(count as u32); - } - } - _ => return Err(TokenError::UnknownToken), - }; - } - - Ok(regex_groups[0].0.clone()) - } - - fn pop_regex_group( - regex_groups: &mut Vec<(RegularExpression, bool)>, - current_min: &Option, - current_max: &Option, - ) -> bool { - if regex_groups.len() <= 1 { - return false; - } - - let popped_group = regex_groups.pop().unwrap(); - Self::add_regex( - regex_groups, - current_min, - current_max, - &popped_group.0, - popped_group.1, - ); - true - } - - fn add_regex( - regex_groups: &mut [(RegularExpression, bool)], - current_min: &Option, - current_max: &Option, - regex: &RegularExpression, - alternation: bool, - ) { - let current_group = regex_groups.len() - 1; - let regex_to_use = if let Some(min) = current_min { - if min == &1 && current_max.is_some() { - if current_max.unwrap() == 1 { - regex.clone() - } else { - 
RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - regex.clone() - }; - - if alternation { - regex_groups[current_group].0 = regex_groups[current_group].0.union(®ex_to_use); - } else { - regex_groups[current_group].0 = - regex_groups[current_group].0.concat(®ex_to_use, true); - } - } -} - -#[cfg(test)] -mod tests { - use embed_regex::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(".*"); - assert_embedding_convertion("(a|b)"); - assert_embedding_convertion("(|a)"); - assert_embedding_convertion(".*ab"); - assert_embedding_convertion("[a-e]{3}"); - assert_embedding_convertion("[a-e]{3}efg"); - assert_embedding_convertion("toto"); - assert_embedding_convertion(".{2,3}"); - assert_embedding_convertion("q(abc?|ca)x"); - assert_embedding_convertion(".*q(abc?|ca)x"); - assert_embedding_convertion("(abc){3,6}"); - assert_embedding_convertion("((|a)abd+){3}"); - /*assert_embedding_convertion( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - );*/ - Ok(()) - } - - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{}", regex); - - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); - //automaton.to_dot(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_embedding(®ex); - - //println!("{:?}", embedding); - - // FAIR - let embedding_u16 = RegexToken::to_fair_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u16 - .iter() - .map(|&t| RegexToken::from_fair_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); - - // AI - let embedding_u8 = RegexToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); - } -} diff --git a/src/tokenizer/embed_regex_operations.rs b/src/tokenizer/embed_regex_operations.rs deleted file mode 100644 index 4dcb19f..0000000 --- a/src/tokenizer/embed_regex_operations.rs +++ /dev/null @@ -1,119 +0,0 @@ -use token::TokenError; - -use crate::regex::RegularExpression; - -use self::token::regex_operations_token::RegexOperationsToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_operations_embedding( - &self, - regex_operations: &[(bool, RegularExpression)], - ) -> Vec { - let mut vec = vec![]; - - for (not, regex) in regex_operations { - if !vec.is_empty() { - vec.push(RegexOperationsToken::And); - } - if *not { - vec.push(RegexOperationsToken::Not); - } - - vec.extend( - self.to_regex_embedding(regex) - .into_iter() - .map(RegexOperationsToken::RegexToken), - ); - } - - vec - } - - pub fn from_regex_operations_embedding( - &self, - vec: &[RegexOperationsToken], - ) -> Result, TokenError> { - let mut operations = vec![]; - let mut current_regex_not = false; - let mut current_regex_token = vec![]; - for token in vec { - match token { - RegexOperationsToken::RegexToken(regex_token) => { - current_regex_token.push(*regex_token) - } - RegexOperationsToken::And => { - let regex = self.from_regex_embedding(¤t_regex_token)?; - 
operations.push((current_regex_not, regex)); - current_regex_not = false; - current_regex_token.clear(); - } - RegexOperationsToken::Not => current_regex_not = true, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }; - } - - if !current_regex_token.is_empty() { - let regex = self.from_regex_embedding(¤t_regex_token)?; - operations.push((current_regex_not, regex)); - } - - Ok(operations) - } -} - -#[cfg(test)] -mod tests { - use embed_regex_operations::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(&[(false, "(a|b)")]); - assert_embedding_convertion(&[(false, "(|a)")]); - assert_embedding_convertion(&[(false, ".*ab")]); - assert_embedding_convertion(&[(true, "toto")]); - assert_embedding_convertion(&[(false, ".{2,3}")]); - assert_embedding_convertion(&[(false, "q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, ".*q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, "(abc){3,6}")]); - assert_embedding_convertion(&[(true, "((|a)abd+){3}")]); - - assert_embedding_convertion(&[(false, ".*a.*"), (false, ".*b.*"), (true, ".*abc.*")]); - Ok(()) - } - - fn assert_embedding_convertion(operations: &[(bool, &str)]) { - let mut automaton = FastAutomaton::new_total(); - let operations: Vec<(bool, RegularExpression)> = operations - .iter() - .map(|(not, regex)| { - let regex = RegularExpression::new(regex).unwrap(); - automaton = automaton.intersection(®ex.to_automaton().unwrap()).unwrap(); - (*not, regex) - }) - .collect(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_operations_embedding(&operations); - - // AI - let embedding_u8: Vec = RegexOperationsToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexOperationsToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_operations = tokenizer - .from_regex_operations_embedding(&embedding) - .unwrap(); - assert_eq!(operations, unembedded_operations); - } -} diff --git a/src/tokenizer/token/automaton_token.rs b/src/tokenizer/token/automaton_token.rs deleted file mode 100644 index 215ffed..0000000 --- a/src/tokenizer/token/automaton_token.rs +++ /dev/null @@ -1,115 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum AutomatonToken { - Range(RangeToken), - State(u16), - AcceptState, - SeparatorState, - Error, -} - -impl Ord for AutomatonToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for AutomatonToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl AutomatonToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_STATE: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_ACCEPT_STATE: u8 = Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES; - const TK_AI_SEPARATOR_STATE: u8 = Self::TK_AI_ACCEPT_STATE + 1; - - pub const AI_MAX_NUMBER_OF_STATES: u8 = 100; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_SEPARATOR_STATE + 1; - - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_STATE: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_ACCEPT_STATE: u16 = Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES; - const TK_FAIR_SEPARATOR_STATE: u16 = Self::TK_FAIR_ACCEPT_STATE + 1; - - pub const FAIR_MAX_NUMBER_OF_STATES: u16 = 65_000; - - pub const 
FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_SEPARATOR_STATE + 1; -} - -impl Token for AutomatonToken { - fn from_ai_token(token: u8) -> AutomatonToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - AutomatonToken::Range(RangeToken::from_ai_token(token)) - } else if (Self::TK_AI_STATE..Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State((token - Self::TK_AI_STATE) as u16) - } else if token == Self::TK_AI_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_AI_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_ai_token()?, - AutomatonToken::State(s) => { - let max = Self::AI_MAX_NUMBER_OF_STATES; - let s = *s as u8; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_AI_STATE - } - AutomatonToken::AcceptState => Self::TK_AI_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_AI_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(token: u16) -> AutomatonToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - AutomatonToken::Range(RangeToken::from_fair_token(token)) - } else if (Self::TK_FAIR_STATE..Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State(token - Self::TK_FAIR_STATE) - } else if token == Self::TK_FAIR_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_FAIR_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_fair_token()?, - AutomatonToken::State(s) => { - let max = Self::FAIR_MAX_NUMBER_OF_STATES; - let s = *s; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_FAIR_STATE - } - AutomatonToken::AcceptState => Self::TK_FAIR_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_FAIR_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/mod.rs b/src/tokenizer/token/mod.rs deleted file mode 100644 index 2f28e32..0000000 --- a/src/tokenizer/token/mod.rs +++ /dev/null @@ -1,60 +0,0 @@ -use std::fmt::Display; - -use super::*; - -pub mod automaton_token; -pub mod range_token; -pub mod regex_operations_token; -pub mod regex_token; - -#[derive(Debug, PartialEq, Eq)] -pub enum TokenError { - TokenOutOfBound(&'static str, usize, usize), - UnknownToken, - SyntaxError, -} - -impl Display for TokenError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TokenError::TokenOutOfBound(token, expected, got) => write!( - f, - "TokenOutOfBound: {token}, expected: {expected}, got: {got}." 
- ), - TokenError::UnknownToken => write!(f, "UnknownToken"), - TokenError::SyntaxError => write!(f, "SyntaxError"), - } - } -} - -pub trait Token { - fn from_ai_token(token: u8) -> Self; - - fn to_ai_token(&self) -> Result; - - fn to_ai_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_ai_token()?); - } - Ok(vec) - } - - fn from_fair_token(token: u16) -> Self; - - fn to_fair_token(&self) -> Result; - - fn to_fair_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_fair_token()?); - } - Ok(vec) - } -} diff --git a/src/tokenizer/token/range_token.rs b/src/tokenizer/token/range_token.rs deleted file mode 100644 index 62a1753..0000000 --- a/src/tokenizer/token/range_token.rs +++ /dev/null @@ -1,92 +0,0 @@ -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RangeToken { - Total, - Base(usize), - Error, -} - -impl RangeToken { - const TK_AI_TOTAL: u8 = 0; - const TK_AI_BASE: u8 = 1; - - pub const AI_MAX_NUMBER_OF_BASES: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES + 1; - - const TK_FAIR_TOTAL: u16 = 0; - const TK_FAIR_BASE: u16 = 1; - - pub const FAIR_MAX_NUMBER_OF_BASES: u16 = 127; - - pub const FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES + 1; -} - -impl Ord for RangeToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RangeToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Token for RangeToken { - fn from_ai_token(token: u8) -> RangeToken { - if token == Self::TK_AI_TOTAL { - RangeToken::Total - } else if (Self::TK_AI_BASE..Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_AI_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_AI_TOTAL, - RangeToken::Base(b) => { - let max = Self::AI_MAX_NUMBER_OF_BASES; - let b = *b as u8; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_AI_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(token: u16) -> RangeToken { - if token == Self::TK_FAIR_TOTAL { - RangeToken::Total - } else if (Self::TK_FAIR_BASE..Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_FAIR_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_FAIR_TOTAL, - RangeToken::Base(b) => { - let max = Self::FAIR_MAX_NUMBER_OF_BASES; - let b = *b as u16; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_FAIR_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/regex_operations_token.rs b/src/tokenizer/token/regex_operations_token.rs deleted file mode 100644 index 1074f7f..0000000 --- a/src/tokenizer/token/regex_operations_token.rs +++ /dev/null @@ -1,64 +0,0 @@ -use self::regex_token::RegexToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexOperationsToken { - 
RegexToken(RegexToken), - And, - Not, - Error, -} - -impl Ord for RegexOperationsToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_ai_token().unwrap()).cmp(&other.to_ai_token().unwrap()) - } -} - -impl PartialOrd for RegexOperationsToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexOperationsToken { - const TK_AI_REGEX_TOKEN: u8 = 0; - const TK_AI_AND: u8 = Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE; - const TK_AI_NOT: u8 = Self::TK_AI_AND + 1; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_NOT + 1; -} - -impl Token for RegexOperationsToken { - fn from_ai_token(token: u8) -> RegexOperationsToken { - if (Self::TK_AI_REGEX_TOKEN..Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE) - .contains(&token) - { - RegexOperationsToken::RegexToken(RegexToken::from_ai_token(token)) - } else if token == Self::TK_AI_AND { - RegexOperationsToken::And - } else if token == Self::TK_AI_NOT { - RegexOperationsToken::Not - } else { - RegexOperationsToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexOperationsToken::RegexToken(regex_token) => regex_token.to_ai_token()?, - RegexOperationsToken::And => Self::TK_AI_AND, - RegexOperationsToken::Not => Self::TK_AI_NOT, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(_: u16) -> RegexOperationsToken { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } - - fn to_fair_token(&self) -> Result { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } -} diff --git a/src/tokenizer/token/regex_token.rs b/src/tokenizer/token/regex_token.rs deleted file mode 100644 index 2f4c2f2..0000000 --- a/src/tokenizer/token/regex_token.rs +++ /dev/null @@ -1,137 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexToken { - Range(RangeToken), - StartGroup, - EndGroup, - Alternation, - RepetitionNone, - Repetition(u16), - Error, -} - -impl Ord for RegexToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RegexToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_START_GROUP: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_END_GROUP: u8 = Self::TK_AI_START_GROUP + 1; - const TK_AI_ALTERNATION: u8 = Self::TK_AI_END_GROUP + 1; - const TK_AI_REPETITION_NONE: u8 = Self::TK_AI_ALTERNATION + 1; - const TK_AI_REPETITION: u8 = Self::TK_AI_REPETITION_NONE + 1; - - pub const AI_MAX_NUMBER_OF_REPETITION: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = - Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION + 1; - - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_START_GROUP: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_END_GROUP: u16 = Self::TK_FAIR_START_GROUP + 1; - const TK_FAIR_ALTERNATION: u16 = Self::TK_FAIR_END_GROUP + 1; - const TK_FAIR_REPETITION_NONE: u16 = Self::TK_FAIR_ALTERNATION + 1; - const TK_FAIR_REPETITION: u16 = Self::TK_FAIR_REPETITION_NONE + 1; - - pub const FAIR_MAX_NUMBER_OF_REPETITION: u16 = 1024; - - pub const FAIR_VOCABULARY_SIZE: u16 = - Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION + 1; -} - -impl Token for RegexToken { - fn from_ai_token(token: u8) -> RegexToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + 
RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - RegexToken::Range(RangeToken::from_ai_token(token)) - } else if token == Self::TK_AI_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_AI_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_AI_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_AI_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_AI_REPETITION - ..Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition((token - Self::TK_AI_REPETITION) as u16) - } else { - RegexToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_ai_token()?, - RegexToken::StartGroup => Self::TK_AI_START_GROUP, - RegexToken::EndGroup => Self::TK_AI_END_GROUP, - RegexToken::Alternation => Self::TK_AI_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_AI_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::AI_MAX_NUMBER_OF_REPETITION; - let r = *r as u8; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_AI_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(token: u16) -> RegexToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - RegexToken::Range(RangeToken::from_fair_token(token)) - } else if token == Self::TK_FAIR_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_FAIR_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_FAIR_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_FAIR_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_FAIR_REPETITION - ..Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition(token - Self::TK_FAIR_REPETITION) - } else { - RegexToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_fair_token()?, - RegexToken::StartGroup => Self::TK_FAIR_START_GROUP, - RegexToken::EndGroup => Self::TK_FAIR_END_GROUP, - RegexToken::Alternation => Self::TK_FAIR_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_FAIR_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::FAIR_MAX_NUMBER_OF_REPETITION; - let r = *r; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_FAIR_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/tests/data/regex.txt b/tests/data/regex.txt index e5fb5df..31aa829 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -1,3 +1,5 @@ +(a*,a*)? +(?:\s*,\s*(?:0|1|0?\.\d+))? [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f] a{2,3} (abc|fg){2} @@ -19,4 +21,59 @@ a+(ba+)* [0-9]+[A-Z]* ù -^\d$ \ No newline at end of file +^\d$ +foo +bar? +baz+ +qux* +quux{3} +quuux{2,5} +quuuux{0,4} +.* +[aeiou] +[^aeiou] +[a-zA-Z0-9] +[\dA-Fa-f] +[\w&&[^_]] +[[:alpha:]]+ +[\p{L}]+ +[0-9]{2,4} +[01]?\d +[1-9][0-9]* +(cat|dog|mouse) +(?:red|green|blue){2} +(gr(a|e)y){1,3} +((ab|cd)ef)+ +(a(b(c|d)e)f)+ +(a|b(c|d(e|f))){2,3} +(?:abc){0,} +(?:abc){1,} +(?:abc){2,5} +a++ +\.\*\?\+\(\)\[\]\{\}\\\| +\u0041\u0042\u0043 +\p{Greek}+ +\p{Sc} +[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,} +\b((25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]?\d?\d)\b +https?://[^\s/$.?#][^\s]* +\d{4}/\d{2}/\d{2} +\d{1,2}:\d{2}(:\d{2})? 
+<([A-Za-z][A-Za-z0-9]*)\b[^>]*?/> +\{(?:[^{}]|\{[^{}]*\})*\} +\b(?:\d[ -]*?){13,16}\b +#([A-Fa-f0-9]{8}) +(a|b|c|d|e|f|g|h|i|j){5} +(?:"[^"]*"|[^,]*)(?:,(?:"[^"]*"|[^,]*))* +\b([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b +\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b +[[:alnum:]&&[^0-9]] +[ \t]+ +[\r\n]+ +[^\t\r\n]+ +(a*,a*)* +#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) +\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} +rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) +[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? +<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 1e572a9..319f261 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -9,31 +9,26 @@ use regexsolver::regex::RegularExpression; fn assert_regex(regex: &str) { let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, true).unwrap(); let automaton = regex.to_automaton().unwrap(); let strings = automaton.generate_strings(500).unwrap(); for string in strings { assert!(re.is_match(&string), "'{string}'"); } - assert_eq!( - automaton.get_number_of_states(), - regex.get_number_of_states_in_nfa() - ); - let determinized_automaton = automaton.determinize().unwrap(); let strings = determinized_automaton.generate_strings(500).unwrap(); for string in strings { assert!(re.is_match(&string), "'{string}'"); } - assert!(automaton.is_subset_of(&determinized_automaton).unwrap()); - assert!(determinized_automaton.is_subset_of(&automaton).unwrap()); - assert!(automaton.is_equivalent_of(&determinized_automaton).unwrap()); + assert!(automaton.subset(&determinized_automaton).unwrap()); + assert!(determinized_automaton.subset(&automaton).unwrap()); + assert!(automaton.equivalent(&determinized_automaton).unwrap()); - let regex_from_automaton = automaton.to_regex().unwrap(); + let regex_from_automaton = automaton.to_regex(); let automaton_from_regex = regex_from_automaton.to_automaton().unwrap(); - assert!(automaton.is_equivalent_of(&automaton_from_regex).unwrap()); + assert!(automaton.equivalent(&automaton_from_regex).unwrap()); } #[test]