-
Notifications
You must be signed in to change notification settings - Fork 124
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Allow line and block comments. (#512)
- Loading branch information
1 parent
fb107e6
commit aee72b8
Showing
2 changed files
with
289 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,272 @@ | ||
//! Provides an Iterator<Item=u8> decorator that uses a finite state machine to exclude comments | ||
//! from a string in linear time and constant space. | ||
|
||
use std::iter::Peekable; | ||
|
||
pub(crate) trait Exclude: Sized + Iterator<Item = u8> { | ||
fn exclude_comments(self) -> ExcludingComments<Self>; | ||
} | ||
|
||
impl<T: Iterator<Item = u8>> Exclude for T { | ||
fn exclude_comments(self) -> ExcludingComments<T> { | ||
ExcludingComments::new_from_iter(self) | ||
} | ||
} | ||
|
||
pub(crate) struct ExcludingComments<I: Iterator<Item = u8>> { | ||
state: State, | ||
iter: Peekable<I>, | ||
} | ||
|
||
impl<I: Iterator<Item = u8>> Iterator for ExcludingComments<I> { | ||
type Item = u8; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
let next_byte = self.next_byte(); | ||
if next_byte.is_none() { | ||
match self.state { | ||
State::BlockComment | State::PotentiallyLeavingBlockComment => { | ||
panic!("block comment not terminated with */") | ||
} | ||
_ => {} | ||
} | ||
} | ||
next_byte | ||
} | ||
} | ||
|
||
/// States of the comment removal machine: | ||
/// <pre> | ||
/// Normal | ||
/// '/' | ||
/// PotentialComment | ||
/// '/' '*' | ||
/// LineComment BlockComment | ||
/// '\n' '*' | ||
/// Normal PotentiallyLeavingBlockComment | ||
/// '/' '_' | ||
/// Normal BlockComment | ||
/// </pre> | ||
enum State { | ||
Normal, | ||
PotentialComment { previous: u8 }, | ||
LineComment, | ||
BlockComment, | ||
PotentiallyLeavingBlockComment, | ||
} | ||
|
||
impl<I: Iterator<Item = u8>> ExcludingComments<I> { | ||
fn new_from_iter(iter: I) -> Self { | ||
Self { | ||
state: State::Normal, | ||
iter: iter.peekable(), | ||
} | ||
} | ||
|
||
fn next_byte(&mut self) -> Option<u8> { | ||
loop { | ||
return match self.state { | ||
State::Normal => { | ||
let next = self.iter.next()?; | ||
match next { | ||
b'/' => { | ||
self.state = State::PotentialComment { previous: next }; | ||
continue; | ||
} | ||
_ => Some(next), | ||
} | ||
} | ||
State::PotentialComment { previous } => { | ||
let peeked_next = self.iter.peek()?; | ||
match peeked_next { | ||
b'/' => { | ||
// second /, enter line comment and consume | ||
self.iter.next(); | ||
self.state = State::LineComment; | ||
continue; | ||
} | ||
b'*' => { | ||
/* entering a block comment consume '*' */ | ||
self.iter.next(); | ||
self.state = State::BlockComment; | ||
continue; | ||
} | ||
_ => { | ||
// here we need to emit the previous character (the first '/') | ||
// and do not consume the current character | ||
self.state = State::Normal; | ||
return Some(previous); | ||
} | ||
} | ||
} | ||
State::LineComment => { | ||
let next = self.iter.next()?; | ||
match next { | ||
b'\n' => { | ||
self.state = State::Normal; | ||
return Some(next); | ||
} | ||
_ => { | ||
// ignore all other characters while in the line comment | ||
continue; | ||
} | ||
} | ||
} | ||
State::BlockComment => { | ||
let next = self.iter.next()?; | ||
match next { | ||
b'*' => { | ||
self.state = State::PotentiallyLeavingBlockComment; | ||
continue; | ||
} | ||
_ => { | ||
/* ignore all other characters while in the block comment */ | ||
continue; | ||
} | ||
} | ||
} | ||
State::PotentiallyLeavingBlockComment => { | ||
let next = self.iter.next()?; | ||
match next { | ||
b'/' => { | ||
/* Left the block comment */ | ||
self.state = State::Normal; | ||
continue; | ||
} | ||
_ => { | ||
/* we're still in the block comment */ | ||
self.state = State::BlockComment; | ||
continue; | ||
} | ||
} | ||
} | ||
}; | ||
} | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use std::vec::IntoIter; | ||
|
||
use super::*; | ||
|
||
/// Converts the input to an iterator of u8, excludes comments, maps back to char and collects | ||
/// the results. | ||
fn exclude_comments(input: &str) -> String { | ||
let excluding_comments: ExcludingComments<IntoIter<u8>> = input | ||
.to_string() | ||
.into_bytes() | ||
.into_iter() | ||
.exclude_comments(); | ||
excluding_comments.map(|b| b as char).collect() | ||
} | ||
|
||
#[test] | ||
fn empty() { | ||
assert!(exclude_comments("").is_empty()); | ||
} | ||
|
||
#[test] | ||
fn single_char() { | ||
assert_eq!(exclude_comments("0"), "0"); | ||
} | ||
|
||
#[test] | ||
fn two_chars() { | ||
assert_eq!(exclude_comments("ab"), "ab"); | ||
} | ||
|
||
#[test] | ||
fn comment() { | ||
assert_eq!(exclude_comments("ab//cd"), "ab"); | ||
} | ||
|
||
#[test] | ||
fn comments_are_ended_by_new_line() { | ||
assert_eq!(exclude_comments("ab//comment\nde"), "ab\nde"); | ||
} | ||
|
||
#[test] | ||
fn new_lines_without_comments() { | ||
assert_eq!(exclude_comments("ab\nde"), "ab\nde"); | ||
} | ||
|
||
#[test] | ||
fn single_slash_is_not_excluded() { | ||
assert_eq!(exclude_comments("ab/cd"), "ab/cd"); | ||
} | ||
|
||
#[test] | ||
fn line_comments_on_multiple_lines() { | ||
assert_eq!( | ||
exclude_comments( | ||
" | ||
line 1 //comment 1 | ||
line 2 // comment 2 // comment 3 | ||
line 3 | ||
line 4 // comment 4" | ||
), | ||
" | ||
line 1 | ||
line 2 | ||
line 3 | ||
line 4 " | ||
); | ||
} | ||
|
||
#[test] | ||
fn block_comment() { | ||
assert_eq!(exclude_comments("ab/*comment*/12"), "ab12"); | ||
} | ||
|
||
#[test] | ||
fn empty_block_comment() { | ||
assert_eq!(exclude_comments("ab/**/12"), "ab12"); | ||
} | ||
|
||
#[test] | ||
fn block_comment_with_asterisk_and_slash_inside() { | ||
assert_eq!(exclude_comments("ab/*false * asterisk and / */12"), "ab12"); | ||
} | ||
|
||
#[test] | ||
fn block_comment_within_line_comment() { | ||
assert_eq!(exclude_comments("ab// /*comment*/12"), "ab"); | ||
} | ||
|
||
#[test] | ||
#[should_panic(expected = "block comment not terminated with */")] | ||
fn block_comment_not_terminated() { | ||
exclude_comments("ab /*comment"); | ||
} | ||
|
||
#[test] | ||
#[should_panic(expected = "block comment not terminated with */")] | ||
fn block_comment_not_completely_terminated() { | ||
exclude_comments("ab /*comment*"); | ||
} | ||
|
||
#[test] | ||
fn block_and_line_comments_on_multiple_lines() { | ||
assert_eq!( | ||
exclude_comments( | ||
" | ||
line 1 /* comment 1 */ | ||
line /* comment 2 */2 // line comment 1 | ||
line 3 /* some comments | ||
over multiple lines | ||
*/ | ||
line 4 /* more multiline comments | ||
* with leading | ||
* asterisks | ||
*/end// line comment 2" | ||
), | ||
" | ||
line 1 | ||
line 2 | ||
line 3 | ||
line 4 end" | ||
); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters