diff --git a/Makefile b/Makefile index 97aa372..c93f316 100644 --- a/Makefile +++ b/Makefile @@ -2,12 +2,12 @@ SOURCES = $(wildcard src/*.rs) RUSTC ?= rustc RUSTC_FLAGS ?= -C opt-level=3 -C target-cpu=core2 -C lto RUSTC_FLAGS += -L ./lib -REGEX ?= regex-1.0.4 +REGEX ?= regex-1.1.5 ARENA ?= typed-arena-1.4.1 FUTURES_CPUPOOL ?= futures-cpupool-0.1.8 -RAYON ?= rayon-1.0.2 +RAYON ?= rayon-1.0.3 INDEXMAP ?= indexmap-1.0.1 -CROSSBEAM ?= crossbeam-0.4.1 +CROSSBEAM_UTILS ?= crossbeam-utils-0.6.5 version=$(lastword $(subst -, , $1)) crate=$(strip $(subst -$(call version, $1),, $1)) @@ -22,11 +22,12 @@ clean: distclean: clean rm -fr bin out tmp lib -bin/binary_trees: lib/$(ARENA).pkg lib/$(RAYON).pkg +bin/binary_trees: lib/$(TOOLSHED).pkg lib/$(RAYON).pkg +bin/binary_trees2: lib/$(RAYON).pkg bin/fannkuch_redux: lib/$(RAYON).pkg bin/k_nucleotide: lib/$(FUTURES_CPUPOOL).pkg lib/$(INDEXMAP).pkg bin/mandelbrot: lib/$(RAYON).pkg -bin/regex_redux: lib/$(REGEX).pkg +bin/regex_redux: lib/$(REGEX).pkg lib/$(CROSSBEAM_UTILS).pkg bin/reverse_complement: lib/$(RAYON).pkg bin/spectralnorm: lib/$(RAYON).pkg diff --git a/src/regex_redux.rs b/src/regex_redux.rs index cd52a41..9825806 100644 --- a/src/regex_redux.rs +++ b/src/regex_redux.rs @@ -1,74 +1,79 @@ // The Computer Language Benchmarks Game -// http://benchmarksgame.alioth.debian.org/ +// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ // -// regex-dna program contributed by the Rust Project Developers -// contributed by BurntSushi -// contributed by TeXitoi -// converted from regex-dna program -// contributed by Matt Brubeck +// contributed by Tom Kaitchuck +// contributed by Andre Bogus +extern crate crossbeam; extern crate regex; -use std::borrow::Cow; -use std::fs; -use std::sync::Arc; -use std::thread; +use crossbeam::scope; +use regex::bytes::Regex; +use std::{ + borrow::Cow, + io::{self, Read}, +}; -macro_rules! regex { ($re:expr) => { ::regex::bytes::Regex::new($re).unwrap() } } - -fn main() { - let mut seq = fs::read("/dev/stdin").unwrap(); - let ilen = seq.len(); - - // Remove headers and newlines. - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); - let clen = seq.len(); +fn regex(s: &str) -> Regex { + Regex::new(s).unwrap() +} +fn count_reverse_complements(sequence: &[u8]) -> String { // Search for occurrences of the following patterns: - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), + static VARIANTS: &[&str] = &[ + "agggtaaa|tttaccct", + "[cgt]gggtaaa|tttaccc[acg]", + "a[act]ggtaaa|tttacc[agt]t", + "ag[act]gtaaa|tttac[agt]ct", + "agg[act]taaa|ttta[agt]cct", + "aggg[acg]aaa|ttt[cgt]ccct", + "agggt[cgt]aa|tt[acg]accct", + "agggta[cgt]a|t[acg]taccct", + "agggtaa[cgt]|[acg]ttaccct", ]; + VARIANTS + .iter() + .map(|variant| { + format!( + "{} {}\n", + variant, + regex(variant).find_iter(sequence).count() + ) + }) + .collect() +} - // Count each pattern in parallel. Use an Arc (atomic reference-counted - // pointer) to share the sequence between threads without copying it. - let seq_arc = Arc::new(seq); - let mut counts = vec![]; - for variant in variants { - let seq = seq_arc.clone(); - let restr = variant.to_string(); - let future = thread::spawn(move || variant.find_iter(&seq).count()); - counts.push((restr, future)); - } - +fn find_replaced_sequence_length(sequence: &[u8]) -> usize { // Replace the following patterns, one at a time: - let substs = vec![ - (regex!("tHa[Nt]"), &b"<4>"[..]), - (regex!("aND|caN|Ha[DS]|WaS"), &b"<3>"[..]), - (regex!("a[NSt]|BY"), &b"<2>"[..]), - (regex!("<[^>]*>"), &b"|"[..]), - (regex!("\\|[^|][^|]*\\|"), &b"-"[..]), + static SUBSTS: &[(&str, &[u8])] = &[ + ("tHa[Nt]", b"<4>"), + ("aND|caN|Ha[DS]|WaS", b"<3>"), + ("a[NSt]|BY", b"<2>"), + ("<[^>]*>", b"|"), + ("\\|[^|][^|]*\\|", b"-"), ]; - - // Use Cow here to avoid one extra copy of the sequence, by borrowing from - // the Arc during the first iteration. - let mut seq = Cow::Borrowed(&seq_arc[..]); - + let mut seq = Cow::Borrowed(sequence); // Perform the replacements in sequence: - for (re, replacement) in substs { - seq = Cow::Owned(re.replace_all(&seq, replacement).into_owned()); + for (re, replacement) in SUBSTS.iter().cloned() { + seq = Cow::Owned(regex(re).replace_all(&seq, replacement).into_owned()); } + seq.len() +} - // Print the results: - for (variant, count) in counts { - println!("{} {}", variant, count.join().unwrap()); - } - println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +fn main() { + let mut input = Vec::with_capacity(51 * (1 << 20)); + io::stdin().read_to_end(&mut input).unwrap(); + let sequence = regex(">[^\n]*\n|\n").replace_all(&input, &b""[..]); + scope(|s| { + let result = s.spawn(|_| find_replaced_sequence_length(&sequence)); + + println!( + "{}\n{}\n{}\n{}", + count_reverse_complements(&sequence[..]), + input.len(), + sequence.len(), + result.join().unwrap() + ); + }) + .unwrap(); }