Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

regex-redux crossbeamed #81

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 6 additions & 5 deletions Makefile
Expand Up @@ -2,12 +2,12 @@ SOURCES = $(wildcard src/*.rs)
# Compiler and flags. `?=` keeps any value that is already set, so these act as defaults.
RUSTC ?= rustc
RUSTC_FLAGS ?= -C opt-level=3 -C target-cpu=core2 -C lto
# Always append ./lib to the library search path, whatever RUSTC_FLAGS already holds.
RUSTC_FLAGS += -L ./lib
# Pinned dependencies, each written as <crate>-<version>.
# NOTE(review): REGEX and RAYON each appear twice below (old and new pin side by side).
# With `?=` only the first assignment of a variable takes effect — confirm that only
# the intended pin remains in the real Makefile.
REGEX ?= regex-1.0.4
REGEX ?= regex-1.1.5
ARENA ?= typed-arena-1.4.1
FUTURES_CPUPOOL ?= futures-cpupool-0.1.8
RAYON ?= rayon-1.0.2
RAYON ?= rayon-1.0.3
INDEXMAP ?= indexmap-1.0.1
CROSSBEAM ?= crossbeam-0.4.1
CROSSBEAM_UTILS ?= crossbeam-utils-0.6.5

# version: split the argument on '-' and keep the last word (the version number).
version=$(lastword $(subst -, , $1))
# crate: remove the "-<version>" suffix from the argument and strip surrounding whitespace.
crate=$(strip $(subst -$(call version, $1),, $1))
Expand All @@ -22,11 +22,12 @@ clean:
# distclean: run `clean` first (prerequisite), then delete bin, out, tmp and lib.
distclean: clean
rm -fr bin out tmp lib

# Per-binary prerequisites: each binary depends on the lib/<crate>-<version>.pkg
# file(s) of the crates named on its line.
bin/binary_trees: lib/$(ARENA).pkg lib/$(RAYON).pkg
# NOTE(review): TOOLSHED is not assigned anywhere in the variable block visible in this
# diff (ARENA is) — confirm it is defined, otherwise this expands to lib/.pkg.
bin/binary_trees: lib/$(TOOLSHED).pkg lib/$(RAYON).pkg
bin/binary_trees2: lib/$(RAYON).pkg
bin/fannkuch_redux: lib/$(RAYON).pkg
bin/k_nucleotide: lib/$(FUTURES_CPUPOOL).pkg lib/$(INDEXMAP).pkg
bin/mandelbrot: lib/$(RAYON).pkg
bin/regex_redux: lib/$(REGEX).pkg
# NOTE(review): this rule requires the crossbeam-utils package, but src/regex_redux.rs in
# the same change declares `extern crate crossbeam;` and imports `crossbeam::scope` —
# confirm the crate named here matches the one the source actually links against.
bin/regex_redux: lib/$(REGEX).pkg lib/$(CROSSBEAM_UTILS).pkg
bin/reverse_complement: lib/$(RAYON).pkg
bin/spectralnorm: lib/$(RAYON).pkg

Expand Down
121 changes: 63 additions & 58 deletions src/regex_redux.rs
@@ -1,74 +1,79 @@
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// regex-dna program contributed by the Rust Project Developers
// contributed by BurntSushi
// contributed by TeXitoi
// converted from regex-dna program
// contributed by Matt Brubeck
// contributed by Tom Kaitchuck
// contributed by Andre Bogus

extern crate crossbeam;
extern crate regex;

use std::borrow::Cow;
use std::fs;
use std::sync::Arc;
use std::thread;
use crossbeam::scope;
use regex::bytes::Regex;
use std::{
borrow::Cow,
io::{self, Read},
};

macro_rules! regex { ($re:expr) => { ::regex::bytes::Regex::new($re).unwrap() } }

fn main() {
let mut seq = fs::read("/dev/stdin").unwrap();
let ilen = seq.len();

// Remove headers and newlines.
seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned();
let clen = seq.len();
/// Compiles the pattern `s` into a byte-oriented `Regex`.
///
/// # Panics
///
/// Panics if `s` is not a valid regular expression.
fn regex(s: &str) -> Regex {
    let compiled = Regex::new(s);
    compiled.unwrap()
}

/// Builds the per-pattern match report for `sequence`.
///
/// For every entry of `VARIANTS`, in order, the pattern is compiled and its
/// matches in `sequence` are counted. The result is one `String` holding a
/// `"<pattern> <count>\n"` line per variant.
///
/// # Panics
///
/// Panics if a pattern fails to compile (`regex` unwraps the result).
fn count_reverse_complements(sequence: &[u8]) -> String {
// Search for occurrences of the following patterns:
let variants = vec![
regex!("agggtaaa|tttaccct"),
regex!("[cgt]gggtaaa|tttaccc[acg]"),
regex!("a[act]ggtaaa|tttacc[agt]t"),
regex!("ag[act]gtaaa|tttac[agt]ct"),
regex!("agg[act]taaa|ttta[agt]cct"),
regex!("aggg[acg]aaa|ttt[cgt]ccct"),
regex!("agggt[cgt]aa|tt[acg]accct"),
regex!("agggta[cgt]a|t[acg]taccct"),
regex!("agggtaa[cgt]|[acg]ttaccct"),
// Patterns are kept as plain strings so the same text can be both printed
// and compiled.
static VARIANTS: &[&str] = &[
"agggtaaa|tttaccct",
"[cgt]gggtaaa|tttaccc[acg]",
"a[act]ggtaaa|tttacc[agt]t",
"ag[act]gtaaa|tttac[agt]ct",
"agg[act]taaa|ttta[agt]cct",
"aggg[acg]aaa|ttt[cgt]ccct",
"agggt[cgt]aa|tt[acg]accct",
"agggta[cgt]a|t[acg]taccct",
"agggtaa[cgt]|[acg]ttaccct",
];
VARIANTS
.iter()
.map(|variant| {
// One output line per variant: the pattern text, a space, the match count.
format!(
"{} {}\n",
variant,
// Each pattern is compiled here, once per variant, then run over the whole sequence.
regex(variant).find_iter(sequence).count()
)
})
// Collecting the formatted lines into a `String` concatenates them in order.
.collect()
}

// Count each pattern in parallel. Use an Arc (atomic reference-counted
// pointer) to share the sequence between threads without copying it.
let seq_arc = Arc::new(seq);
let mut counts = vec![];
for variant in variants {
let seq = seq_arc.clone();
let restr = variant.to_string();
let future = thread::spawn(move || variant.find_iter(&seq).count());
counts.push((restr, future));
}

/// Applies every substitution in `SUBSTS` to `sequence`, in order, and returns
/// the byte length of the final result.
///
/// Each replacement runs on the output of the previous one, so later patterns
/// see the text produced by earlier replacements.
///
/// # Panics
///
/// Panics if a pattern fails to compile (`regex` unwraps the result).
fn find_replaced_sequence_length(sequence: &[u8]) -> usize {
// Replace the following patterns, one at a time:
let substs = vec![
(regex!("tHa[Nt]"), &b"<4>"[..]),
(regex!("aND|caN|Ha[DS]|WaS"), &b"<3>"[..]),
(regex!("a[NSt]|BY"), &b"<2>"[..]),
(regex!("<[^>]*>"), &b"|"[..]),
(regex!("\\|[^|][^|]*\\|"), &b"-"[..]),
// (pattern, replacement bytes) pairs; the patterns are compiled inside the loop below.
static SUBSTS: &[(&str, &[u8])] = &[
("tHa[Nt]", b"<4>"),
("aND|caN|Ha[DS]|WaS", b"<3>"),
("a[NSt]|BY", b"<2>"),
("<[^>]*>", b"|"),
("\\|[^|][^|]*\\|", b"-"),
];

// Use Cow here to avoid one extra copy of the sequence, by borrowing from
// the Arc during the first iteration.
let mut seq = Cow::Borrowed(&seq_arc[..]);

// Start from a borrow of the caller's slice; it only becomes owned once the
// first replacement result is stored.
let mut seq = Cow::Borrowed(sequence);
// Perform the replacements in sequence:
for (re, replacement) in substs {
seq = Cow::Owned(re.replace_all(&seq, replacement).into_owned());
for (re, replacement) in SUBSTS.iter().cloned() {
// Compile the pattern, replace all matches, and keep the owned result for the next round.
seq = Cow::Owned(regex(re).replace_all(&seq, replacement).into_owned());
}
// Only the length of the fully substituted sequence is reported.
seq.len()
}

// Print the results:
for (variant, count) in counts {
println!("{} {}", variant, count.join().unwrap());
}
println!("\n{}\n{}\n{}", ilen, clen, seq.len());
/// Program entry point.
///
/// Reads all of standard input, strips header lines (`>` up to the newline)
/// and newlines, then prints, one per line: the per-variant match report, a
/// blank line, the raw input length, the stripped sequence length, and the
/// length after all substitutions.
///
/// The substitution pass runs on a scoped thread while the variant counts are
/// computed on the current thread.
///
/// # Panics
///
/// Panics if reading stdin fails, if a pattern fails to compile, or if the
/// spawned thread panics.
fn main() {
    // Pre-size the buffer to 51 MiB.
    // NOTE(review): presumably sized for the benchmark input — confirm.
    let mut input = Vec::with_capacity(51 * (1 << 20));
    io::stdin().read_to_end(&mut input).unwrap();

    // Remove headers and newlines.
    let header_or_newline = regex(">[^\n]*\n|\n");
    let sequence = header_or_newline.replace_all(&input, &b""[..]);

    let outcome = scope(|s| {
        // Start the substitution pass in the background; it borrows `sequence`.
        let replaced = s.spawn(|_| find_replaced_sequence_length(&sequence));

        // Count the variants on this thread, then wait for the background result.
        let counts = count_reverse_complements(&sequence[..]);
        let input_len = input.len();
        let sequence_len = sequence.len();
        let replaced_len = replaced.join().unwrap();

        // `counts` already ends with a newline, so the first "\n" yields the blank line.
        println!(
            "{}\n{}\n{}\n{}",
            counts, input_len, sequence_len, replaced_len
        );
    });
    outcome.unwrap();
}