From 1135a925426a65a7f65b4f8c44da9904127ac554 Mon Sep 17 00:00:00 2001 From: Matt Brubeck Date: Fri, 3 Mar 2017 12:24:37 -0800 Subject: [PATCH] reverse_complement: Do line wrapping and reverse complement in one step --- src/reverse_complement.rs | 231 +++++++++++++++++++------------------- 1 file changed, 116 insertions(+), 115 deletions(-) diff --git a/src/reverse_complement.rs b/src/reverse_complement.rs index 616fc26..affe4ac 100644 --- a/src/reverse_complement.rs +++ b/src/reverse_complement.rs @@ -2,148 +2,150 @@ // http://benchmarksgame.alioth.debian.org/ // // contributed by the Rust Project Developers -// contributed by Matt Brubeck // contributed by Cristi Cobzarenco (@cristicbz) // contributed by TeXitoi +// contributed by Matt Brubeck extern crate rayon; extern crate memchr; use std::io::{Read, Write}; -use std::{io, ptr, slice}; +use std::{cmp, io}; use std::fs::File; - -struct Tables { - table8: [u8;1 << 8], - table16: [u16;1 << 16] +use std::mem::replace; + +/// Lookup table to find the complement of a single FASTA code. +fn build_table() -> [u8; 256] { + let mut table = [0; 256]; + for (i, x) in table.iter_mut().enumerate() { + *x = match i as u8 as char { + 'A' | 'a' => 'T', + 'C' | 'c' => 'G', + 'G' | 'g' => 'C', + 'T' | 't' => 'A', + 'U' | 'u' => 'A', + 'M' | 'm' => 'K', + 'R' | 'r' => 'Y', + 'W' | 'w' => 'W', + 'S' | 's' => 'S', + 'Y' | 'y' => 'R', + 'K' | 'k' => 'M', + 'V' | 'v' => 'B', + 'H' | 'h' => 'D', + 'D' | 'd' => 'H', + 'B' | 'b' => 'V', + 'N' | 'n' => 'N', + i => i, + } as u8; + } + table } -impl Tables { - fn new() -> Tables { - let mut table8 = [0;1 << 8]; - for (i, v) in table8.iter_mut().enumerate() { - *v = Tables::computed_cpl8(i as u8); - } - let mut table16 = [0;1 << 16]; - for (i, v) in table16.iter_mut().enumerate() { - *v = (table8[i & 255] as u16) << 8 | - table8[i >> 8] as u16; - } - Tables { table8: table8, table16: table16 } +/// Utilities for splitting chunks off of slices. +trait SplitOff { + fn split_off_left(&mut self, n: usize) -> Self; + fn split_off_right(&mut self, n: usize) -> Self; +} +impl<'a, T> SplitOff for &'a mut [T] { + /// Split the left `n` items from self and return them as a separate slice. + fn split_off_left(&mut self, n: usize) -> Self { + let n = cmp::min(self.len(), n); + let data = replace(self, &mut []); + let (left, data) = data.split_at_mut(n); + *self = data; + left } - - fn computed_cpl8(c: u8) -> u8 { - match c { - b'A' | b'a' => b'T', - b'C' | b'c' => b'G', - b'G' | b'g' => b'C', - b'T' | b't' => b'A', - b'U' | b'u' => b'A', - b'M' | b'm' => b'K', - b'R' | b'r' => b'Y', - b'W' | b'w' => b'W', - b'S' | b's' => b'S', - b'Y' | b'y' => b'R', - b'K' | b'k' => b'M', - b'V' | b'v' => b'B', - b'H' | b'h' => b'D', - b'D' | b'd' => b'H', - b'B' | b'b' => b'V', - b'N' | b'n' => b'N', - i => i, - } + /// Split the right `n` items from self and return them as a separate slice. + fn split_off_right(&mut self, n: usize) -> Self { + let len = self.len(); + let n = cmp::min(len, n); + let data = replace(self, &mut []); + let (data, right) = data.split_at_mut(len - n); + *self = data; + right } +} - /// Retrieves the complement for `i`. - fn cpl8(&self, i: u8) -> u8 { - self.table8[i as usize] - } +/// Length of a normal line including the terminating \n. +const LINE_LEN: usize = 61; +const SEQUENTIAL_SIZE: usize = 2048; - /// Retrieves the complement for `i`. - fn cpl16(&self, i: u16) -> u16 { - self.table16[i as usize] +/// Compute the reverse complement for two contiguous chunks without line breaks. +fn reverse_complement_chunk(left: &mut [u8], right: &mut [u8], table: &[u8; 256]) { + for (x, y) in left.iter_mut().zip(right.iter_mut().rev()) { + *y = table[replace(x, table[*y as usize]) as usize]; } } -/// Length of a normal line without the terminating \n. -const LINE_LEN: usize = 60; -const SEQUENTIAL_SIZE: usize = 1024; - -/// Compute the reverse complement with the sequence split into two equal-sized slices. -fn reverse_complement_left_right(left: &mut [u16], right: &mut [u16], tables: &Tables) { +/// Compute the reverse complement on chunks from opposite ends of a sequence. +/// +/// `left` must start at the beginning of a line. If there are an odd number of bytes, `right` +/// will initially be 1 byte longer than `left`; otherwise they will have equal lengths. +fn reverse_complement_left_right(mut left: &mut [u8], + mut right: &mut [u8], + trailing_len: usize, + table: &[u8; 256]) { let len = left.len(); if len <= SEQUENTIAL_SIZE { - assert_eq!(right.len(), len); - for (left, right) in left.iter_mut().zip(right.iter_mut().rev()) { - let tmp = tables.cpl16(*left); - *left = tables.cpl16(*right); - *right = tmp; + // Each iteration swaps one line from the start of the sequence with one from the end. + while left.len() > 0 || right.len() > 0 { + // Get the chunk up to the newline in `right`. + let mut a = left.split_off_left(trailing_len); + let mut b = right.split_off_right(trailing_len); + + // If we've reached the middle of the sequence here and there is an odd number of + // bytes remaining, the odd one will be on the right. + if b.len() > a.len() { + let mid = b.split_off_left(1); + mid[0] = table[mid[0] as usize]; + } + reverse_complement_chunk(a, b, table); + + // Skip the newline in `right`. + right.split_off_right(1); + + // Get the chunk up to the newline in `left`. + let n = LINE_LEN - 1 - trailing_len; + a = left.split_off_left(n); + b = right.split_off_right(n); + + // If we've reached the middle of the sequence here and there is an odd number of + // bytes remaining, the odd one will now be on the left. + if a.len() > b.len() { + let mid = a.split_off_right(1); + mid[0] = table[mid[0] as usize] + } + reverse_complement_chunk(a, b, table); + + // Skip the newline in `left`. + left.split_off_left(1); } } else { - let (left1, left2) = left.split_at_mut((len + 1) / 2); - let (right2, right1) = right.split_at_mut(len / 2); - rayon::join(|| reverse_complement_left_right(left1, right1, tables), - || reverse_complement_left_right(left2, right2, tables)); - } -} + let line_count = len / LINE_LEN; + let mid = line_count / 2 * LINE_LEN; // Split on a whole number of lines. -/// Split a byte slice into two u16-halves with any remainder left in the middle. -fn split_mut_middle_as_u16<'a>(seq: &'a mut [u8]) -> (&'a mut [u16], &'a mut [u8], &'a mut [u16]) { - let len = seq.len(); - let div = len / 4; - let rem = len % 4; - unsafe { - let left_ptr = seq.as_mut_ptr(); - // This is slow if len % 2 != 0 but still faster than bytewise operations. - let right_ptr = left_ptr.offset((div * 2 + rem) as isize); - (slice::from_raw_parts_mut(left_ptr as *mut u16, div), - slice::from_raw_parts_mut(left_ptr.offset((div * 2) as isize), rem), - slice::from_raw_parts_mut(right_ptr as *mut u16, div)) + let left1 = left.split_off_left(mid); + let right1 = right.split_off_right(mid); + rayon::join(|| reverse_complement_left_right(left, right, trailing_len, table), + || reverse_complement_left_right(left1, right1, trailing_len, table)); } } -/// Compute the reverse complement. -fn reverse_complement(seq: &mut [u8], tables: &Tables) { +/// Compute the reverse complement of one sequence. +fn reverse_complement(seq: &mut [u8], table: &[u8; 256]) { let len = seq.len() - 1; - let seq = &mut seq[..len];// Drop the last newline - - // Move newlines so the reversed text is wrapped correctly. - let off = LINE_LEN - len % (LINE_LEN + 1); - let mut i = LINE_LEN; - while i < len { - unsafe { - ptr::copy(seq.as_ptr().offset((i - off) as isize), - seq.as_mut_ptr().offset((i - off + 1) as isize), off); - *seq.get_unchecked_mut(i - off) = b'\n'; - } - i += LINE_LEN + 1; - } - let (left, middle, right) = split_mut_middle_as_u16(seq); - reverse_complement_left_right(left, right, tables); - - match middle.len() { - 0 => {} - 1 => middle[0] = tables.cpl8(middle[0]), - 2 => { - let tmp = tables.cpl8(middle[0]); - middle[0] = tables.cpl8(middle[1]); - middle[1] = tmp; - }, - 3 => { - middle[1] = tables.cpl8(middle[1]); - let tmp = tables.cpl8(middle[0]); - middle[0] = tables.cpl8(middle[2]); - middle[2] = tmp; - }, - _ => unreachable!() - } + let seq = &mut seq[..len]; // Drop the last newline + let trailing_len = len % LINE_LEN; + let (left, right) = seq.split_at_mut(len / 2); + reverse_complement_left_right(left, right, trailing_len, table); } fn file_size(f: &mut File) -> io::Result { Ok(f.metadata()?.len() as usize) } -fn split_and_reverse<'a>(data: &mut [u8], tables: &Tables) { +/// Locate each DNA sequence in the input file and reverse it. +fn split_and_reverse<'a>(data: &mut [u8], table: &[u8; 256]) { let data = match memchr::memchr(b'\n', data) { Some(i) => &mut data[i + 1..], None => return, @@ -152,10 +154,10 @@ fn split_and_reverse<'a>(data: &mut [u8], tables: &Tables) { match memchr::memchr(b'>', data) { Some(i) => { let (head, tail) = data.split_at_mut(i); - rayon::join(|| reverse_complement(head, tables), - || split_and_reverse(tail, tables)); + rayon::join(|| reverse_complement(head, table), + || split_and_reverse(tail, table)); } - None => reverse_complement(data, tables), + None => reverse_complement(data, table), }; } @@ -164,9 +166,8 @@ fn main() { let size = file_size(&mut stdin).unwrap_or(1024 * 1024); let mut data = Vec::with_capacity(size + 1); stdin.read_to_end(&mut data).unwrap(); - let tables = &Tables::new(); - split_and_reverse(&mut data, tables); + split_and_reverse(&mut data, &build_table()); let stdout = io::stdout(); stdout.lock().write_all(&data).unwrap(); }