diff --git a/src/reverse_complement.rs b/src/reverse_complement.rs index 616fc26..97ae108 100644 --- a/src/reverse_complement.rs +++ b/src/reverse_complement.rs @@ -10,12 +10,11 @@ extern crate rayon; extern crate memchr; use std::io::{Read, Write}; -use std::{io, ptr, slice}; +use std::{cmp, io, mem}; use std::fs::File; struct Tables { table8: [u8;1 << 8], - table16: [u16;1 << 16] } impl Tables { @@ -24,12 +23,7 @@ impl Tables { for (i, v) in table8.iter_mut().enumerate() { *v = Tables::computed_cpl8(i as u8); } - let mut table16 = [0;1 << 16]; - for (i, v) in table16.iter_mut().enumerate() { - *v = (table8[i & 255] as u16) << 8 | - table8[i >> 8] as u16; - } - Tables { table8: table8, table16: table16 } + Tables { table8: table8 } } fn computed_cpl8(c: u8) -> u8 { @@ -58,85 +52,98 @@ impl Tables { fn cpl8(&self, i: u8) -> u8 { self.table8[i as usize] } +} - /// Retrieves the complement for `i`. - fn cpl16(&self, i: u16) -> u16 { - self.table16[i as usize] +/// Utilities for splitting chunks off of mutable slices. +trait SplitOff { + fn split_off_left(&mut self, n: usize) -> Self; + fn split_off_right(&mut self, n: usize) -> Self; +} +impl<'a, T> SplitOff for &'a mut [T] { + fn split_off_left(&mut self, n: usize) -> Self { + let n = cmp::min(self.len(), n); + let data = mem::replace(self, &mut []); + let (left, data) = data.split_at_mut(n); + *self = data; + left + } + fn split_off_right(&mut self, n: usize) -> Self { + let len = self.len(); + let n = cmp::min(len, n); + let data = mem::replace(self, &mut []); + let (data, right) = data.split_at_mut(len - n); + *self = data; + right } } -/// Length of a normal line without the terminating \n. -const LINE_LEN: usize = 60; -const SEQUENTIAL_SIZE: usize = 1024; +/// Length of a normal line including the terminating \n. +const LINE_LEN: usize = 61; +const SEQUENTIAL_SIZE: usize = 2048; -/// Compute the reverse complement with the sequence split into two equal-sized slices. -fn reverse_complement_left_right(left: &mut [u16], right: &mut [u16], tables: &Tables) { +/// Compute the reverse complement for two contiguous chunks without line breaks. +fn reverse_complement_chunk(left: &mut [u8], right: &mut [u8], tables: &Tables) { + for (x, y) in left.iter_mut().zip(right.iter_mut().rev()) { + let tmp = tables.cpl8(*x); + *x = tables.cpl8(*y); + *y = tmp; + } +} + +/// Compute the reverse complement on chunks from opposite ends of the sequence. +/// +/// `left` must start at the beginning of a line. +fn reverse_complement_left_right(mut left: &mut [u8], mut right: &mut [u8], trailing_len: usize, tables: &Tables) { let len = left.len(); if len <= SEQUENTIAL_SIZE { - assert_eq!(right.len(), len); - for (left, right) in left.iter_mut().zip(right.iter_mut().rev()) { - let tmp = tables.cpl16(*left); - *left = tables.cpl16(*right); - *right = tmp; + while left.len() > 0 || right.len() > 0 { + // Process the chunk up to the newline in `right`. + let mut a = left.split_off_left(trailing_len); + let mut b = right.split_off_right(trailing_len); + + // If there is an odd number of bytes, the extra one will be on the right. + if b.len() > a.len() { + let mid = b.split_off_left(1); + mid[0] = tables.cpl8(mid[0]) + } + reverse_complement_chunk(a, b, tables); + + // Skip the newline in `right`. + right.split_off_right(1); + + // Process the chunk up to the newline in `left`. + let leading_len = LINE_LEN - 1 - trailing_len; + a = left.split_off_left(leading_len); + b = right.split_off_right(leading_len); + + // If there is an odd number of bytes, the extra one will be on the left. + if a.len() > b.len() { + let mid = a.split_off_right(1); + mid[0] = tables.cpl8(mid[0]) + } + reverse_complement_chunk(a, b, tables); + + // Skip the newline in `left`. + left.split_off_left(1); } } else { - let (left1, left2) = left.split_at_mut((len + 1) / 2); - let (right2, right1) = right.split_at_mut(len / 2); - rayon::join(|| reverse_complement_left_right(left1, right1, tables), - || reverse_complement_left_right(left2, right2, tables)); - } -} + let line_count = (len + LINE_LEN - 1) / LINE_LEN; + let mid = line_count / 2 * LINE_LEN; // Split on a whole number of lines. -/// Split a byte slice into two u16-halves with any remainder left in the middle. -fn split_mut_middle_as_u16<'a>(seq: &'a mut [u8]) -> (&'a mut [u16], &'a mut [u8], &'a mut [u16]) { - let len = seq.len(); - let div = len / 4; - let rem = len % 4; - unsafe { - let left_ptr = seq.as_mut_ptr(); - // This is slow if len % 2 != 0 but still faster than bytewise operations. - let right_ptr = left_ptr.offset((div * 2 + rem) as isize); - (slice::from_raw_parts_mut(left_ptr as *mut u16, div), - slice::from_raw_parts_mut(left_ptr.offset((div * 2) as isize), rem), - slice::from_raw_parts_mut(right_ptr as *mut u16, div)) + let left1 = left.split_off_left(mid); + let right1 = right.split_off_right(mid); + rayon::join(|| reverse_complement_left_right(left, right, trailing_len, tables), + || reverse_complement_left_right(left1, right1, trailing_len, tables)); } } /// Compute the reverse complement. fn reverse_complement(seq: &mut [u8], tables: &Tables) { let len = seq.len() - 1; - let seq = &mut seq[..len];// Drop the last newline - - // Move newlines so the reversed text is wrapped correctly. - let off = LINE_LEN - len % (LINE_LEN + 1); - let mut i = LINE_LEN; - while i < len { - unsafe { - ptr::copy(seq.as_ptr().offset((i - off) as isize), - seq.as_mut_ptr().offset((i - off + 1) as isize), off); - *seq.get_unchecked_mut(i - off) = b'\n'; - } - i += LINE_LEN + 1; - } - let (left, middle, right) = split_mut_middle_as_u16(seq); - reverse_complement_left_right(left, right, tables); - - match middle.len() { - 0 => {} - 1 => middle[0] = tables.cpl8(middle[0]), - 2 => { - let tmp = tables.cpl8(middle[0]); - middle[0] = tables.cpl8(middle[1]); - middle[1] = tmp; - }, - 3 => { - middle[1] = tables.cpl8(middle[1]); - let tmp = tables.cpl8(middle[0]); - middle[0] = tables.cpl8(middle[2]); - middle[2] = tmp; - }, - _ => unreachable!() - } + let seq = &mut seq[..len]; // Drop the last newline + let trailing_len = len % LINE_LEN; + let (left, right) = seq.split_at_mut(len / 2); + reverse_complement_left_right(left, right, trailing_len, tables); } fn file_size(f: &mut File) -> io::Result {