Skip to content

Commit

Permalink
reverse_complement: Do line wrapping and reverse complement in one step
Browse files Browse the repository at this point in the history
  • Loading branch information
mbrubeck committed Mar 6, 2017
1 parent 58472e7 commit 1135a92
Showing 1 changed file with 116 additions and 115 deletions.
231 changes: 116 additions & 115 deletions src/reverse_complement.rs
Expand Up @@ -2,148 +2,150 @@
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by Matt Brubeck
// contributed by Cristi Cobzarenco (@cristicbz)
// contributed by TeXitoi
// contributed by Matt Brubeck

extern crate rayon;
extern crate memchr;

use std::io::{Read, Write};
use std::{io, ptr, slice};
use std::{cmp, io};
use std::fs::File;

struct Tables {
table8: [u8;1 << 8],
table16: [u16;1 << 16]
use std::mem::replace;

/// Lookup table to find the complement of a single FASTA code.
fn build_table() -> [u8; 256] {
let mut table = [0; 256];
for (i, x) in table.iter_mut().enumerate() {
*x = match i as u8 as char {
'A' | 'a' => 'T',
'C' | 'c' => 'G',
'G' | 'g' => 'C',
'T' | 't' => 'A',
'U' | 'u' => 'A',
'M' | 'm' => 'K',
'R' | 'r' => 'Y',
'W' | 'w' => 'W',
'S' | 's' => 'S',
'Y' | 'y' => 'R',
'K' | 'k' => 'M',
'V' | 'v' => 'B',
'H' | 'h' => 'D',
'D' | 'd' => 'H',
'B' | 'b' => 'V',
'N' | 'n' => 'N',
i => i,
} as u8;
}
table
}

impl Tables {
fn new() -> Tables {
let mut table8 = [0;1 << 8];
for (i, v) in table8.iter_mut().enumerate() {
*v = Tables::computed_cpl8(i as u8);
}
let mut table16 = [0;1 << 16];
for (i, v) in table16.iter_mut().enumerate() {
*v = (table8[i & 255] as u16) << 8 |
table8[i >> 8] as u16;
}
Tables { table8: table8, table16: table16 }
/// Utilities for splitting chunks off of slices.
trait SplitOff {
fn split_off_left(&mut self, n: usize) -> Self;
fn split_off_right(&mut self, n: usize) -> Self;
}
impl<'a, T> SplitOff for &'a mut [T] {
/// Split the left `n` items from self and return them as a separate slice.
fn split_off_left(&mut self, n: usize) -> Self {
let n = cmp::min(self.len(), n);
let data = replace(self, &mut []);
let (left, data) = data.split_at_mut(n);
*self = data;
left
}

fn computed_cpl8(c: u8) -> u8 {
match c {
b'A' | b'a' => b'T',
b'C' | b'c' => b'G',
b'G' | b'g' => b'C',
b'T' | b't' => b'A',
b'U' | b'u' => b'A',
b'M' | b'm' => b'K',
b'R' | b'r' => b'Y',
b'W' | b'w' => b'W',
b'S' | b's' => b'S',
b'Y' | b'y' => b'R',
b'K' | b'k' => b'M',
b'V' | b'v' => b'B',
b'H' | b'h' => b'D',
b'D' | b'd' => b'H',
b'B' | b'b' => b'V',
b'N' | b'n' => b'N',
i => i,
}
/// Split the right `n` items from self and return them as a separate slice.
fn split_off_right(&mut self, n: usize) -> Self {
let len = self.len();
let n = cmp::min(len, n);
let data = replace(self, &mut []);
let (data, right) = data.split_at_mut(len - n);
*self = data;
right
}
}

/// Retrieves the complement for `i`.
fn cpl8(&self, i: u8) -> u8 {
self.table8[i as usize]
}
/// Length of a normal line including the terminating \n.
const LINE_LEN: usize = 61;
const SEQUENTIAL_SIZE: usize = 2048;

/// Retrieves the complement for `i`.
fn cpl16(&self, i: u16) -> u16 {
self.table16[i as usize]
/// Compute the reverse complement for two contiguous chunks without line breaks.
fn reverse_complement_chunk(left: &mut [u8], right: &mut [u8], table: &[u8; 256]) {
for (x, y) in left.iter_mut().zip(right.iter_mut().rev()) {
*y = table[replace(x, table[*y as usize]) as usize];
}
}

/// Length of a normal line without the terminating \n.
const LINE_LEN: usize = 60;
const SEQUENTIAL_SIZE: usize = 1024;

/// Compute the reverse complement with the sequence split into two equal-sized slices.
fn reverse_complement_left_right(left: &mut [u16], right: &mut [u16], tables: &Tables) {
/// Compute the reverse complement on chunks from opposite ends of a sequence.
///
/// `left` must start at the beginning of a line. If there are an odd number of bytes, `right`
/// will initially be 1 byte longer than `left`; otherwise they will have equal lengths.
fn reverse_complement_left_right(mut left: &mut [u8],
mut right: &mut [u8],
trailing_len: usize,
table: &[u8; 256]) {
let len = left.len();
if len <= SEQUENTIAL_SIZE {
assert_eq!(right.len(), len);
for (left, right) in left.iter_mut().zip(right.iter_mut().rev()) {
let tmp = tables.cpl16(*left);
*left = tables.cpl16(*right);
*right = tmp;
// Each iteration swaps one line from the start of the sequence with one from the end.
while left.len() > 0 || right.len() > 0 {
// Get the chunk up to the newline in `right`.
let mut a = left.split_off_left(trailing_len);
let mut b = right.split_off_right(trailing_len);

// If we've reached the middle of the sequence here and there is an odd number of
// bytes remaining, the odd one will be on the right.
if b.len() > a.len() {
let mid = b.split_off_left(1);
mid[0] = table[mid[0] as usize];
}
reverse_complement_chunk(a, b, table);

// Skip the newline in `right`.
right.split_off_right(1);

// Get the chunk up to the newline in `left`.
let n = LINE_LEN - 1 - trailing_len;
a = left.split_off_left(n);
b = right.split_off_right(n);

// If we've reached the middle of the sequence here and there is an odd number of
// bytes remaining, the odd one will now be on the left.
if a.len() > b.len() {
let mid = a.split_off_right(1);
mid[0] = table[mid[0] as usize]
}
reverse_complement_chunk(a, b, table);

// Skip the newline in `left`.
left.split_off_left(1);
}
} else {
let (left1, left2) = left.split_at_mut((len + 1) / 2);
let (right2, right1) = right.split_at_mut(len / 2);
rayon::join(|| reverse_complement_left_right(left1, right1, tables),
|| reverse_complement_left_right(left2, right2, tables));
}
}
let line_count = len / LINE_LEN;
let mid = line_count / 2 * LINE_LEN; // Split on a whole number of lines.

/// Split a byte slice into two u16-halves with any remainder left in the middle.
fn split_mut_middle_as_u16<'a>(seq: &'a mut [u8]) -> (&'a mut [u16], &'a mut [u8], &'a mut [u16]) {
let len = seq.len();
let div = len / 4;
let rem = len % 4;
unsafe {
let left_ptr = seq.as_mut_ptr();
// This is slow if len % 2 != 0 but still faster than bytewise operations.
let right_ptr = left_ptr.offset((div * 2 + rem) as isize);
(slice::from_raw_parts_mut(left_ptr as *mut u16, div),
slice::from_raw_parts_mut(left_ptr.offset((div * 2) as isize), rem),
slice::from_raw_parts_mut(right_ptr as *mut u16, div))
let left1 = left.split_off_left(mid);
let right1 = right.split_off_right(mid);
rayon::join(|| reverse_complement_left_right(left, right, trailing_len, table),
|| reverse_complement_left_right(left1, right1, trailing_len, table));
}
}

/// Compute the reverse complement.
fn reverse_complement(seq: &mut [u8], tables: &Tables) {
/// Compute the reverse complement of one sequence.
fn reverse_complement(seq: &mut [u8], table: &[u8; 256]) {
let len = seq.len() - 1;
let seq = &mut seq[..len];// Drop the last newline

// Move newlines so the reversed text is wrapped correctly.
let off = LINE_LEN - len % (LINE_LEN + 1);
let mut i = LINE_LEN;
while i < len {
unsafe {
ptr::copy(seq.as_ptr().offset((i - off) as isize),
seq.as_mut_ptr().offset((i - off + 1) as isize), off);
*seq.get_unchecked_mut(i - off) = b'\n';
}
i += LINE_LEN + 1;
}
let (left, middle, right) = split_mut_middle_as_u16(seq);
reverse_complement_left_right(left, right, tables);

match middle.len() {
0 => {}
1 => middle[0] = tables.cpl8(middle[0]),
2 => {
let tmp = tables.cpl8(middle[0]);
middle[0] = tables.cpl8(middle[1]);
middle[1] = tmp;
},
3 => {
middle[1] = tables.cpl8(middle[1]);
let tmp = tables.cpl8(middle[0]);
middle[0] = tables.cpl8(middle[2]);
middle[2] = tmp;
},
_ => unreachable!()
}
let seq = &mut seq[..len]; // Drop the last newline
let trailing_len = len % LINE_LEN;
let (left, right) = seq.split_at_mut(len / 2);
reverse_complement_left_right(left, right, trailing_len, table);
}

fn file_size(f: &mut File) -> io::Result<usize> {
Ok(f.metadata()?.len() as usize)
}

fn split_and_reverse<'a>(data: &mut [u8], tables: &Tables) {
/// Locate each DNA sequence in the input file and reverse it.
fn split_and_reverse<'a>(data: &mut [u8], table: &[u8; 256]) {
let data = match memchr::memchr(b'\n', data) {
Some(i) => &mut data[i + 1..],
None => return,
Expand All @@ -152,10 +154,10 @@ fn split_and_reverse<'a>(data: &mut [u8], tables: &Tables) {
match memchr::memchr(b'>', data) {
Some(i) => {
let (head, tail) = data.split_at_mut(i);
rayon::join(|| reverse_complement(head, tables),
|| split_and_reverse(tail, tables));
rayon::join(|| reverse_complement(head, table),
|| split_and_reverse(tail, table));
}
None => reverse_complement(data, tables),
None => reverse_complement(data, table),
};
}

Expand All @@ -164,9 +166,8 @@ fn main() {
let size = file_size(&mut stdin).unwrap_or(1024 * 1024);
let mut data = Vec::with_capacity(size + 1);
stdin.read_to_end(&mut data).unwrap();
let tables = &Tables::new();

split_and_reverse(&mut data, tables);
split_and_reverse(&mut data, &build_table());
let stdout = io::stdout();
stdout.lock().write_all(&data).unwrap();
}

0 comments on commit 1135a92

Please sign in to comment.