Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reverse_complement: Do line wrapping and reverse complement in one step #55

Merged
merged 1 commit into from Mar 6, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
231 changes: 116 additions & 115 deletions src/reverse_complement.rs
Expand Up @@ -2,148 +2,150 @@
// http://benchmarksgame.alioth.debian.org/
//
// contributed by the Rust Project Developers
// contributed by Matt Brubeck
// contributed by Cristi Cobzarenco (@cristicbz)
// contributed by TeXitoi
// contributed by Matt Brubeck

extern crate rayon;
extern crate memchr;

use std::io::{Read, Write};
use std::{io, ptr, slice};
use std::{cmp, io};
use std::fs::File;

struct Tables {
table8: [u8;1 << 8],
table16: [u16;1 << 16]
use std::mem::replace;

/// Lookup table to find the complement of a single FASTA code.
fn build_table() -> [u8; 256] {
let mut table = [0; 256];
for (i, x) in table.iter_mut().enumerate() {
*x = match i as u8 as char {
'A' | 'a' => 'T',
'C' | 'c' => 'G',
'G' | 'g' => 'C',
'T' | 't' => 'A',
'U' | 'u' => 'A',
'M' | 'm' => 'K',
'R' | 'r' => 'Y',
'W' | 'w' => 'W',
'S' | 's' => 'S',
'Y' | 'y' => 'R',
'K' | 'k' => 'M',
'V' | 'v' => 'B',
'H' | 'h' => 'D',
'D' | 'd' => 'H',
'B' | 'b' => 'V',
'N' | 'n' => 'N',
i => i,
} as u8;
}
table
}

impl Tables {
fn new() -> Tables {
let mut table8 = [0;1 << 8];
for (i, v) in table8.iter_mut().enumerate() {
*v = Tables::computed_cpl8(i as u8);
}
let mut table16 = [0;1 << 16];
for (i, v) in table16.iter_mut().enumerate() {
*v = (table8[i & 255] as u16) << 8 |
table8[i >> 8] as u16;
}
Tables { table8: table8, table16: table16 }
/// Utilities for splitting chunks off of slices.
trait SplitOff {
fn split_off_left(&mut self, n: usize) -> Self;
fn split_off_right(&mut self, n: usize) -> Self;
}
impl<'a, T> SplitOff for &'a mut [T] {
/// Split the left `n` items from self and return them as a separate slice.
fn split_off_left(&mut self, n: usize) -> Self {
let n = cmp::min(self.len(), n);
let data = replace(self, &mut []);
let (left, data) = data.split_at_mut(n);
*self = data;
left
}

fn computed_cpl8(c: u8) -> u8 {
match c {
b'A' | b'a' => b'T',
b'C' | b'c' => b'G',
b'G' | b'g' => b'C',
b'T' | b't' => b'A',
b'U' | b'u' => b'A',
b'M' | b'm' => b'K',
b'R' | b'r' => b'Y',
b'W' | b'w' => b'W',
b'S' | b's' => b'S',
b'Y' | b'y' => b'R',
b'K' | b'k' => b'M',
b'V' | b'v' => b'B',
b'H' | b'h' => b'D',
b'D' | b'd' => b'H',
b'B' | b'b' => b'V',
b'N' | b'n' => b'N',
i => i,
}
/// Split the right `n` items from self and return them as a separate slice.
fn split_off_right(&mut self, n: usize) -> Self {
let len = self.len();
let n = cmp::min(len, n);
let data = replace(self, &mut []);
let (data, right) = data.split_at_mut(len - n);
*self = data;
right
}
}

/// Retrieves the complement for `i`.
fn cpl8(&self, i: u8) -> u8 {
self.table8[i as usize]
}
/// Length of a normal line including the terminating \n.
const LINE_LEN: usize = 61;
const SEQUENTIAL_SIZE: usize = 2048;

/// Retrieves the complement for `i`.
fn cpl16(&self, i: u16) -> u16 {
self.table16[i as usize]
/// Compute the reverse complement for two contiguous chunks without line breaks.
fn reverse_complement_chunk(left: &mut [u8], right: &mut [u8], table: &[u8; 256]) {
for (x, y) in left.iter_mut().zip(right.iter_mut().rev()) {
*y = table[replace(x, table[*y as usize]) as usize];
}
}

/// Length of a normal line without the terminating \n.
const LINE_LEN: usize = 60;
const SEQUENTIAL_SIZE: usize = 1024;

/// Compute the reverse complement with the sequence split into two equal-sized slices.
fn reverse_complement_left_right(left: &mut [u16], right: &mut [u16], tables: &Tables) {
/// Compute the reverse complement on chunks from opposite ends of a sequence.
///
/// `left` must start at the beginning of a line. If there are an odd number of bytes, `right`
/// will initially be 1 byte longer than `left`; otherwise they will have equal lengths.
fn reverse_complement_left_right(mut left: &mut [u8],
mut right: &mut [u8],
trailing_len: usize,
table: &[u8; 256]) {
let len = left.len();
if len <= SEQUENTIAL_SIZE {
assert_eq!(right.len(), len);
for (left, right) in left.iter_mut().zip(right.iter_mut().rev()) {
let tmp = tables.cpl16(*left);
*left = tables.cpl16(*right);
*right = tmp;
// Each iteration swaps one line from the start of the sequence with one from the end.
while left.len() > 0 || right.len() > 0 {
// Get the chunk up to the newline in `right`.
let mut a = left.split_off_left(trailing_len);
let mut b = right.split_off_right(trailing_len);

// If we've reached the middle of the sequence here and there is an odd number of
// bytes remaining, the odd one will be on the right.
if b.len() > a.len() {
let mid = b.split_off_left(1);
mid[0] = table[mid[0] as usize];
}
reverse_complement_chunk(a, b, table);

// Skip the newline in `right`.
right.split_off_right(1);

// Get the chunk up to the newline in `left`.
let n = LINE_LEN - 1 - trailing_len;
a = left.split_off_left(n);
b = right.split_off_right(n);

// If we've reached the middle of the sequence here and there is an odd number of
// bytes remaining, the odd one will now be on the left.
if a.len() > b.len() {
let mid = a.split_off_right(1);
mid[0] = table[mid[0] as usize]
}
reverse_complement_chunk(a, b, table);

// Skip the newline in `left`.
left.split_off_left(1);
}
} else {
let (left1, left2) = left.split_at_mut((len + 1) / 2);
let (right2, right1) = right.split_at_mut(len / 2);
rayon::join(|| reverse_complement_left_right(left1, right1, tables),
|| reverse_complement_left_right(left2, right2, tables));
}
}
let line_count = len / LINE_LEN;
let mid = line_count / 2 * LINE_LEN; // Split on a whole number of lines.

/// Split a byte slice into two u16-halves with any remainder left in the middle.
fn split_mut_middle_as_u16<'a>(seq: &'a mut [u8]) -> (&'a mut [u16], &'a mut [u8], &'a mut [u16]) {
let len = seq.len();
let div = len / 4;
let rem = len % 4;
unsafe {
let left_ptr = seq.as_mut_ptr();
// This is slow if len % 2 != 0 but still faster than bytewise operations.
let right_ptr = left_ptr.offset((div * 2 + rem) as isize);
(slice::from_raw_parts_mut(left_ptr as *mut u16, div),
slice::from_raw_parts_mut(left_ptr.offset((div * 2) as isize), rem),
slice::from_raw_parts_mut(right_ptr as *mut u16, div))
let left1 = left.split_off_left(mid);
let right1 = right.split_off_right(mid);
rayon::join(|| reverse_complement_left_right(left, right, trailing_len, table),
|| reverse_complement_left_right(left1, right1, trailing_len, table));
}
}

/// Compute the reverse complement.
fn reverse_complement(seq: &mut [u8], tables: &Tables) {
/// Compute the reverse complement of one sequence.
fn reverse_complement(seq: &mut [u8], table: &[u8; 256]) {
let len = seq.len() - 1;
let seq = &mut seq[..len];// Drop the last newline

// Move newlines so the reversed text is wrapped correctly.
let off = LINE_LEN - len % (LINE_LEN + 1);
let mut i = LINE_LEN;
while i < len {
unsafe {
ptr::copy(seq.as_ptr().offset((i - off) as isize),
seq.as_mut_ptr().offset((i - off + 1) as isize), off);
*seq.get_unchecked_mut(i - off) = b'\n';
}
i += LINE_LEN + 1;
}
let (left, middle, right) = split_mut_middle_as_u16(seq);
reverse_complement_left_right(left, right, tables);

match middle.len() {
0 => {}
1 => middle[0] = tables.cpl8(middle[0]),
2 => {
let tmp = tables.cpl8(middle[0]);
middle[0] = tables.cpl8(middle[1]);
middle[1] = tmp;
},
3 => {
middle[1] = tables.cpl8(middle[1]);
let tmp = tables.cpl8(middle[0]);
middle[0] = tables.cpl8(middle[2]);
middle[2] = tmp;
},
_ => unreachable!()
}
let seq = &mut seq[..len]; // Drop the last newline
let trailing_len = len % LINE_LEN;
let (left, right) = seq.split_at_mut(len / 2);
reverse_complement_left_right(left, right, trailing_len, table);
}

fn file_size(f: &mut File) -> io::Result<usize> {
Ok(f.metadata()?.len() as usize)
}

fn split_and_reverse<'a>(data: &mut [u8], tables: &Tables) {
/// Locate each DNA sequence in the input file and reverse it.
fn split_and_reverse<'a>(data: &mut [u8], table: &[u8; 256]) {
let data = match memchr::memchr(b'\n', data) {
Some(i) => &mut data[i + 1..],
None => return,
Expand All @@ -152,10 +154,10 @@ fn split_and_reverse<'a>(data: &mut [u8], tables: &Tables) {
match memchr::memchr(b'>', data) {
Some(i) => {
let (head, tail) = data.split_at_mut(i);
rayon::join(|| reverse_complement(head, tables),
|| split_and_reverse(tail, tables));
rayon::join(|| reverse_complement(head, table),
|| split_and_reverse(tail, table));
}
None => reverse_complement(data, tables),
None => reverse_complement(data, table),
};
}

Expand All @@ -164,9 +166,8 @@ fn main() {
let size = file_size(&mut stdin).unwrap_or(1024 * 1024);
let mut data = Vec::with_capacity(size + 1);
stdin.read_to_end(&mut data).unwrap();
let tables = &Tables::new();

split_and_reverse(&mut data, tables);
split_and_reverse(&mut data, &build_table());
let stdout = io::stdout();
stdout.lock().write_all(&data).unwrap();
}