Skip to content

Commit

Permalink
Merge pull request #40 from sanger-pathogens/pipeline
Browse files Browse the repository at this point in the history
Pathogens changes
  • Loading branch information
sb10 committed Jun 14, 2012
2 parents f5e6af1 + b391a2b commit ccc1ca9
Show file tree
Hide file tree
Showing 83 changed files with 9,048 additions and 133 deletions.
1,318 changes: 1,318 additions & 0 deletions modules/Bio/Tools/NonStandardGFF.pm

Large diffs are not rendered by default.

40 changes: 34 additions & 6 deletions modules/LSF.pm
Expand Up @@ -235,13 +235,18 @@ sub adjust_bsub_options
}

if (defined $mem) {
# at some point an attempt to run this failed due to MEMLIMIT, using
# $mem max memory; increase by 1000MB
$mem += 1000;
# at some point an attempt to run this failed due to MEMLIMIT
$mem = calculate_memory_limit($mem);

if ( !defined $mem_limit ) { $mem_limit = 15_900; }
if ( $mem>$mem_limit ) {
Utils::error(sprintf "FIXME: Increase memory_limit, more than %.1fGB of memory is required.", $mem_limit/1000);
if ($mem>500000) {
Utils::error("FIXME: This job cannot be run on the farm, more than 500GB of memory is required.");
}
elsif($mem>30000)
{
warn("$output_file: changing queue to hugemem\n");
$opts =~ s/-q normal/-q hugemem/;
$opts =~ s/-q long/-q hugemem/;
if ( !($opts=~/-q/) ) { $opts .= ' -q hugemem'; }
}

# adjust the command line to include higher memory reservation, but only
Expand Down Expand Up @@ -281,6 +286,29 @@ sub adjust_bsub_options
}


=head2 calculate_memory_limit
Arg [1] : memory in mega bytes for previously failed job
Description : Increases the memory by a set minimum or by a percentage, which ever is greater
Returntype : Int
=cut
sub calculate_memory_limit
{
my ($mem) = @_;
my $minimum_memory_increase = 1000;
my $memory_increase_percentage = 0.3;

my $updated_memory_limit = $mem*(1+$memory_increase_percentage);
if($updated_memory_limit < $mem + $minimum_memory_increase)
{
$updated_memory_limit = $mem + $minimum_memory_increase;
}

return int($updated_memory_limit);
}


=head2 run
Arg [1] : lock file where to put the JID (opened in append mode)
Expand Down
94 changes: 94 additions & 0 deletions modules/Pathogens/Import/CompressAndValidate.pm
@@ -0,0 +1,94 @@
=head1 NAME
=head1 SYNOPSIS
use Pathogens::Import::CompressAndValidate;
my $validator = Pathogens::Import::CompressAndValidate->new( irods_filename => $irods, fastq_filenames => \@fastqs );
$validator->is_compressed_and_validated() || $die("Compress and validate failed\n");
=head1 DESCRIPTION
Compress fastq file, add md5 checksum files and check number of reads against iRODS.
=cut

package Pathogens::Import::CompressAndValidate;
use Moose;
use Utils;
use VertRes::Wrapper::fastqcheck;
use Pathogens::Import::ValidateFastqConversion;

has 'irods_filename' => ( is => 'ro', isa => 'Str', required => 1);
has 'fastq_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1);

sub _compress_and_checksum
{
my($self) = @_;

for my $fastq (@{$self->{fastq_filenames}})
{
# Compress fastq
Utils::CMD(qq[gzip -9 -c $fastq > $fastq.gz]);

# Checksum fastqs
Utils::CMD(qq[md5sum $fastq > $fastq.md5]);
Utils::CMD(qq[md5sum $fastq.gz > $fastq.gz.md5]);
}
}

sub _fastqcheck
{
my($self) = @_;

for my $fastq (@{$self->{fastq_filenames}})
{
# Generate fastqcheckfile
my $fastqcheck = VertRes::Wrapper::fastqcheck->new();
$fastqcheck->run($fastq.'.gz', $fastq.'.gz.fastqcheck.tmp');

$fastqcheck->run_status >= 1 || return 0;
}
return 1;
}

sub _confirm_valid
{
my($self) = @_;
my @fastqcheck_tmp = (); # fastqcheck.tmp files.

for my $fastq (@{$self->{fastq_filenames}})
{
push @fastqcheck_tmp, $fastq.'.gz.fastqcheck.tmp';
}

# Validate against iRODS
my $validate = Pathogens::Import::ValidateFastqConversion->new(
fastqcheck_filenames => \@fastqcheck_tmp,
irods_filename => $self->irods_filename
);
return $validate->is_total_reads_valid(); # Validation check
}

sub is_compressed_and_validated
{
my($self) = @_;

# Compress and checksum
$self->_compress_and_checksum();

# Generate temp fastqcheck files
$self->_fastqcheck() || return 0;

# Check reads from fastqs match reads from iRODS.
$self->_confirm_valid() || return 0;

# Rename temp fastqcheck files if valid.
for my $fastq (@{$self->{fastq_filenames}})
{
Utils::CMD(qq[mv $fastq.gz.fastqcheck.tmp $fastq.gz.fastqcheck]);
}

return 1;
}

1;
70 changes: 70 additions & 0 deletions modules/Pathogens/Import/ValidateFastqConversion.pm
@@ -0,0 +1,70 @@
=head1 NAME
ValidateFastqConversion.pm - Checks imported reads math reads from iRODS.
=head1 SYNOPSIS
use Pathogens::Import::ValidateFastqConversion;
my $validate = Pathogens::Import::ValidateFastqConversion->new(
fastqcheck_filenames => ['1234_5_6_1.fastq.gz.fastqcheck','1234_5_6_2.fastq.gz.fastqcheck'],
irods_filename => '1234_5#6.bam'
);
$validate->is_total_reads_valid();
=cut
package Pathogens::Import::ValidateFastqConversion;

use Moose;
use VertRes::Parser::fastqcheck;
use VertRes::Wrapper::iRODS;

has 'fastqcheck_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1);
has 'irods_filename' => ( is => 'ro', isa => 'Str', required => 1);
has '_fastq_totalreads' => ( is => 'ro', isa => 'ArrayRef', lazy_build => 1);

sub _build__fastq_totalreads
{
my ($self) = @_;
my @totalreads;
for my $filename (@{$self->fastqcheck_filenames})
{
my $parser = VertRes::Parser::fastqcheck->new(file => $filename);
my $readcount = $parser->num_sequences() || 0;
push @totalreads, $readcount;
}
return \@totalreads;
}

sub _sum_fastq_reads
{
my ($self) = @_;
my $sum = 0;
for my $total_reads (@{$self->_fastq_totalreads})
{
$sum += $total_reads;
}
return $sum;
}

sub _sum_irods_reads
{
my ($self) = @_;
my $irods = VertRes::Wrapper::iRODS->new();
my $ifile = $irods->find_file_by_name($self->irods_filename);
my $readcount = $irods->get_total_reads($ifile) || 0;
return $readcount;
}

sub is_total_reads_valid
{
my ($self) = @_;
if($self->_sum_irods_reads == $self->_sum_fastq_reads)
{
return 1;
}
return 0;
}



1;
146 changes: 146 additions & 0 deletions modules/Pathogens/RNASeq/AlignmentSlice.pm
@@ -0,0 +1,146 @@
=head1 NAME
AlignmentSlice.pm - Extract a slice of reads for a sequence file within a specific region
=head1 SYNOPSIS
use Pathogens::RNASeq::AlignmentSlice;
my $alignment_slice = Pathogens::RNASeq::AlignmentSlice->new(
filename => '/abc/my_file.bam',
window_margin => 10,
total_mapped_reads => 1234,
);
my %rpkm_values = $alignment_slice->rpkm_values;
$rpkm_values{rpkm_sense};
$rpkm_values{rpkm_antisense};
$rpkm_values{mapped_reads_sense};
$rpkm_values{mapped_reads_antisense};
$rpkm_values{mapped_reads_forward};
$rpkm_values{mapped_reads_reverse};
=cut
package Pathogens::RNASeq::AlignmentSlice;
use Moose;
use Pathogens::RNASeq::Exceptions;
use Pathogens::RNASeq::Read;

# required inputs
has 'filename' => ( is => 'rw', isa => 'Str', required => 1 );
has 'feature' => ( is => 'rw', required => 1 );
has 'total_mapped_reads' => ( is => 'rw', isa => 'Int', required => 1 );
#optional input
has 'samtools_exec' => ( is => 'rw', isa => 'Str', default => "samtools" );
has 'protocol' => ( is => 'rw', isa => 'Str', default => 'StandardProtocol' );
has 'window_margin' => ( is => 'rw', isa => 'Int', default => 50 );
has 'filters' => ( is => 'rw', isa => 'Maybe[HashRef]' );
has '_input_slice_filename' => ( is => 'rw', isa => 'Str'); # allow for testing and for using VR samtools view output file
# output variable
has 'rpkm_values' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );

# internal variables
has '_slice_file_handle' => ( is => 'rw', lazy_build => 1 );
has '_window_start' => ( is => 'rw', isa => 'Int', lazy_build => 1 );
has '_window_end' => ( is => 'rw', isa => 'Int', lazy_build => 1 );
has '_read_protocol_class' => ( is => 'rw', lazy_build => 1 );

sub _build__window_start
{
my ($self) = @_;
my $window_start = $self->feature->gene_start - $self->window_margin;
$window_start = $window_start < 1 ? 1 : $window_start;
return $window_start;
}

sub _build__window_end
{
my ($self) = @_;
$self->feature->gene_end + $self->window_margin;
}

sub _build__slice_file_handle
{
my ($self) = @_;
my $slice_file_handle;
open($slice_file_handle, $self->_slice_stream ) || Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice->throw( error => "Cant view slice for ".$self->filename." ".$self->_window_start." " .$self->_window_end );
return $slice_file_handle;
}

sub _slice_stream
{
my ($self) = @_;
if($self->_input_slice_filename)
{
return $self->_input_slice_filename;
}
else
{
return $self->samtools_exec." view ".$self->filename." ".$self->feature->seq_id.":".$self->_window_start."-".$self->_window_end." |";
}
}

sub _build__read_protocol_class
{
my ($self) = @_;
my $read_protocol_class = "Pathogens::RNASeq::".$self->protocol."::Read";
eval("use $read_protocol_class");
return $read_protocol_class;
}


sub _build_rpkm_values
{
my ($self) = @_;
my %rpkm_values;

$rpkm_values{mapped_reads_sense} = 0;
$rpkm_values{mapped_reads_antisense} = 0;
$rpkm_values{mapped_reads_forward} = 0;
$rpkm_values{mapped_reads_reverse} = 0;

my $file_handle = $self->_slice_file_handle;

while(my $line = <$file_handle>)
{
my $sequence_reads = $self->_read_protocol_class->new(
alignment_line => $line,
exons => $self->feature->exons,
gene_strand => $self->feature->gene_strand,
filters => $self->filters
);
my $mapped_reads = $sequence_reads->mapped_reads;

$rpkm_values{mapped_reads_sense} += $mapped_reads->{sense};
$rpkm_values{mapped_reads_antisense} += $mapped_reads->{antisense};

if($sequence_reads->read_strand == 0)
{
$rpkm_values{mapped_reads_forward} += $mapped_reads->{sense};
$rpkm_values{mapped_reads_reverse} += $mapped_reads->{antisense};
}
else
{
$rpkm_values{mapped_reads_forward} += $mapped_reads->{antisense};
$rpkm_values{mapped_reads_reverse} += $mapped_reads->{sense};
}
}

$rpkm_values{rpkm_sense} = $self->_calculate_rpkm($rpkm_values{mapped_reads_sense});
$rpkm_values{rpkm_antisense} = $self->_calculate_rpkm($rpkm_values{mapped_reads_antisense});

return \%rpkm_values;
}

sub _calculate_rpkm
{
my ($self, $mapped_reads) = @_;
#my $rpkm = $mapped_reads / ( ($self->feature->exon_length/1000) * ($self->total_mapped_reads/1000000) );
# same equation, rewritten
my $rpkm = ($mapped_reads / $self->feature->exon_length) * (1000000000/$self->total_mapped_reads);


return $rpkm;
}


1;

0 comments on commit ccc1ca9

Please sign in to comment.