Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #40 from sanger-pathogens/pipeline
Pathogens changes
- Loading branch information
Showing
83 changed files
with
9,048 additions
and
133 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
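=head1 NAME | ||
CompressAndValidate.pm - Compress fastq files, write md5 checksums and validate read counts against iRODS. | ||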
=head1 SYNOPSIS | ||
use Pathogens::Import::CompressAndValidate; | ||
my $validator = Pathogens::Import::CompressAndValidate->new( irods_filename => $irods, fastq_filenames => \@fastqs ); | ||
$validator->is_compressed_and_validated() || die("Compress and validate failed\n"); | ||
=head1 DESCRIPTION | ||
Compress fastq file, add md5 checksum files and check number of reads against iRODS. | ||
=cut | ||
|
||
package Pathogens::Import::CompressAndValidate; | ||
use Moose; | ||
use Utils; | ||
use VertRes::Wrapper::fastqcheck; | ||
use Pathogens::Import::ValidateFastqConversion; | ||
|
||
# Name of the iRODS file handed to ValidateFastqConversion for the read-count check.
has 'irods_filename' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Paths of the uncompressed fastq files to compress, checksum and validate.
has 'fastq_filenames' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);
|
||
# Compress every fastq and write md5 checksum files for both forms.
#
# For each path in fastq_filenames the following are run through Utils::CMD:
#   gzip -9 -c FILE > FILE.gz
#   md5sum FILE     > FILE.md5
#   md5sum FILE.gz  > FILE.gz.md5
# gzip writes to stdout (-c), so the uncompressed fastq is left in place.
#
# Every path is single-quoted before being placed in the command string, so
# a name containing spaces or shell metacharacters is passed as one literal
# argument instead of being split or interpreted. (The redirections already
# require the command string to be interpreted by a shell.)
sub _compress_and_checksum
{
    my ($self) = @_;

    for my $fastq (@{ $self->fastq_filenames })
    {
        # POSIX single-quote escaping: close the quote, emit \', reopen.
        (my $escaped = $fastq) =~ s/'/'\\''/g;
        my $plain = "'$escaped'";
        my $gz    = "'$escaped.gz'";

        # Compress fastq
        Utils::CMD(qq[gzip -9 -c $plain > $gz]);

        # Checksum fastqs
        Utils::CMD(qq[md5sum $plain > '$escaped.md5']);
        Utils::CMD(qq[md5sum $gz > '$escaped.gz.md5']);
    }
}
|
||
# Produce a temporary fastqcheck report for each compressed fastq.
#
# For every entry FILE in fastq_filenames a new VertRes::Wrapper::fastqcheck
# is run on FILE.gz, writing to FILE.gz.fastqcheck.tmp.
#
# Returns 0 as soon as a wrapper reports a run_status below 1 (remaining
# files are not processed), otherwise 1 once all files are done.
sub _fastqcheck
{
    my ($self) = @_;

    foreach my $fastq_path (@{$self->{fastq_filenames}})
    {
        my $checker = VertRes::Wrapper::fastqcheck->new();
        $checker->run($fastq_path.'.gz', $fastq_path.'.gz.fastqcheck.tmp');

        unless ($checker->run_status >= 1)
        {
            return 0;
        }
    }

    return 1;
}
|
||
# Check the read totals in the temporary fastqcheck reports against iRODS.
#
# Builds the list of FILE.gz.fastqcheck.tmp names (one per entry in
# fastq_filenames) and hands it, together with irods_filename, to
# Pathogens::Import::ValidateFastqConversion.
#
# Returns whatever is_total_reads_valid() returns.
sub _confirm_valid
{
    my ($self) = @_;

    # fastqcheck.tmp files, in the same order as the fastqs.
    my @tmp_reports = map { $_.'.gz.fastqcheck.tmp' } @{$self->{fastq_filenames}};

    # Validate against iRODS
    my $checker = Pathogens::Import::ValidateFastqConversion->new(
        fastqcheck_filenames => \@tmp_reports,
        irods_filename       => $self->irods_filename
    );

    return $checker->is_total_reads_valid();
}
|
||
# Public entry point: compress, checksum and validate the fastq files.
#
# Steps, in order:
#   1. _compress_and_checksum - gzip each fastq and write md5 files.
#   2. _fastqcheck            - write FILE.gz.fastqcheck.tmp for each fastq.
#   3. _confirm_valid         - compare read totals with iRODS.
#   4. Rename each FILE.gz.fastqcheck.tmp to FILE.gz.fastqcheck.
#
# Returns 1 when every step succeeds. Returns 0 if step 2 or step 3 fails;
# in that case the temporary fastqcheck files are not renamed.
#
# Paths are single-quoted in the mv command so that names containing spaces
# or shell metacharacters are treated as one literal argument.
sub is_compressed_and_validated
{
    my ($self) = @_;

    # Compress and checksum
    $self->_compress_and_checksum();

    # Generate temp fastqcheck files
    return 0 unless $self->_fastqcheck();

    # Check reads from fastqs match reads from iRODS.
    return 0 unless $self->_confirm_valid();

    # Rename temp fastqcheck files if valid.
    for my $fastq (@{ $self->fastq_filenames })
    {
        # POSIX single-quote escaping: close the quote, emit \', reopen.
        (my $escaped = $fastq) =~ s/'/'\\''/g;
        Utils::CMD(qq[mv '$escaped.gz.fastqcheck.tmp' '$escaped.gz.fastqcheck']);
    }

    return 1;
}
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
=head1 NAME | ||
ValidateFastqConversion.pm - Checks that the reads imported into fastq files match the reads from iRODS. | ||
=head1 SYNOPSIS | ||
use Pathogens::Import::ValidateFastqConversion; | ||
my $validate = Pathogens::Import::ValidateFastqConversion->new( | ||
fastqcheck_filenames => ['1234_5_6_1.fastq.gz.fastqcheck','1234_5_6_2.fastq.gz.fastqcheck'], | ||
irods_filename => '1234_5#6.bam' | ||
); | ||
$validate->is_total_reads_valid(); | ||
=cut | ||
package Pathogens::Import::ValidateFastqConversion; | ||
|
||
use Moose; | ||
use VertRes::Parser::fastqcheck; | ||
use VertRes::Wrapper::iRODS; | ||
|
||
# fastqcheck report files whose sequence counts are summed.
has 'fastqcheck_filenames' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);

# Name used to look the source file up in iRODS.
has 'irods_filename' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Per-file read counts, built on first use by _build__fastq_totalreads.
has '_fastq_totalreads' => (
    is         => 'ro',
    isa        => 'ArrayRef',
    lazy_build => 1,
);
|
||
# Builder for _fastq_totalreads.
#
# Parses each file in fastqcheck_filenames with VertRes::Parser::fastqcheck
# and records its num_sequences(), substituting 0 when that value is false.
#
# Returns an array reference of counts, in the same order as the filenames.
sub _build__fastq_totalreads
{
    my ($self) = @_;

    my @counts;
    foreach my $report (@{$self->fastqcheck_filenames})
    {
        my $report_parser = VertRes::Parser::fastqcheck->new(file => $report);
        push @counts, ($report_parser->num_sequences() || 0);
    }

    return \@counts;
}
|
||
# Total number of reads across all fastqcheck files.
#
# Returns the sum of the entries in _fastq_totalreads (0 for an empty list).
sub _sum_fastq_reads
{
    my ($self) = @_;

    my $grand_total = 0;
    $grand_total += $_ for @{$self->_fastq_totalreads};

    return $grand_total;
}
|
||
# Read count recorded in iRODS for irods_filename.
#
# Looks the file up with VertRes::Wrapper::iRODS::find_file_by_name and asks
# the same wrapper for get_total_reads on the result.
#
# Returns that count, or 0 when the wrapper returns a false value.
sub _sum_irods_reads
{
    my ($self) = @_;

    my $wrapper    = VertRes::Wrapper::iRODS->new();
    my $irods_path = $wrapper->find_file_by_name($self->irods_filename);

    return $wrapper->get_total_reads($irods_path) || 0;
}
|
||
# Compare the iRODS read count with the summed fastq read count.
#
# Returns 1 when the two totals are numerically equal, 0 otherwise.
sub is_total_reads_valid
{
    my ($self) = @_;

    return ($self->_sum_irods_reads == $self->_sum_fastq_reads) ? 1 : 0;
}
|
||
|
||
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
=head1 NAME | ||
AlignmentSlice.pm - Extract a slice of reads for a sequence file within a specific region | ||
=head1 SYNOPSIS | ||
use Pathogens::RNASeq::AlignmentSlice; | ||
my $alignment_slice = Pathogens::RNASeq::AlignmentSlice->new( | ||
filename => '/abc/my_file.bam', | ||
feature => $feature, | ||
window_margin => 10, | ||
total_mapped_reads => 1234, | ||
); | ||
my $rpkm_values = $alignment_slice->rpkm_values; | ||
$rpkm_values->{rpkm_sense}; | ||
$rpkm_values->{rpkm_antisense}; | ||
$rpkm_values->{mapped_reads_sense}; | ||
$rpkm_values->{mapped_reads_antisense}; | ||
$rpkm_values->{mapped_reads_forward}; | ||
$rpkm_values->{mapped_reads_reverse}; | ||
=cut | ||
package Pathogens::RNASeq::AlignmentSlice; | ||
use Moose; | ||
use Pathogens::RNASeq::Exceptions; | ||
use Pathogens::RNASeq::Read; | ||
|
||
# required inputs | ||
# required inputs

# Alignment file passed to "samtools view".
has 'filename' => (
    is       => 'rw',
    isa      => 'Str',
    required => 1,
);

# Feature object; this class calls gene_start, gene_end, seq_id, exons,
# gene_strand and exon_length on it.
has 'feature' => (
    is       => 'rw',
    required => 1,
);

# Total mapped reads, used as a denominator in the RPKM calculation.
has 'total_mapped_reads' => (
    is       => 'rw',
    isa      => 'Int',
    required => 1,
);

# optional input

# Executable used to build the "samtools view" command.
has 'samtools_exec' => (
    is      => 'rw',
    isa     => 'Str',
    default => "samtools",
);

# Protocol name; selects the class Pathogens::RNASeq::<protocol>::Read.
has 'protocol' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'StandardProtocol',
);

# Margin subtracted from gene_start and added to gene_end to form the window.
has 'window_margin' => (
    is      => 'rw',
    isa     => 'Int',
    default => 50,
);

# Passed through unchanged to each read object.
has 'filters' => (
    is  => 'rw',
    isa => 'Maybe[HashRef]',
);

# allow for testing and for using VR samtools view output file
has '_input_slice_filename' => (
    is  => 'rw',
    isa => 'Str',
);

# output variable
has 'rpkm_values' => (
    is         => 'rw',
    isa        => 'HashRef',
    lazy_build => 1,
);

# internal variables
has '_slice_file_handle' => (
    is         => 'rw',
    lazy_build => 1,
);

has '_window_start' => (
    is         => 'rw',
    isa        => 'Int',
    lazy_build => 1,
);

has '_window_end' => (
    is         => 'rw',
    isa        => 'Int',
    lazy_build => 1,
);

has '_read_protocol_class' => (
    is         => 'rw',
    lazy_build => 1,
);
|
||
# Builder for _window_start.
#
# Returns the feature's gene_start minus window_margin, clamped so that the
# result is never below 1.
sub _build__window_start
{
    my ($self) = @_;

    my $start = $self->feature->gene_start - $self->window_margin;
    return 1 if $start < 1;

    return $start;
}
|
||
# Builder for _window_end.
#
# Returns the feature's gene_end plus window_margin. No upper bound is applied.
sub _build__window_end
{
    my ($self) = @_;

    return $self->feature->gene_end + $self->window_margin;
}
|
||
# Builder for _slice_file_handle.
#
# Opens whatever _slice_stream returns: either the preset slice filename or
# a "samtools view ... |" command string.
#
# Returns the opened handle. Throws
# Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice when open fails.
#
# NOTE(review): this relies on two-argument open so that a trailing "|" in
# the string starts a pipe; any filename reaching this point is therefore
# also subject to two-argument open's mode parsing - confirm callers only
# pass trusted values.
sub _build__slice_file_handle
{
    my ($self) = @_;

    open(my $handle, $self->_slice_stream)
        or Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice->throw(
            error => sprintf(
                "Cant view slice for %s %s %s",
                $self->filename, $self->_window_start, $self->_window_end
            )
        );

    return $handle;
}
|
||
# Work out what _build__slice_file_handle should open.
#
# Returns _input_slice_filename when it holds a true value. Otherwise returns
# a pipe command of the form
#   "<samtools_exec> view <filename> <seq_id>:<window_start>-<window_end> |"
sub _slice_stream
{
    my ($self) = @_;

    my $preset = $self->_input_slice_filename;
    return $preset if $preset;

    return sprintf(
        '%s view %s %s:%s-%s |',
        $self->samtools_exec,
        $self->filename,
        $self->feature->seq_id,
        $self->_window_start,
        $self->_window_end
    );
}
|
||
# Builder for _read_protocol_class.
#
# Forms the class name Pathogens::RNASeq::<protocol>::Read, loads it, and
# returns the class name as a string.
#
# Dies when:
#   - protocol is not shaped like a package name (it is interpolated into a
#     string eval, so anything else must not reach it), or
#   - the class cannot be loaded and does not already provide new().
#     Previously a failed load was ignored here and only surfaced later as
#     a less informative error from the ->new call.
# A class that is already defined in memory (for example inside a test file)
# is still accepted even when no matching .pm file can be found.
sub _build__read_protocol_class
{
    my ($self) = @_;

    my $protocol = $self->protocol;
    unless (defined $protocol && $protocol =~ /\A\w+(?:::\w+)*\z/)
    {
        my $shown = defined $protocol ? $protocol : 'undef';
        die "Invalid protocol name '$shown'\n";
    }

    my $read_protocol_class = "Pathogens::RNASeq::".$protocol."::Read";

    # The trailing "1" makes the eval return true only when "use" succeeded.
    my $loaded     = eval("use $read_protocol_class; 1");
    my $load_error = $@;

    if (!$loaded && !$read_protocol_class->can('new'))
    {
        die "Could not load read protocol class '$read_protocol_class': $load_error";
    }

    return $read_protocol_class;
}
|
||
|
||
# Builder for rpkm_values.
#
# Reads the slice file handle line by line, turns each line into a read
# object of the configured protocol class, and accumulates that object's
# mapped_reads counts.
#
# Returns a hash reference with the keys:
#   mapped_reads_sense, mapped_reads_antisense - summed sense/antisense counts
#   mapped_reads_forward, mapped_reads_reverse - the same counts re-keyed
#       by read_strand (see the loop body)
#   rpkm_sense, rpkm_antisense - _calculate_rpkm of the sense/antisense sums
sub _build_rpkm_values
{
    my ($self) = @_;

    my %tally = (
        mapped_reads_sense     => 0,
        mapped_reads_antisense => 0,
        mapped_reads_forward   => 0,
        mapped_reads_reverse   => 0,
    );

    my $slice_fh = $self->_slice_file_handle;

    while (my $alignment = <$slice_fh>)
    {
        my $read = $self->_read_protocol_class->new(
            alignment_line => $alignment,
            exons          => $self->feature->exons,
            gene_strand    => $self->feature->gene_strand,
            filters        => $self->filters
        );
        my $counts = $read->mapped_reads;

        $tally{mapped_reads_sense}     += $counts->{sense};
        $tally{mapped_reads_antisense} += $counts->{antisense};

        # A read_strand of 0 credits sense counts to "forward" and antisense
        # counts to "reverse"; any other value swaps the two.
        my ($forward_key, $reverse_key) =
            $read->read_strand == 0
            ? ('sense', 'antisense')
            : ('antisense', 'sense');

        $tally{mapped_reads_forward} += $counts->{$forward_key};
        $tally{mapped_reads_reverse} += $counts->{$reverse_key};
    }

    $tally{rpkm_sense}     = $self->_calculate_rpkm($tally{mapped_reads_sense});
    $tally{rpkm_antisense} = $self->_calculate_rpkm($tally{mapped_reads_antisense});

    return \%tally;
}
|
||
# Convert a mapped-read count into an RPKM value.
#
# Parameters:
#   $mapped_reads - number of reads mapped to the feature.
#
# Returns
#   ($mapped_reads / exon_length) * (1_000_000_000 / total_mapped_reads)
# which is algebraically the same as
#   $mapped_reads / ((exon_length / 1000) * (total_mapped_reads / 1000000)).
#
# Returns 0 when the feature's exon_length or total_mapped_reads is zero or
# undefined. The formula would otherwise die with "Illegal division by
# zero", and no reads can be attributed to a feature in either case.
sub _calculate_rpkm
{
    my ($self, $mapped_reads) = @_;

    my $exon_length        = $self->feature->exon_length;
    my $total_mapped_reads = $self->total_mapped_reads;

    return 0 unless $exon_length && $total_mapped_reads;

    return ($mapped_reads / $exon_length) * (1000000000 / $total_mapped_reads);
}
|
||
|
||
1; |
Oops, something went wrong.