Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'release/0.04'

  • Loading branch information...
commit 0e506d7b59417bf3c6ef2c2c6640870acb1496d0 2 parents ec0be10 + 650b661
Sendu Bala sb10 authored
Showing with 10,837 additions and 1,997 deletions.
  1. +2 −1  Build.PL
  2. +3 −1 HISTORY
  3. +1,318 −0 modules/Bio/Tools/NonStandardGFF.pm
  4. +34 −6 modules/LSF.pm
  5. +94 −0 modules/Pathogens/Import/CompressAndValidate.pm
  6. +70 −0 modules/Pathogens/Import/ValidateFastqConversion.pm
  7. +146 −0 modules/Pathogens/RNASeq/AlignmentSlice.pm
  8. +47 −0 modules/Pathogens/RNASeq/BAMStats.pm
  9. +113 −0 modules/Pathogens/RNASeq/BitWise.pm
  10. +165 −0 modules/Pathogens/RNASeq/CoveragePlot.pm
  11. +12 −0 modules/Pathogens/RNASeq/Exceptions.pm
  12. +180 −0 modules/Pathogens/RNASeq/Expression.pm
  13. +103 −0 modules/Pathogens/RNASeq/ExpressionStatsSpreadsheet.pm
  14. +103 −0 modules/Pathogens/RNASeq/Feature.pm
  15. +68 −0 modules/Pathogens/RNASeq/FeaturesTabFile.pm
  16. +85 −0 modules/Pathogens/RNASeq/GFF.pm
  17. +47 −0 modules/Pathogens/RNASeq/IntergenicFeature.pm
  18. +113 −0 modules/Pathogens/RNASeq/IntergenicRegions.pm
  19. +142 −0 modules/Pathogens/RNASeq/Read.pm
  20. +19 −0 modules/Pathogens/RNASeq/SequenceFile.pm
  21. +12 −0 modules/Pathogens/RNASeq/StandardProtocol/AlignmentSlice.pm
  22. +38 −0 modules/Pathogens/RNASeq/StandardProtocol/Read.pm
  23. +12 −0 modules/Pathogens/RNASeq/StrandSpecificProtocol/AlignmentSlice.pm
  24. +49 −0 modules/Pathogens/RNASeq/StrandSpecificProtocol/Read.pm
  25. +100 −0 modules/Pathogens/RNASeq/ValidateInputs.pm
  26. +82 −0 modules/Pathogens/Reports/Mapping/Report.pm
  27. +403 −0 modules/Pathogens/Reports/Mapping/Row.pm
  28. +77 −0 modules/Pathogens/Reports/Mapping/Spreadsheet.pm
  29. +125 −0 modules/VRTrack/AutoQC.pm
  30. +6 −2 modules/VRTrack/Core_obj.pm
  31. +2 −4 modules/VRTrack/Factory.pm
  32. +30 −0 modules/VRTrack/Lane.pm
  33. +114 −1 modules/VRTrack/Mapstats.pm
  34. +81 −23 modules/VRTrack/VRTrack.pm
  35. +53 −3 modules/Vcf.pm
  36. +868 −0 modules/VertRes/Pipelines/Assembly.pm
  37. +28 −0 modules/VertRes/Pipelines/Import.pm
  38. +154 −44 modules/VertRes/Pipelines/Import_iRODS_fastq.pm
  39. +118 −18 modules/VertRes/Pipelines/Mapping.pm
  40. +366 −0 modules/VertRes/Pipelines/RNASeqExpression.pm
  41. +16 −6 modules/VertRes/Pipelines/TrackQC_Bam.pm
  42. +25 −13 modules/VertRes/Pipelines/TrackQC_Fastq.pm
  43. 0  {web_code/qc_grind → modules/VertRes/QCGrind}/Util.pm
  44. +316 −0 modules/VertRes/QCGrind/ViewUtil.pm
  45. +292 −0 modules/VertRes/Utils/Assemblers/velvet.pm
  46. +140 −0 modules/VertRes/Utils/Assembly.pm
  47. +8 −3 modules/VertRes/Utils/Mappers/smalt.pm
  48. +204 −0 modules/VertRes/Utils/Mappers/tophat.pm
  49. +20 −1 modules/VertRes/Utils/Mapping.pm
  50. +50 −1 modules/VertRes/Utils/Sam.pm
  51. +68 −0 modules/VertRes/Utils/Scaffold.pm
  52. +115 −0 modules/VertRes/Utils/Scaffolders/sspace.pm
  53. +16 −0 modules/VertRes/Wrapper/WrapperI.pm
  54. +1 −1  modules/VertRes/Wrapper/bowtie.pm
  55. +6 −0 modules/VertRes/Wrapper/picard.pm
  56. +5 −0 modules/VertRes/Wrapper/samtools.pm
  57. +42 −25 modules/VertRes/Wrapper/smalt.pm
  58. +337 −0 modules/VertRes/Wrapper/tophat.pm
  59. BIN  scripts/bamcheck
  60. +6 −1 scripts/bamcheck-mousewrapper
  61. +55 −13 scripts/bamcheck.c
  62. +790 −0 scripts/plot-bamcheck-2012-02-11
  63. +82 −0 scripts/rna_seq_expression
  64. +9 −1 scripts/run-beagle
  65. +4 −4 scripts/run-mpileup
  66. +96 −6 scripts/run-pipeline
  67. +27 −0 scripts/tweak-lane
  68. +13 −6 scripts/update_db_vrpipe.sh
  69. +1 −1  scripts/vcf-annotate
  70. +198 −0 scripts/vcf-contrast
  71. +8 −2 scripts/vcf-fix-ploidy
  72. +7 −2 scripts/vcf-isec
  73. +25 −3 scripts/vr-wrapper
  74. +13 −3 scripts/vrtrack_npg_gt_update
  75. +16 −22 scripts/vrtracking_tool
  76. +23 −3 scripts/weekly_new_lane_report
  77. +2 −0  sql/VRTrack_schema_17_to_18.sql
  78. +7 −0 sql/VRTrack_schema_18_to_19.sql
  79. +16 −0 sql/VRTrack_schema_19_to_20.sql
  80. +49 −0 t/Pathogens/AlignmentSlice.t
  81. +51 −0 t/Pathogens/BitWise.t
  82. +144 −0 t/Pathogens/CompressAndValidate.t
  83. +62 −0 t/Pathogens/CoveragePlot.t
  84. +82 −0 t/Pathogens/ExpressionStatsSpreadsheet.t
  85. +55 −0 t/Pathogens/FeaturesTabFile.t
  86. +36 −0 t/Pathogens/GFF.t
  87. +22 −0 t/Pathogens/IntergenicFeature.t
  88. +51 −0 t/Pathogens/IntergenicRegions.t
  89. +23 −0 t/Pathogens/MappingBamcheckGraphs.t
  90. +273 −0 t/Pathogens/MappingReport.t
  91. +63 −0 t/Pathogens/QCMappings.t
  92. +51 −0 t/Pathogens/Read.t
  93. +13 −0 t/Pathogens/SequenceFile.t
  94. +26 −0 t/Pathogens/StrandSpecificProtocolRead.t
  95. +40 −0 t/Pathogens/ValidateFastqConversion.t
  96. +29 −0 t/Pathogens/ValidateInputs.t
  97. +66 −0 t/Pathogens/VertResUtilsSam.t
  98. +30 −1 t/VRTrack/hierarchy_noreq.t
  99. +29 −1 t/VRTrack/non_core_classes.t
  100. +95 −0 t/VertRes/Wrapper/tophat.t
  101. +84 −0 t/data/1kg_lane_sq_invalid.gff
  102. +84 −0 t/data/1kg_lane_sq_valid.gff
  103. +24 −0 t/data/Citrobacter_rodentium_ICC168_v1_test.gff
  104. +27 −0 t/data/Citrobacter_rodentium_slice
  105. BIN  t/data/rna_seq_bitwise_flags_set.bam
  106. BIN  t/data/small_multi_sequence.bam
  107. BIN  t/data/small_slice.bam
  108. 0  web_code/lookseq/lookseq.html
  109. +1 −0  web_code/lookseq/lookseq.js
  110. +149 −0 web_code/map-view/map_lanes_view.pl
  111. +51 −0 web_code/map-view/map_projects_view.pl
  112. +21 −394 web_code/map-view/map_view.pl
  113. +191 −0 web_code/pending-view/pending_requests_view.pl
  114. +21 −409 web_code/pending-view/pending_view.pl
  115. +0 −420 web_code/pending-view/pending_view2.pl
  116. +17 −10 web_code/qc_grind/study_lanes.pl
  117. +20 −542 web_code/sample-mapping/sample_mapping.pl
  118. +165 −0 web_code/sample-mapping/sample_mapping_lanes_view.pl
  119. +51 −0 web_code/sample-mapping/sample_mapping_projects_view.pl
  120. 0  web_code/snps/snps.css
  121. 0  web_code/snps/snps.js
3  Build.PL
View
@@ -26,7 +26,7 @@ use MyBuild;
my $build = MyBuild->new(
module_name => 'VertRes',
- dist_version => 0.03,
+ dist_version => 0.04,
dist_author => 'Vertebrate Resequencing group at the Sanger Institute',
dist_abstract => 'A collection of modules and scripts for processing and analysing large quantities of sequencing data.',
license => 'perl',
@@ -44,6 +44,7 @@ my $build = MyBuild->new(
'Net::FTP::Robust' => 0,
'Time::Format' => 0,
'IO::Capture::Stderr' => 0,
+ 'Math::Random' => 0,
},
pm_files => get_pm_files(),
script_files => 'scripts'
4 HISTORY
View
@@ -4,4 +4,6 @@ Version Description
------- -----------
0.01 Initial public release on github
0.02 Nice stable point, prior to an upcoming exome-related schema changes
-0.03 Stable point, known working for UK10K analysis, prior to incorporating many changes from Pathogens/HGI
+0.03 Stable point, known working for UK10K analysis, prior to incorporating
+ many changes from Pathogens/HGI
+0.04 VRTrack schema version 20, with the new AutoQC table
1,318 modules/Bio/Tools/NonStandardGFF.pm
View
@@ -0,0 +1,1318 @@
+#
+# Taken from https://github.com/bioperl/bioperl-live/blob/master/Bio/Tools/GFF.pm
+#https://github.com/bioperl/bioperl-live/commit/c044a602819e7f7bcfda35940bb4da6f22d1f1d6
+# It allows for GFF files with fasta sequences to be directly concatonated together, instead of all the fasta in header or footer
+
+# BioPerl module for Bio::Tools::GFF
+#
+# Please direct questions and support issues to <bioperl-l@bioperl.org>
+#
+# Cared for by the Bioperl core team
+#
+# Copyright Matthew Pocock
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
+
+=head1 SYNOPSIS
+
+ use Bio::Tools::GFF;
+
+ # specify input via -fh or -file
+ my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
+ my $feature;
+ # loop over the input stream
+ while($feature = $gffio->next_feature()) {
+ # do something with feature
+ }
+ $gffio->close();
+
+ # you can also obtain a GFF parser as a SeqAnalasisParserI in
+ # HT analysis pipelines (see Bio::SeqAnalysisParserI and
+ # Bio::Factory::SeqAnalysisParserFactory)
+ my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
+ my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
+ while($feature = $parser->next_feature()) {
+ # do something with feature
+ }
+
+=head1 DESCRIPTION
+
+This class provides a simple GFF parser and writer. In the sense of a
+SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
+objects, but is not in any way specific to a particular analysis
+program and the output that program produces.
+
+That is, if you can get your analysis program spit out GFF, here is
+your result parser.
+
+=head1 GFF3 AND SEQUENCE DATA
+
+GFF3 supports sequence data; see
+
+http://www.sequenceontology.org/gff3.shtml
+
+There are a number of ways to deal with this -
+
+If you call
+
+ $gffio->ignore_sequence(1)
+
+prior to parsing the sequence data is ignored; this is useful if you
+just want the features. It avoids the memory overhead in building and
+caching sequences
+
+Alternatively, you can call either
+
+ $gffio->get_seqs()
+
+Or
+
+ $gffio->seq_id_by_h()
+
+At the B<end> of parsing to get either a list or hashref of Bio::Seq
+objects (see the documentation for each of these methods)
+
+Note that these objects will not have the features attached - you have
+to do this yourself, OR call
+
+ $gffio->features_attached_to_seqs(1)
+
+PRIOR to parsing; this will ensure that the Seqs have the features
+attached; ie you will then be able to call
+
+ $seq->get_SeqFeatures();
+
+And use Bio::SeqIO methods
+
+Note that auto-attaching the features to seqs will incur a higher
+memory overhead as the features must be cached until the sequence data
+is found
+
+=head1 TODO
+
+Make a Bio::SeqIO class specifically for GFF3 with sequence data
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists. Your participation is much appreciated.
+
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_lists - About the mailing lists
+
+=head2 Support
+
+Please direct usage questions or support issues to the mailing list:
+
+I<bioperl-l@bioperl.org>
+
+rather than to the module maintainer directly. Many experienced and
+reponsive experts will be able look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution. Bug reports can be submitted the web:
+
+ https://redmine.open-bio.org/projects/bioperl/
+
+=head1 AUTHOR - Matthew Pocock
+
+Email mrp-at-sanger.ac.uk
+
+=head1 CONTRIBUTORS
+
+Jason Stajich, jason-at-biperl-dot-org
+Chris Mungall, cjm-at-fruitfly-dot-org
+Steffen Grossmann [SG], grossman at molgen.mpg.de
+Malcolm Cook, mec-at-stowers-institute.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::Tools::NonStandardGFF;
+
+use vars qw($HAS_HTML_ENTITIES);
+use strict;
+
+use Bio::Seq::SeqFactory;
+use Bio::LocatableSeq;
+use Bio::SeqFeature::Generic;
+
+use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
+
+my $i = 0;
+my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
+
+# for skipping data that may be represented elsewhere; currently, this is
+# only the score
+my %SKIPPED_TAGS = map { $_ => 1 } qw(score);
+
+=head2 new
+
+ Title : new
+ Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
+ -file => "filename.gff");
+ or
+ my $writer = Bio::Tools::GFF->new(-gff_version => 3,
+ -file => ">filename.gff3");
+ Function: Creates a new instance. Recognized named parameters are -file, -fh,
+ and -gff_version.
+ Returns : a new object
+ Args : named parameters
+ -gff_version => [1,2,3]
+
+=cut
+
+
+{ # make a class variable such that we can generate unique ID's over
+ # a session, no matter how many instances of GFF.pm we make
+ # since these have to be unique within the scope of a GFF file.
+
+ my $gff3_featureID = 0;
+
+ sub _incrementGFF3ID {
+ my ($self) = @_;
+ return ++ $gff3_featureID;
+ }
+}
+
+
+sub new {
+ my ($class, @args) = @_;
+ my $self = $class->SUPER::new(@args);
+
+ my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
+
+ # initialize IO
+ $self->_initialize_io(@args);
+ $self->_parse_header() unless $noparse;
+
+ $gff_version ||= 2;
+ if( ! $self->gff_version($gff_version) ) {
+ $self->throw("Can't build a GFF object with the unknown version ".
+ $gff_version);
+ }
+ $self->{'_first'} = 1;
+ return $self;
+}
+
+=head2 _parse_header
+
+ Title : _parse_header
+ Usage : $gffio->_parse_header()
+ Function: used to turn parse GFF header lines. currently
+ produces Bio::LocatableSeq objects from ##sequence-region
+ lines
+ Returns : 1 on success
+ Args : none
+
+
+=cut
+
+sub _parse_header{
+ my ($self) = @_;
+
+ my @unhandled;
+ local $^W = 0; # hide warnings when we try and parse from a file opened
+ # for writing - there isn't really a better way to do
+ # AFAIK - cannot detech if a FH is read or write.
+ while(my $line = $self->_readline()){
+ my $handled = 0;
+ next if /^\s+$/;
+ if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
+ my($seqid,$start,$end) = ($1,$2,$3);
+ push @{ $self->{'segments'} }, Bio::LocatableSeq->new
+ (
+ -id => unescape($seqid),
+ -start => $start,
+ -end => $end,
+ -length => ($end - $start + 1), ## make the length explicit
+ );
+ $handled = 1;
+ } elsif($line =~ /^(\#\#feature-ontology)/) {
+ #to be implemented
+ $self->warn("$1 header tag parsing unimplemented");
+ } elsif($line =~ /^(\#\#attribute-ontology)/) {
+ #to be implemented
+ $self->warn("$1 header tag parsing unimplemented");
+ } elsif($line =~ /^(\#\#source-ontology)/) {
+ #to be implemented
+ $self->warn("$1 header tag parsing unimplemented");
+ } elsif($line =~ /^(\#\#\#)/) {
+ #to be implemented
+ $self->warn("$1 header tag parsing unimplemented");
+ } elsif($line =~ /^(\#\#FASTA)/) {
+ # initial ##FASTA is optional - artemis does not use it
+ $line = $self->_readline();
+ if ($line !~ /^\>(\S+)/) {
+ $self->throw("##FASTA directive must be followed by fasta header, not: $line");
+ }
+ } else {
+ }
+
+ if ($line =~ /^\>(.*)/) {
+ # seq data can be at header or footer
+ my $seq = $self->_parse_sequence($line);
+ if ($seq) {
+ $self->_seq_by_id_h->{$seq->primary_id} = $seq;
+ }
+ }
+
+
+ if(!$handled){
+ push @unhandled, $line
+ }
+
+ #looks like the header is over!
+ last unless $line =~ /^\#/;
+ }
+
+ foreach my $line (@unhandled){
+ $self->_pushback($line);
+ }
+
+ return 1;
+}
+
+sub _parse_sequence {
+ my ($self, $line) = @_;
+
+ if ($line =~ /^\>(.*)/) {
+
+ my $seqid = $1;
+ $seqid =~ s/\s+$//;
+ my $desc = '';
+ if ($seqid =~ /(\S+)\s+(.*)/) {
+ ($seqid, $desc) = ($1,$2);
+ }
+ my $res = '';
+ while (my $line = $self->_readline) {
+ if ($line =~ /^\#/) {
+ last;
+ }
+ if ($line =~ /^\>/) {
+ $self->_pushback($line);
+ last;
+ }
+ $line =~ s/\s//g;
+ $res .= $line;
+ }
+ return if $self->ignore_sequence;
+
+ my $seqfactory = Bio::Seq::SeqFactory->new('Bio::Seq');
+ my $seq = $seqfactory->create(-seq=>$res,
+ -id=>$seqid,
+ -desc=>$desc);
+ $seq->accession_number($seqid);
+ if ($self->features_attached_to_seqs) {
+ my @feats =
+ @{$self->_feature_idx_by_seq_id->{$seqid}};
+ $seq->add_SeqFeature($_) foreach @feats;
+ @{$self->_feature_idx_by_seq_id->{$seqid}} = ();
+ }
+ return $seq;
+ }
+ else {
+ $self->throw("expected fasta header, not: $line");
+ }
+}
+
+
+=head2 next_segment
+
+ Title : next_segment
+ Usage : my $seq = $gffio->next_segment;
+ Function: Returns a Bio::LocatableSeq object corresponding to a
+ GFF "##sequence-region" header line.
+ Example :
+ Returns : A Bio::LocatableSeq object, or undef if
+ there are no more sequences.
+ Args : none
+
+
+=cut
+
+sub next_segment{
+ my ($self,@args) = @_;
+ return shift @{ $self->{'segments'} } if defined $self->{'segments'};
+ return;
+}
+
+=head2 next_feature
+
+ Title : next_feature
+ Usage : $seqfeature = $gffio->next_feature();
+ Function: Returns the next feature available in the input file or stream, or
+ undef if there are no more features.
+ Example :
+ Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
+ more features.
+ Args : none
+
+=cut
+
+sub next_feature {
+ my ($self) = @_;
+
+ my $gff_string;
+
+ # be graceful about empty lines or comments, and make sure we return undef
+ # if the input's consumed
+ while(($gff_string = $self->_readline()) && defined($gff_string)) {
+ if ($gff_string =~ /^\#\#\#/) {
+ # all forward refs have been seen; TODO
+ }
+ next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
+ $gff_string =~ m{^//});
+
+ while ($gff_string =~ /^\>(.+)/) {
+ # fasta can be in header or footer
+ my $seq = $self->_parse_sequence($gff_string);
+ if ($seq) {
+ $self->_seq_by_id_h->{$seq->primary_id} = $seq;
+ $gff_string = $self->_readline;
+ last unless $gff_string;
+ }
+ }
+ next if($gff_string =~ /^\#/ );
+
+ last;
+ }
+ return unless $gff_string;
+
+ my $feat = Bio::SeqFeature::Generic->new();
+ $self->from_gff_string($feat, $gff_string);
+
+ if ($self->features_attached_to_seqs) {
+ push(@{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
+ $feat);
+ }
+
+ return $feat;
+}
+
+sub _feature_idx_by_seq_id {
+ my $self = shift;
+ $self->{__feature_idx_by_seq_id} = shift if @_;
+ $self->{__feature_idx_by_seq_id} = {}
+ unless $self->{__feature_idx_by_seq_id};
+ return $self->{__feature_idx_by_seq_id};
+}
+
+
+=head2 from_gff_string
+
+ Title : from_gff_string
+ Usage : $gff->from_gff_string($feature, $gff_string);
+ Function: Sets properties of a SeqFeatureI object from a GFF-formatted
+ string. Interpretation of the string depends on the version
+ that has been specified at initialization.
+
+ This method is used by next_feature(). It actually dispatches to
+ one of the version-specific (private) methods.
+ Example :
+ Returns : void
+ Args : A Bio::SeqFeatureI implementing object to be initialized
+ The GFF-formatted string to initialize it from
+
+=cut
+
+sub from_gff_string {
+ my ($self, $feat, $gff_string) = @_;
+
+ if($self->gff_version() == 1) {
+ return $self->_from_gff1_string($feat, $gff_string);
+ } elsif( $self->gff_version() == 3 ) {
+ return $self->_from_gff3_string($feat, $gff_string);
+ } else {
+ return $self->_from_gff2_string($feat, $gff_string);
+ }
+}
+
+=head2 _from_gff1_string
+
+ Title : _from_gff1_string
+ Usage :
+ Function:
+ Example :
+ Returns : void
+ Args : A Bio::SeqFeatureI implementing object to be initialized
+ The GFF-formatted string to initialize it from
+
+=cut
+
+sub _from_gff1_string {
+ my ($gff, $feat, $string) = @_;
+ chomp $string;
+ my ($seqname, $source, $primary, $start, $end, $score,
+ $strand, $frame, @group) = split(/\t/, $string);
+
+ if ( !defined $frame ) {
+ $feat->throw("[$string] does not look like GFF to me");
+ }
+ $frame = 0 unless( $frame =~ /^\d+$/);
+ $feat->seq_id($seqname);
+ $feat->source_tag($source);
+ $feat->primary_tag($primary);
+ $feat->start($start);
+ $feat->end($end);
+ $feat->frame($frame);
+ if ( $score eq '.' ) {
+ #$feat->score(undef);
+ } else {
+ $feat->score($score);
+ }
+ if ( $strand eq '-' ) { $feat->strand(-1); }
+ if ( $strand eq '+' ) { $feat->strand(1); }
+ if ( $strand eq '.' ) { $feat->strand(0); }
+ foreach my $g ( @group ) {
+ if ( $g =~ /(\S+)=(\S+)/ ) {
+ my $tag = $1;
+ my $value = $2;
+ $feat->add_tag_value($1, $2);
+ } else {
+ $feat->add_tag_value('group', $g);
+ }
+ }
+}
+
+=head2 _from_gff2_string
+
+ Title : _from_gff2_string
+ Usage :
+ Function:
+ Example :
+ Returns : void
+ Args : A Bio::SeqFeatureI implementing object to be initialized
+ The GFF2-formatted string to initialize it from
+
+
+=cut
+
+sub _from_gff2_string {
+ my ($gff, $feat, $string) = @_;
+ chomp($string);
+
+ # according to the Sanger website, GFF2 should be single-tab
+ # separated elements, and the free-text at the end should contain
+ # text-translated tab symbols but no "real" tabs, so splitting on
+ # \t is safe, and $attribs gets the entire attributes field to be
+ # parsed later
+
+ # sendu: but the tag value pair can (should?) be separated by a tab. The
+ # 'no tabs' thing seems to apply only to the free text that is allowed for
+ # the value
+
+ my ($seqname, $source, $primary, $start,
+ $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
+ my $attribs = join ' ', @attribs;
+
+ if ( !defined $frame ) {
+ $feat->throw("[$string] does not look like GFF2 to me");
+ }
+ $feat->seq_id($seqname);
+ $feat->source_tag($source);
+ $feat->primary_tag($primary);
+ $feat->start($start);
+ $feat->end($end);
+ $feat->frame($frame);
+ if ( $score eq '.' ) {
+ # $feat->score(undef);
+ } else {
+ $feat->score($score);
+ }
+ if ( $strand eq '-' ) { $feat->strand(-1); }
+ if ( $strand eq '+' ) { $feat->strand(1); }
+ if ( $strand eq '.' ) { $feat->strand(0); }
+
+
+ # <Begin Inefficient Code from Mark Wilkinson>
+ # this routine is necessay to allow the presence of semicolons in
+ # quoted text Semicolons are the delimiting character for new
+ # tag/value attributes. it is more or less a "state" machine, with
+ # the "quoted" flag going up and down as we pass thorugh quotes to
+ # distinguish free-text semicolon and hash symbols from GFF control
+ # characters
+
+
+ my $flag = 0; # this could be changed to a bit and just be twiddled
+ my @parsed;
+
+ # run through each character one at a time and check it
+ # NOTE: changed to foreach loop which is more efficient in perl
+ # --jasons
+ for my $a ( split //, $attribs ) {
+ # flag up on entering quoted text, down on leaving it
+ if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 }
+ elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
+ elsif( $a eq '#' && ! $flag ) { last }
+ push @parsed, $a;
+ }
+ $attribs = join "", @parsed; # rejoin into a single string
+
+ # <End Inefficient Code>
+ # Please feel free to fix this and make it more "perlish"
+
+ my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
+
+ foreach my $pair ( @key_vals ) {
+ # replace semicolons that were removed from free-text above.
+ $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
+
+ # separate the key from the value
+ my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
+
+
+ if( defined $values ) {
+ my @values;
+ # free text is quoted, so match each free-text block
+ # and remove it from the $values string
+ while ($values =~ s/"(.*?)"//){
+ # and push it on to the list of values (tags may have
+ # more than one value... and the value may be undef)
+ push @values, $1;
+ }
+
+ # and what is left over should be space-separated
+ # non-free-text values
+
+ my @othervals = split /\s+/, $values;
+ foreach my $othervalue(@othervals){
+ # get rid of any empty strings which might
+ # result from the split
+ if (CORE::length($othervalue) > 0) {push @values, $othervalue}
+ }
+
+ foreach my $value(@values){
+ $feat->add_tag_value($key, $value);
+ }
+ }
+ }
+}
+
+
+sub _from_gff3_string {
+ my ($gff, $feat, $string) = @_;
+ chomp($string);
+
+ # according to the now nearly final GFF3 spec, columns should
+ # be tab separated, allowing unescaped spaces to occur in
+ # column 9
+
+ my ($seqname, $source, $primary, $start, $end,
+ $score, $strand, $frame, $groups) = split(/\t/, $string);
+
+ if ( ! defined $frame ) {
+ $feat->throw("[$string] does not look like GFF3 to me");
+ }
+ $feat->seq_id($seqname);
+ $feat->source_tag($source);
+ $feat->primary_tag($primary);
+ $feat->start($start);
+ $feat->end($end);
+ $feat->frame($frame);
+ if ( $score eq '.' ) {
+ #$feat->score(undef);
+ } else {
+ $feat->score($score);
+ }
+ if ( $strand eq '-' ) { $feat->strand(-1); }
+ if ( $strand eq '+' ) { $feat->strand(1); }
+ if ( $strand eq '.' ) { $feat->strand(0); }
+ my @groups = split(/\s*;\s*/, $groups);
+
+ for my $group (@groups) {
+ next unless defined($group);
+ my ($tag,$value) = split /=/,$group;
+ $tag = unescape($tag);
+ my @values = map {unescape($_)} split /,/,$value;
+ for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
+ }
+}
+
+# taken from Bio::DB::GFF
+sub unescape {
+ my $v = shift;
+ $v =~ tr/+/ /;
+ $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
+ return $v;
+}
+
+=head2 write_feature
+
+ Title : write_feature
+ Usage : $gffio->write_feature($feature);
+ Function: Writes the specified SeqFeatureI object in GFF format to the stream
+ associated with this instance.
+ Returns : none
+ Args : An array of Bio::SeqFeatureI implementing objects to be serialized
+
+=cut
+
+sub write_feature {
+ my ($self, @features) = @_;
+ return unless @features;
+ if( $self->{'_first'} && $self->gff_version() == 3 ) {
+ $self->_print("##gff-version 3\n");
+ }
+ $self->{'_first'} = 0;
+ foreach my $feature ( @features ) {
+ $self->_print($self->gff_string($feature)."\n");
+ }
+}
+
+=head2 gff_string
+
+ Title : gff_string
+ Usage : $gffstr = $gffio->gff_string($feature);
+ Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
+ The formatting depends on the version specified at initialization.
+
+ This method is used by write_feature(). It actually dispatches to
+ one of the version-specific (private) methods.
+ Example :
+ Returns : A GFF-formatted string representation of the SeqFeature
+ Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
+
+=cut
+
+sub gff_string{
+ my ($self, $feature) = @_;
+
+ if($self->gff_version() == 1) {
+ return $self->_gff1_string($feature);
+ } elsif( $self->gff_version() == 3 ) {
+ return $self->_gff3_string($feature);
+ } elsif( $self->gff_version() == 2.5 ) {
+ return $self->_gff25_string($feature);
+ } else {
+ return $self->_gff2_string($feature);
+ }
+}
+
+=head2 _gff1_string
+
+ Title : _gff1_string
+ Usage : $gffstr = $gffio->_gff1_string
+ Function:
+ Example :
+ Returns : A GFF1-formatted string representation of the SeqFeature
+ Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
+
+=cut
+
+sub _gff1_string{
+ my ($gff, $feat) = @_;
+ my ($str,$score,$frame,$name,$strand);
+
+ if( $feat->can('score') ) {
+ $score = $feat->score();
+ }
+ $score = '.' unless defined $score;
+
+ if( $feat->can('frame') ) {
+ $frame = $feat->frame();
+ }
+ $frame = '.' unless defined $frame;
+
+ $strand = $feat->strand();
+ if(! $strand) {
+ $strand = ".";
+ } elsif( $strand == 1 ) {
+ $strand = '+';
+ } elsif ( $feat->strand == -1 ) {
+ $strand = '-';
+ }
+
+ if( $feat->can('seqname') ) {
+ $name = $feat->seq_id();
+ $name ||= 'SEQ';
+ } else {
+ $name = 'SEQ';
+ }
+
+
+ $str = join("\t",
+ $name,
+ $feat->source_tag,
+ $feat->primary_tag,
+ $feat->start,
+ $feat->end,
+ $score,
+ $strand,
+ $frame);
+
+ foreach my $tag ( $feat->get_all_tags ) {
+ next if exists $SKIPPED_TAGS{$tag};
+ foreach my $value ( $feat->get_tag_values($tag) ) {
+ $str .= " $tag=$value" if $value;
+ }
+ }
+
+
+ return $str;
+}
+
+=head2 _gff2_string
+
+ Title : _gff2_string
+ Usage : $gffstr = $gffio->_gff2_string
+ Function:
+ Example :
+ Returns : A GFF2-formatted string representation of the SeqFeature
+ Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
+
+=cut
+
+sub _gff2_string{
+ my ($gff, $origfeat) = @_;
+ my $feat;
+ if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
+ $feat = $origfeat->feature2;
+ } else {
+ $feat = $origfeat;
+ }
+ my ($str1, $str2,$score,$frame,$name,$strand);
+
+ if( $feat->can('score') ) {
+ $score = $feat->score();
+ }
+ $score = '.' unless defined $score;
+
+ if( $feat->can('frame') ) {
+ $frame = $feat->frame();
+ }
+ $frame = '.' unless defined $frame;
+
+ $strand = $feat->strand();
+ if(! $strand) {
+ $strand = ".";
+ } elsif( $strand == 1 ) {
+ $strand = '+';
+ } elsif ( $feat->strand == -1 ) {
+ $strand = '-';
+ }
+
+ if( $feat->can('seqname') ) {
+ $name = $feat->seq_id();
+ $name ||= 'SEQ';
+ } else {
+ $name = 'SEQ';
+ }
+ $str1 = join("\t",
+ $name,
+ $feat->source_tag(),
+ $feat->primary_tag(),
+ $feat->start(),
+ $feat->end(),
+ $score,
+ $strand,
+ $frame);
+ # the routine below is the only modification I made to the original
+ # ->gff_string routine (above) as on November 17th, 2000, the
+ # Sanger webpage describing GFF2 format reads: "From version 2
+ # onwards, the attribute field must have a tag value structure
+ # following the syntax used within objects in a .ace file,
+ # flattened onto one line by semicolon separators. Tags must be
+ # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
+ # must be quoted with double quotes".
+
+ # MW
+
+
+ my @all_tags = $feat->all_tags;
+ my @group;
+ if (@all_tags) { # only play this game if it is worth playing...
+ foreach my $tag ( @all_tags ) {
+ next if exists $SKIPPED_TAGS{$tag};
+ my @v;
+ foreach my $value ( $feat->each_tag_value($tag) ) {
+ unless( defined $value && length($value) ) {
+ $value = '""';
+ } elsif ($value =~ /[^A-Za-z0-9_]/){
+ $value =~ s/\t/\\t/g; # substitute tab and newline
+ # characters
+ $value =~ s/\n/\\n/g; # to their UNIX equivalents
+ $value = '"' . $value . '" ';
+ } # if the value contains
+ # anything other than valid
+ # tag/value characters, then
+ # quote it
+ push @v, $value;
+ # for this tag (allowed in GFF2 and .ace format)
+ }
+ push @group, "$tag ".join(" ", @v);
+ }
+ }
+ $str2 .= join(' ; ', @group);
+ # Add Target information for Feature Pairs
+ if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
+ ! $feat->has_tag('Group') &&
+ $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
+ $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
+ ( $origfeat->feature1->strand < 0 ?
+ ( $origfeat->feature1->end,
+ $origfeat->feature1->start) :
+ ( $origfeat->feature1->start,
+ $origfeat->feature1->end)
+ )) . ($str2?" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
+ }
+ return $str1."\t".$str2;
+}
+
+
+
+=head2 _gff25_string
+
+ Title : _gff25_string
+ Usage : $gffstr = $gffio->_gff2_string
+ Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
+ Example :
+ Returns : A GFF2.5-formatted string representation of the SeqFeature
+ Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
+
+=cut
+
+sub _gff25_string {
+ my ($gff, $origfeat) = @_;
+ my $feat;
+ if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
+ $feat = $origfeat->feature2;
+ } else {
+ $feat = $origfeat;
+ }
+ my ($str1, $str2,$score,$frame,$name,$strand);
+
+ if( $feat->can('score') ) {
+ $score = $feat->score();
+ }
+ $score = '.' unless defined $score;
+
+ if( $feat->can('frame') ) {
+ $frame = $feat->frame();
+ }
+ $frame = '.' unless defined $frame;
+
+ $strand = $feat->strand();
+ if(! $strand) {
+ $strand = ".";
+ } elsif( $strand == 1 ) {
+ $strand = '+';
+ } elsif ( $feat->strand == -1 ) {
+ $strand = '-';
+ }
+
+ if( $feat->can('seqname') ) {
+ $name = $feat->seq_id();
+ $name ||= 'SEQ';
+ } else {
+ $name = 'SEQ';
+ }
+ $str1 = join("\t",
+ $name,
+ $feat->source_tag(),
+ $feat->primary_tag(),
+ $feat->start(),
+ $feat->end(),
+ $score,
+ $strand,
+ $frame);
+
+ my @all_tags = $feat->all_tags;
+ my @group; my @firstgroup;
+ if (@all_tags) { # only play this game if it is worth playing...
+ foreach my $tag ( @all_tags ) {
+ my @v;
+ foreach my $value ( $feat->each_tag_value($tag) ) {
+ next if exists $SKIPPED_TAGS{$tag};
+ unless( defined $value && length($value) ) {
+ $value = '""';
+ } elsif ($value =~ /[^A-Za-z0-9_]/){
+ $value =~ s/\t/\\t/g; # substitute tab and newline
+ # characters
+ $value =~ s/\n/\\n/g; # to their UNIX equivalents
+ $value = '"' . $value . '" ';
+ } # if the value contains
+ # anything other than valid
+ # tag/value characters, then
+ # quote it
+ push @v, $value;
+ # for this tag (allowed in GFF2 and .ace format)
+ }
+ if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both...
+ push @firstgroup, "$tag ".join(" ", @v);
+ } else {
+ push @group, "$tag ".join(" ", @v);
+ }
+ }
+ }
+ $str2 = join(' ; ', (@firstgroup, @group));
+ # Add Target information for Feature Pairs
+ if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
+ ! $feat->has_tag('Group') &&
+ $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
+ $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
+ ( $origfeat->feature1->strand < 0 ?
+ ( $origfeat->feature1->end,
+ $origfeat->feature1->start) :
+ ( $origfeat->feature1->start,
+ $origfeat->feature1->end)
+ )) . ($str2?" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
+ }
+ return $str1 . "\t". $str2;
+}
+
+
+=head2 _gff3_string
+
+ Title : _gff3_string
+ Usage : $gffstr = $gffio->_gff3_string
+ Function:
+ Example :
+ Returns : A GFF3-formatted string representation of the SeqFeature
+ Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
+
+=cut
+
+sub _gff3_string {
+ my ($gff, $origfeat) = @_;
+ my $feat;
+ if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
+ $feat = $origfeat->feature2;
+ } else {
+ $feat = $origfeat;
+ }
+
+ my $ID = $gff->_incrementGFF3ID();
+
+ my ($score,$frame,$name,$strand);
+
+ if( $feat->can('score') ) {
+ $score = $feat->score();
+ }
+ $score = '.' unless defined $score;
+
+ if( $feat->can('frame') ) {
+ $frame = $feat->frame();
+ }
+ $frame = '1' unless defined $frame;
+
+ $strand = $feat->strand();
+
+ if(! $strand) {
+ $strand = ".";
+ } elsif( $strand == 1 ) {
+ $strand = '+';
+ } elsif ( $feat->strand == -1 ) {
+ $strand = '-';
+ }
+
+ if( $feat->can('seqname') ) {
+ $name = $feat->seq_id();
+ $name ||= 'SEQ';
+ } else {
+ $name = 'SEQ';
+ }
+
+ my @groups;
+
+ # force leading ID and Parent tags
+ my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
+ for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
+ keys %GFF3_ID_Tags ) {
+ unshift @all_tags, $t if $feat->has_tag($t);
+ }
+
+ for my $tag ( @all_tags ) {
+ next if exists $SKIPPED_TAGS{$tag};
+ # next if $tag eq 'Target';
+ if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
+ # simple Target,start,stop
+ my($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
+ next unless(defined($e) && defined($b) && $target_id);
+ ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
+ $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
+ push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e);
+ next;
+ }
+
+ my $valuestr;
+ # a string which will hold one or more values
+ # for this tag, with quoted free text and
+ # space-separated individual values.
+ my @v;
+ for my $value ( $feat->each_tag_value($tag) ) {
+ if( defined $value && length($value) ) {
+ #$value =~ tr/ /+/; #spaces are allowed now
+ if ( ref $value eq 'Bio::Annotation::Comment') {
+ $value = $value->text;
+ }
+
+ if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
+ $value =~ s/\t/\\t/g; # substitute tab and newline
+ # characters
+ $value =~ s/\n/\\n/g; # to their UNIX equivalents
+
+ # Unescaped quotes are not allowed in GFF3
+ # $value = '"' . $value . '"';
+ }
+ $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
+ } else {
+ # if it is completely empty,
+ # then just make empty double
+ # quotes
+ $value = '""';
+ }
+ push @v, $value;
+ }
+ # can we figure out how to improve this?
+ $tag= lcfirst($tag) unless ($tag
+ =~ /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
+
+ push @groups, "$tag=".join(",",@v);
+ }
+ # Add Target information for Feature Pairs
+ if( $feat->has_tag('Target') &&
+ ! $feat->has_tag('Group') &&
+ $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
+
+ my $target_id = $origfeat->feature1->seq_id;
+ $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
+
+ push @groups, sprintf("Target=%s %d %d",
+ $target_id,
+ ( $origfeat->feature1->strand < 0 ?
+ ( $origfeat->feature1->end,
+ $origfeat->feature1->start) :
+ ( $origfeat->feature1->start,
+ $origfeat->feature1->end)
+ ));
+ }
+
+ # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
+ if ( $feat->can('name') && defined($feat->name) ) {
+ # such as might be for Bio::DB::SeqFeature
+ unshift @groups, 'Name=' . $feat->name;
+ }
+
+ my $gff_string = "";
+ if ($feat->location->isa("Bio::Location::SplitLocationI")) {
+ my @locs = $feat->location->each_Location;
+ foreach my $loc (@locs) {
+ $gff_string .= join("\t",
+ $name,
+ $feat->source_tag() || '.',
+ $feat->primary_tag(),
+ $loc->start(),
+ $loc->end(),
+ $score,
+ $strand,
+ $frame,
+ join(';', @groups)) . "\n";
+ }
+ chop $gff_string;
+ return $gff_string;
+ } else {
+ $gff_string = join("\t",
+ $name,
+ $feat->source_tag() || '.',
+ $feat->primary_tag(),
+ $feat->start(),
+ $feat->end(),
+ $score,
+ $strand,
+ $frame,
+ join(';', @groups));
+ }
+ return $gff_string;
+}
+
+=head2 gff_version
+
+ Title : _gff_version
+ Usage : $gffversion = $gffio->gff_version
+ Function:
+ Example :
+ Returns : The GFF version this parser will accept and emit.
+ Args : none
+
+=cut
+
+sub gff_version {
+ my ($self, $value) = @_;
+ if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
+ $self->{'GFF_VERSION'} = $value;
+ }
+ return $self->{'GFF_VERSION'};
+}
+
+# Make filehandles
+
+=head2 newFh
+
+ Title : newFh
+ Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
+ Function: does a new() followed by an fh()
+ Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
+ $feature = <$fh>; # read a feature object
+ print $fh $feature; # write a feature object
+ Returns : filehandle tied to the Bio::Tools::GFF class
+ Args :
+
+
+=cut
+
+sub newFh {
+ my $class = shift;
+ return unless my $self = $class->new(@_);
+ return $self->fh;
+}
+
+=head2 fh
+
+ Title : fh
+ Usage : $obj->fh
+ Function:
+ Example : $fh = $obj->fh; # make a tied filehandle
+ $feature = <$fh>; # read a feature object
+ print $fh $feature; # write a feature object
+ Returns : filehandle tied to Bio::Tools::GFF class
+ Args : none
+
+
+=cut
+
+
+sub fh {
+ my $self = shift;
+ my $class = ref($self) || $self;
+ my $s = Symbol::gensym;
+ tie $$s,$class,$self;
+ return $s;
+}
+
+# This accessor is used for accessing the Bio::Seq objects from a GFF3
+# file; if the file you are using has no sequence data you can ignore
+# this accessor
+
+# This accessor returns a hash reference containing Bio::Seq objects,
+# indexed by Bio::Seq->primary_id
+
+sub _seq_by_id_h {
+ my $self = shift;
+
+ return $self->{'_seq_by_id_h'} = shift if @_;
+ $self->{'_seq_by_id_h'} = {}
+ unless $self->{'_seq_by_id_h'};
+ return $self->{'_seq_by_id_h'};
+}
+
+=head2 get_seqs
+
+ Title : get_seqs
+ Usage :
+ Function: Returns all Bio::Seq objects populated by GFF3 file
+ Example :
+ Returns :
+ Args :
+
+=cut
+
+sub get_seqs {
+ my ($self,@args) = @_;
+ return values %{$self->_seq_by_id_h};
+}
+
+=head2 features_attached_to_seqs
+
+ Title : features_attached_to_seqs
+ Usage : $obj->features_attached_to_seqs(1);
+ Function: For use with GFF3 containg sequence only
+
+ Setting this B<before> parsing ensures that all Bio::Seq object
+ created will have the appropriate features added to them
+
+ defaults to false (off)
+
+ Note that this mode will incur higher memory usage because features
+ will have to be cached until the relevant feature comes along
+
+ Example :
+ Returns : value of features_attached_to_seqs (a boolean)
+ Args : on set, new value (a boolean, optional)
+
+
+=cut
+
+sub features_attached_to_seqs{
+ my $self = shift;
+
+ return $self->{'_features_attached_to_seqs'} = shift if @_;
+ return $self->{'_features_attached_to_seqs'};
+}
+
+=head2 ignore_sequence
+
+ Title : ignore_sequence
+ Usage : $obj->ignore_sequence(1);
+ Function: For use with GFF3 containg sequence only
+
+ Setting this B<before> parsing means that all sequence data will be
+ ignored
+
+ Example :
+ Returns : value of ignore_sequence (a boolean)
+ Args : on set, new value (a boolean, optional)
+
+=cut
+
+sub ignore_sequence{
+ my $self = shift;
+
+ return $self->{'_ignore_sequence'} = shift if @_;
+ return $self->{'_ignore_sequence'};
+}
+
+
+sub DESTROY {
+ my $self = shift;
+ $self->close();
+}
+
+sub TIEHANDLE {
+ my ($class,$val) = @_;
+ return bless {'gffio' => $val}, $class;
+}
+
+sub READLINE {
+ my $self = shift;
+ return $self->{'gffio'}->next_feature() unless wantarray;
+ my (@list, $obj);
+ push @list, $obj while $obj = $self->{'gffio'}->next_feature();
+ return @list;
+}
+
+sub PRINT {
+ my $self = shift;
+ $self->{'gffio'}->write_feature(@_);
+}
+
+1;
+
40 modules/LSF.pm
View
@@ -235,13 +235,18 @@ sub adjust_bsub_options
}
if (defined $mem) {
- # at some point an attempt to run this failed due to MEMLIMIT, using
- # $mem max memory; increase by 1000MB
- $mem += 1000;
+ # at some point an attempt to run this failed due to MEMLIMIT
+ $mem = calculate_memory_limit($mem);
- if ( !defined $mem_limit ) { $mem_limit = 15_900; }
- if ( $mem>$mem_limit ) {
- Utils::error(sprintf "FIXME: Increase memory_limit, more than %.1fGB of memory is required.", $mem_limit/1000);
+ if ($mem>500000) {
+ Utils::error("FIXME: This job cannot be run on the farm, more than 500GB of memory is required.");
+ }
+ elsif($mem>30000)
+ {
+ warn("$output_file: changing queue to hugemem\n");
+ $opts =~ s/-q normal/-q hugemem/;
+ $opts =~ s/-q long/-q hugemem/;
+ if ( !($opts=~/-q/) ) { $opts .= ' -q hugemem'; }
}
# adjust the command line to include higher memory reservation, but only
@@ -281,6 +286,29 @@ sub adjust_bsub_options
}
+=head2 calculate_memory_limit
+
+ Arg [1] : memory in mega bytes for previously failed job
+ Description : Increases the memory by a set minimum or by a percentage, which ever is greater
+ Returntype : Int
+
+=cut
+sub calculate_memory_limit
+{
+ my ($mem) = @_;
+ my $minimum_memory_increase = 1000;
+ my $memory_increase_percentage = 0.3;
+
+ my $updated_memory_limit = $mem*(1+$memory_increase_percentage);
+ if($updated_memory_limit < $mem + $minimum_memory_increase)
+ {
+ $updated_memory_limit = $mem + $minimum_memory_increase;
+ }
+
+ return int($updated_memory_limit);
+}
+
+
=head2 run
Arg [1] : lock file where to put the JID (opened in append mode)
94 modules/Pathogens/Import/CompressAndValidate.pm
View
@@ -0,0 +1,94 @@
+=head1 NAME
+
+=head1 SYNOPSIS
+
+use Pathogens::Import::CompressAndValidate;
+my $validator = Pathogens::Import::CompressAndValidate->new( irods_filename => $irods, fastq_filenames => \@fastqs );
+$validator->is_compressed_and_validated() || $die("Compress and validate failed\n");
+
+=head1 DESCRIPTION
+
+Compress fastq file, add md5 checksum files and check number of reads against iRODS.
+
+=cut
+
+package Pathogens::Import::CompressAndValidate;
+use Moose;
+use Utils;
+use VertRes::Wrapper::fastqcheck;
+use Pathogens::Import::ValidateFastqConversion;
+
+has 'irods_filename' => ( is => 'ro', isa => 'Str', required => 1);
+has 'fastq_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1);
+
+sub _compress_and_checksum
+{
+ my($self) = @_;
+
+ for my $fastq (@{$self->{fastq_filenames}})
+ {
+ # Compress fastq
+ Utils::CMD(qq[gzip -9 -c $fastq > $fastq.gz]);
+
+ # Checksum fastqs
+ Utils::CMD(qq[md5sum $fastq > $fastq.md5]);
+ Utils::CMD(qq[md5sum $fastq.gz > $fastq.gz.md5]);
+ }
+}
+
+sub _fastqcheck
+{
+ my($self) = @_;
+
+ for my $fastq (@{$self->{fastq_filenames}})
+ {
+ # Generate fastqcheckfile
+ my $fastqcheck = VertRes::Wrapper::fastqcheck->new();
+ $fastqcheck->run($fastq.'.gz', $fastq.'.gz.fastqcheck.tmp');
+
+ $fastqcheck->run_status >= 1 || return 0;
+ }
+ return 1;
+}
+
+sub _confirm_valid
+{
+ my($self) = @_;
+ my @fastqcheck_tmp = (); # fastqcheck.tmp files.
+
+ for my $fastq (@{$self->{fastq_filenames}})
+ {
+ push @fastqcheck_tmp, $fastq.'.gz.fastqcheck.tmp';
+ }
+
+ # Validate against iRODS
+ my $validate = Pathogens::Import::ValidateFastqConversion->new(
+ fastqcheck_filenames => \@fastqcheck_tmp,
+ irods_filename => $self->irods_filename
+ );
+ return $validate->is_total_reads_valid(); # Validation check
+}
+
+sub is_compressed_and_validated
+{
+ my($self) = @_;
+
+ # Compress and checksum
+ $self->_compress_and_checksum();
+
+ # Generate temp fastqcheck files
+ $self->_fastqcheck() || return 0;
+
+ # Check reads from fastqs match reads from iRODS.
+ $self->_confirm_valid() || return 0;
+
+ # Rename temp fastqcheck files if valid.
+ for my $fastq (@{$self->{fastq_filenames}})
+ {
+ Utils::CMD(qq[mv $fastq.gz.fastqcheck.tmp $fastq.gz.fastqcheck]);
+ }
+
+ return 1;
+}
+
+1;
70 modules/Pathogens/Import/ValidateFastqConversion.pm
View
@@ -0,0 +1,70 @@
+=head1 NAME
+
+ValidateFastqConversion.pm - Checks imported reads math reads from iRODS.
+
+=head1 SYNOPSIS
+
+use Pathogens::Import::ValidateFastqConversion;
+my $validate = Pathogens::Import::ValidateFastqConversion->new(
+ fastqcheck_filenames => ['1234_5_6_1.fastq.gz.fastqcheck','1234_5_6_2.fastq.gz.fastqcheck'],
+ irods_filename => '1234_5#6.bam'
+ );
+$validate->is_total_reads_valid();
+
+=cut
+package Pathogens::Import::ValidateFastqConversion;
+
+use Moose;
+use VertRes::Parser::fastqcheck;
+use VertRes::Wrapper::iRODS;
+
+has 'fastqcheck_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1);
+has 'irods_filename' => ( is => 'ro', isa => 'Str', required => 1);
+has '_fastq_totalreads' => ( is => 'ro', isa => 'ArrayRef', lazy_build => 1);
+
+sub _build__fastq_totalreads
+{
+ my ($self) = @_;
+ my @totalreads;
+ for my $filename (@{$self->fastqcheck_filenames})
+ {
+ my $parser = VertRes::Parser::fastqcheck->new(file => $filename);
+ my $readcount = $parser->num_sequences() || 0;
+ push @totalreads, $readcount;
+ }
+ return \@totalreads;
+}
+
+sub _sum_fastq_reads
+{
+ my ($self) = @_;
+ my $sum = 0;
+ for my $total_reads (@{$self->_fastq_totalreads})
+ {
+ $sum += $total_reads;
+ }
+ return $sum;
+}
+
+sub _sum_irods_reads
+{
+ my ($self) = @_;
+ my $irods = VertRes::Wrapper::iRODS->new();
+ my $ifile = $irods->find_file_by_name($self->irods_filename);
+ my $readcount = $irods->get_total_reads($ifile) || 0;
+ return $readcount;
+}
+
+sub is_total_reads_valid
+{
+ my ($self) = @_;
+ if($self->_sum_irods_reads == $self->_sum_fastq_reads)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+
+
+1;
146 modules/Pathogens/RNASeq/AlignmentSlice.pm
View
@@ -0,0 +1,146 @@
+=head1 NAME
+
+AlignmentSlice.pm - Extract a slice of reads for a sequence file within a specific region
+
+=head1 SYNOPSIS
+
+use Pathogens::RNASeq::AlignmentSlice;
+my $alignment_slice = Pathogens::RNASeq::AlignmentSlice->new(
+ filename => '/abc/my_file.bam',
+ window_margin => 10,
+ total_mapped_reads => 1234,
+ );
+ my %rpkm_values = $alignment_slice->rpkm_values;
+ $rpkm_values{rpkm_sense};
+ $rpkm_values{rpkm_antisense};
+ $rpkm_values{mapped_reads_sense};
+ $rpkm_values{mapped_reads_antisense};
+
+ $rpkm_values{mapped_reads_forward};
+ $rpkm_values{mapped_reads_reverse};
+
+=cut
+package Pathogens::RNASeq::AlignmentSlice;
+use Moose;
+use Pathogens::RNASeq::Exceptions;
+use Pathogens::RNASeq::Read;
+
+# required inputs
+has 'filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has 'feature' => ( is => 'rw', required => 1 );
+has 'total_mapped_reads' => ( is => 'rw', isa => 'Int', required => 1 );
+#optional input
+has 'samtools_exec' => ( is => 'rw', isa => 'Str', default => "samtools" );
+has 'protocol' => ( is => 'rw', isa => 'Str', default => 'StandardProtocol' );
+has 'window_margin' => ( is => 'rw', isa => 'Int', default => 50 );
+has 'filters' => ( is => 'rw', isa => 'Maybe[HashRef]' );
+has '_input_slice_filename' => ( is => 'rw', isa => 'Str'); # allow for testing and for using VR samtools view output file
+# output variable
+has 'rpkm_values' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
+
+# internal variables
+has '_slice_file_handle' => ( is => 'rw', lazy_build => 1 );
+has '_window_start' => ( is => 'rw', isa => 'Int', lazy_build => 1 );
+has '_window_end' => ( is => 'rw', isa => 'Int', lazy_build => 1 );
+has '_read_protocol_class' => ( is => 'rw', lazy_build => 1 );
+
+sub _build__window_start
+{
+ my ($self) = @_;
+ my $window_start = $self->feature->gene_start - $self->window_margin;
+ $window_start = $window_start < 1 ? 1 : $window_start;
+ return $window_start;
+}
+
+sub _build__window_end
+{
+ my ($self) = @_;
+ $self->feature->gene_end + $self->window_margin;
+}
+
+sub _build__slice_file_handle
+{
+ my ($self) = @_;
+ my $slice_file_handle;
+ open($slice_file_handle, $self->_slice_stream ) || Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice->throw( error => "Cant view slice for ".$self->filename." ".$self->_window_start." " .$self->_window_end );
+ return $slice_file_handle;
+}
+
+sub _slice_stream
+{
+ my ($self) = @_;
+ if($self->_input_slice_filename)
+ {
+ return $self->_input_slice_filename;
+ }
+ else
+ {
+ return $self->samtools_exec." view ".$self->filename." ".$self->feature->seq_id.":".$self->_window_start."-".$self->_window_end." |";
+ }
+}
+
+sub _build__read_protocol_class
+{
+ my ($self) = @_;
+ my $read_protocol_class = "Pathogens::RNASeq::".$self->protocol."::Read";
+ eval("use $read_protocol_class");
+ return $read_protocol_class;
+}
+
+
+sub _build_rpkm_values
+{
+ my ($self) = @_;
+ my %rpkm_values;
+
+ $rpkm_values{mapped_reads_sense} = 0;
+ $rpkm_values{mapped_reads_antisense} = 0;
+ $rpkm_values{mapped_reads_forward} = 0;
+ $rpkm_values{mapped_reads_reverse} = 0;
+
+ my $file_handle = $self->_slice_file_handle;
+
+ while(my $line = <$file_handle>)
+ {
+ my $sequence_reads = $self->_read_protocol_class->new(
+ alignment_line => $line,
+ exons => $self->feature->exons,
+ gene_strand => $self->feature->gene_strand,
+ filters => $self->filters
+ );
+ my $mapped_reads = $sequence_reads->mapped_reads;
+
+ $rpkm_values{mapped_reads_sense} += $mapped_reads->{sense};
+ $rpkm_values{mapped_reads_antisense} += $mapped_reads->{antisense};
+
+ if($sequence_reads->read_strand == 0)
+ {
+ $rpkm_values{mapped_reads_forward} += $mapped_reads->{sense};
+ $rpkm_values{mapped_reads_reverse} += $mapped_reads->{antisense};
+ }
+ else
+ {
+ $rpkm_values{mapped_reads_forward} += $mapped_reads->{antisense};
+ $rpkm_values{mapped_reads_reverse} += $mapped_reads->{sense};
+ }
+ }
+
+ $rpkm_values{rpkm_sense} = $self->_calculate_rpkm($rpkm_values{mapped_reads_sense});
+ $rpkm_values{rpkm_antisense} = $self->_calculate_rpkm($rpkm_values{mapped_reads_antisense});
+
+ return \%rpkm_values;
+}
+
+sub _calculate_rpkm
+{
+ my ($self, $mapped_reads) = @_;
+ #my $rpkm = $mapped_reads / ( ($self->feature->exon_length/1000) * ($self->total_mapped_reads/1000000) );
+ # same equation, rewritten
+ my $rpkm = ($mapped_reads / $self->feature->exon_length) * (1000000000/$self->total_mapped_reads);
+
+
+ return $rpkm;
+}
+
+
+1;
47 modules/Pathogens/RNASeq/BAMStats.pm
View
@@ -0,0 +1,47 @@
+=head1 NAME
+
+BAMStats.pm - Functionality for Stats files for a BAM
+
+=head1 SYNOPSIS
+
+use Pathogens::RNASeq::BAM;
+my $bam_container = Pathogens::RNASeq::BAM->new(
+ filename => '/abc/my_file.bam'
+ );
+
+=cut
+package Pathogens::RNASeq::BAMStats;
+use Moose;
+use VertRes::Parser::bas;
+use VertRes::Utils::Sam;
+use Time::Format;
+
+has 'total_mapped_reads' => ( is => 'rw', isa => 'Str', lazy_build => 1 );
+has '_parser' => ( is => 'rw', isa => 'VertRes::Parser::bas', lazy_build => 1 );
+
+sub _build__parser
+{
+ my ($self) = @_;
+ unless(-e $self->filename.'.bas')
+ {
+ $self->_create_stats_files();
+ }
+
+ VertRes::Parser::bas->new(file => $self->filename.'.bas');
+}
+
+
+sub _build_total_mapped_reads
+{
+ my ($self) = @_;
+ $self->_parser->mapped_reads;
+}
+
+
+sub _create_stats_files
+{
+ my ($self) = @_;
+ my $sam = VertRes::Utils::Sam->new();
+ $sam->stats("$time{'yyyymmdd'}", $self->filename);
+}
+1;
113 modules/Pathogens/RNASeq/BitWise.pm
View
@@ -0,0 +1,113 @@
+=head1 NAME
+
+BitWise.pm - Manipulate the bitwise flags in a BAM file
+
+=head1 SYNOPSIS
+
+use Pathogens::RNASeq::BitWise;
+my $bitwise = Pathogens::RNASeq::BitWise->new(
+ filename => '/abc/my_file.bam',
+ output_filename => 'output_file.bam'
+ );
+$bitwise->update_bitwise_flags;
+
+=cut
+package Pathogens::RNASeq::BitWise;
+use Moose;
+use Pathogens::RNASeq::Exceptions;
+use VertRes::Wrapper::samtools;
+
+has 'filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has 'output_filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has 'protocol' => ( is => 'rw', isa => 'Str', default => 'StandardProtocol' );
+has 'samtools_exec' => ( is => 'rw', isa => 'Str', default => "samtools" );
+
+has '_input_sequence_data_filename' => ( is => 'rw', isa => 'Str'); # allow for testing
+has '_sequence_data_file_handle' => ( is => 'rw', lazy_build => 1 );
+has '_read_protocol_class' => ( is => 'rw', lazy_build => 1 );
+has '_output_file_handle' => ( is => 'rw', lazy_build => 1 );
+
+
+sub update_bitwise_flags
+{
+ my ($self) = @_;
+ my $file_handle = $self->_sequence_data_file_handle;
+
+ while(my $line = <$file_handle>)
+ {
+ if($line =~ /^@/)
+ {
+ print {$self->_output_file_handle} $line;
+ }
+ else
+ {
+ if($line =~ /^([^\t]+\t)([\d]+)(\t.+)$/ )
+ {
+ my $start_of_line = $1;
+ my $flag = $2;
+ my $end_of_line = $3;
+
+ $flag = $self->_read_protocol_class->_calculate_bitwise_flag($flag);
+ print {$self->_output_file_handle} $start_of_line.$flag.$end_of_line."\n" ;
+ }
+ else
+ {
+ print {$self->_output_file_handle} $line;
+ }
+
+ }
+
+ }
+ close($self->_output_file_handle);
+ $self->_index_output_file;
+ return 1;
+}
+
+sub _index_output_file
+{
+ my ($self) = @_;
+ my $samtools = VertRes::Wrapper::samtools->new();
+ $samtools->index($self->output_filename, $self->output_filename.".bai");
+ return 1;
+}
+
+
+sub _build__output_file_handle
+{
+ my ($self) = @_;
+ my $output_file_handle;
+ open($output_file_handle, '|- ', $self->samtools_exec." view -bS - > ". $self->output_filename) || Pathogens::RNASeq::Exceptions::FailedToCreateNewBAM->throw(error => "Couldnt open output file for writing ". $self->output_filename);
+ return $output_file_handle;
+}
+
+sub _build__read_protocol_class
+{
+ my ($self) = @_;
+ my $read_protocol_class = "Pathogens::RNASeq::".$self->protocol."::Read";
+ eval("use $read_protocol_class");
+ return $read_protocol_class;
+}
+
+sub _build__sequence_data_file_handle
+{
+ my ($self) = @_;
+ my $sequence_data_file_handle;
+ open($sequence_data_file_handle, $self->_sequence_data_stream ) || Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice->throw( error => "Cant view ".$self->filename."" );
+ return $sequence_data_file_handle;
+}
+
+sub _sequence_data_stream
+{
+ my ($self) = @_;
+ if($self->_input_sequence_data_filename)
+ {
+ return $self->_input_sequence_data_filename;
+ }
+ else
+ {
+ return $self->samtools_exec." view -h ".$self->filename." |";
+ }
+}
+
+1;
+
165 modules/Pathogens/RNASeq/CoveragePlot.pm
View
@@ -0,0 +1,165 @@
+=head1 NAME
+
+CoveragePlot.pm - Take in a sequencing file, do a pileup and generate a dot plot of the number of reads at each position.Assumes the BAM is sorted.
+
+=head1 SYNOPSIS
+
+use Pathogens::RNASeq::CoveragePlot;
+my $coverage_plots_from_bam = Pathogens::RNASeq::CoveragePlot->new(
+ filename => 'my_file.bam',
+ output_base_filename => 'my_output_file',
+ mpileup_cmd => "samtools mpileup "
+ );
+$coverage_plots_from_bam->create_plots();
+
+
+=cut
+package Pathogens::RNASeq::CoveragePlot;
+use Moose;
+use VertRes::Parser::bam;
+
+has 'filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has 'output_base_filename' => ( is => 'rw', isa => 'Str', required => 1 );
+# optional inputs
+has 'mpileup_cmd' => ( is => 'rw', isa => 'Str', default => "samtools mpileup -d 1000" );
+
+has '_input_file_handle' => ( is => 'rw', lazy_build => 1 );
+has '_output_file_handles' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
+has '_sequence_names' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
+has '_sequence_base_counters' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
+has '_sequence_information' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
+
+sub _build__sequence_information
+{
+ my ($self) = @_;
+ my %all_sequences_info = VertRes::Parser::bam->new( file => $self->filename )->sequence_info();
+ return \%all_sequences_info;
+}
+
+sub _build__sequence_names
+{
+ my ($self) = @_;
+ my @sequence_names = keys %{$self->_sequence_information} ;
+ return \@sequence_names;
+}
+
+sub _build__sequence_base_counters
+{
+ my ($self) = @_;
+ my %sequence_base_counters;
+ for my $sequence_name (@{$self->_sequence_names} )
+ {
+ $sequence_base_counters{$sequence_name} = 0;
+ }
+ return \%sequence_base_counters;
+}
+
+sub _build__output_file_handles
+{
+ my ($self) = @_;
+ my %output_file_handles;
+ for my $sequence_name (@{$self->_sequence_names} )
+ {
+ open($output_file_handles{$sequence_name}, '|-', " gzip >". $self->output_base_filename.".$sequence_name.coverageplot.gz") || Pathogens::RNASeq::Exceptions::FailedToCreateOutputFileHandle->throw(error => "Couldnt create output file handle for saving coverage plot results for ". $sequence_name. " in ". $self->filename. " and output base ".$self->output_base_filename);
+ }
+
+ return \%output_file_handles;
+}
+
+sub _build__input_file_handle
+{
+ my ($self) = @_;
+ my $input_file_handle;
+ # TODO remove direct call to samtools and allow for filtering options
+ # this only extracts the sequence name, base position, bases.
+ open($input_file_handle, '-|', $self->mpileup_cmd." ". $self->filename.' | awk \'{print $1"\t"$2"\t"$5}\'') || Pathogens::RNASeq::Exceptions::FailedToCreateMpileup->throw(error => "Failed to create mpileup for ".$self->filename );
+ return $input_file_handle;
+}
+
+sub _number_of_forward_reads
+{
+ my ($self, $read_string) = @_;
+ return $self->_number_of_reads("[ACGTN]", $read_string);
+}
+
+sub _number_of_reverse_reads
+{
+ my ($self, $read_string) = @_;
+ return $self->_number_of_reads("[acgtn]", $read_string);
+}
+
+sub _number_of_reads
+{
+ my ($self,$base_regex, $read_string) = @_;
+ $_ = $read_string;
+ my $base_count = s/$base_regex//g;
+ $base_count = ($base_count eq "") ? 0 : $base_count;
+
+ while ($read_string =~ /[\+-]([\d]+)$base_regex/g)
+ {
+ $base_count -= $1;
+ }
+ $base_count = ($base_count < 0) ? 0 : $base_count;
+ return $base_count;
+}
+
+# work out if padding is needed and return it as a formatted string
+sub _create_padding_string
+{
+ my ($self,$previous_counter, $current_counter) = @_;
+ my $padding_string = "";
+ for(my $i = $previous_counter+1 ; $i < $current_counter; $i++)
+ {
+ $padding_string .= "0 0\n";
+ }
+ return $padding_string;
+}
+
+sub _print_padding_at_end_of_sequence
+{
+ my ($self) = @_;
+ for my $sequence_name (@{$self->_sequence_names} )
+ {
+ my $sequence_length = $self->_sequence_information->{$sequence_name}->{'LN'};
+ next unless($sequence_length =~ /^[\d]+$/);
+ $sequence_length++;
+ my $padding_string = $self->_create_padding_string($self->_sequence_base_counters->{$sequence_name}, $sequence_length);
+ $self->_sequence_base_counters->{$sequence_name} = $sequence_length;
+ print { $self->_output_file_handles->{$sequence_name} } $padding_string;
+ }
+}
+
+sub _close_output_file_handles
+{
+ my ($self) = @_;
+ for my $output_file_handle (values %{$self->_output_file_handles} )
+ {
+ close($output_file_handle);
+ }
+ return;
+}
+
+sub create_plots
+{
+ my ($self) = @_;
+ my $input_file_handle = $self->_input_file_handle;
+
+ while(my $line = <$input_file_handle>)
+ {
+ my($sequence_name, $base_position, $read_string) = split(/\t/, $line);
+ my $padding_string = $self->_create_padding_string($self->_sequence_base_counters->{$sequence_name},$base_position);
+
+ $self->_sequence_base_counters->{$sequence_name} = $base_position;
+ my $forward_reads = $self->_number_of_forward_reads($read_string);
+ my $reverse_reads = $self->_number_of_reverse_reads($read_string);
+
+ print { $self->_output_file_handles->{$sequence_name} } $padding_string.$forward_reads." ".$reverse_reads."\n";
+ }
+ $self->_print_padding_at_end_of_sequence;
+ $self->_close_output_file_handles;
+ return 1;
+}
+
+
+
+1;
12 modules/Pathogens/RNASeq/Exceptions.pm
View
@@ -0,0 +1,12 @@
+package Pathogens::RNASeq::Exceptions;
+
+use Exception::Class (
+ Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice => { description => 'Couldnt get reads from alignment slice. Error with Samtools or BAM' },
+ Pathogens::RNASeq::Exceptions::FailedToOpenExpressionResultsSpreadsheetForWriting => { description => 'Couldnt write out the results for expression' },
+ Pathogens::RNASeq::Exceptions::InvalidInputFiles => { description => 'Invalid inputs, sequence names or lengths are incorrect' },
+ Pathogens::RNASeq::Exceptions::FailedToCreateNewBAM => { description => 'Couldnt create a new bam file' },
+ Pathogens::RNASeq::Exceptions::FailedToCreateMpileup => { description => 'Couldnt create an mpileup' },
+ Pathogens::RNASeq::Exceptions::FailedToOpenFeaturesTabFileForWriting => { description => 'Couldnt write tab file' },
+);
+
+1;
180 modules/Pathogens/RNASeq/Expression.pm
View
@@ -0,0 +1,180 @@
+=head1 NAME
+
+Expression.pm - Find the expression when given an input aligned file and an annotation file
+
+=head1 SYNOPSIS
+
+use Pathogens::RNASeq::Expression;
+my $expression_results = Pathogens::RNASeq::Expression->new(
+ sequence_filename => 'my_aligned_sequence.bam',
+ annotation_filename => 'my_annotation_file.gff',
+ output_base_filename => 'my_alignement_basename'
+ );
+
+$expression_results->output_spreadsheet();
+
+=cut
+package Pathogens::RNASeq::Expression;
+use Moose;
+use Pathogens::RNASeq::SequenceFile;
+use Pathogens::RNASeq::GFF;
+use Pathogens::RNASeq::AlignmentSlice;
+use Pathogens::RNASeq::ExpressionStatsSpreadsheet;
+use Pathogens::RNASeq::ValidateInputs;
+use Pathogens::RNASeq::Exceptions;
+use Pathogens::RNASeq::BitWise;
+use Pathogens::RNASeq::IntergenicRegions;
+use Pathogens::RNASeq::FeaturesTabFile;
+
+has 'sequence_filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has 'annotation_filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has 'output_base_filename' => ( is => 'rw', isa => 'Str', required => 1 );
+
+#optional input parameters
+has 'filters' => ( is => 'rw', isa => 'Maybe[HashRef]' );
+has 'protocol' => ( is => 'rw', isa => 'Str', default => 'StandardProtocol' );
+has 'samtools_exec' => ( is => 'rw', isa => 'Str', default => "samtools" );
+has 'window_margin' => ( is => 'rw', isa => 'Int', default => 50 );
+has 'intergenic_regions' => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'minimum_intergenic_size' => ( is => 'rw', isa => 'Int', default => 10 );
+
+has '_sequence_file' => ( is => 'rw', isa => 'Pathogens::RNASeq::SequenceFile', lazy_build => 1 );
+has '_annotation_file' => ( is => 'rw', isa => 'Pathogens::RNASeq::GFF', lazy_build => 1 );
+has '_results_spreadsheet' => ( is => 'rw', isa => 'Pathogens::RNASeq::ExpressionStatsSpreadsheet', lazy_build => 1 );
+has '_expression_results' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
+has '_alignment_slice_protocol_class' => ( is => 'rw', lazy_build => 1 );
+
+sub _build__sequence_file
+{
+ my ($self) = @_;
+
+ my $validator = Pathogens::RNASeq::ValidateInputs->new( sequence_filename => $self->sequence_filename, annotation_filename => $self->annotation_filename);
+ if($validator->are_input_files_valid() == 0)
+ {
+ Pathogens::RNASeq::Exceptions::FailedToOpenAlignmentSlice->throw( error => "Input files invalid: ".$self->sequence_filename." ".$self->annotation_filename."\n" );
+ }
+
+ Pathogens::RNASeq::SequenceFile->new(filename => $self->sequence_filename);
+}
+
+sub _build__annotation_file
+{
+ my ($self) = @_;
+ Pathogens::RNASeq::GFF->new( filename => $self->annotation_filename);
+}
+
+sub _build__results_spreadsheet
+{
+ my ($self) = @_;
+ Pathogens::RNASeq::ExpressionStatsSpreadsheet->new( output_filename => $self->output_base_filename.".expression.csv", protocol => $self->protocol);
+}
+
+sub _corrected_sequence_filename
+{
+ my ($self) = @_;
+ return $self->output_base_filename.".corrected.bam";
+}
+
+sub _build__expression_results
+{
+ my ($self) = @_;
+ my $total_mapped_reads = $self->_sequence_file->total_mapped_reads;
+
+ Pathogens::RNASeq::BitWise->new(
+ filename => $self->sequence_filename,
+ output_filename => $self->_corrected_sequence_filename,
+ protocol => $self->protocol,
+ samtools_exec => $self->samtools_exec
+ )->update_bitwise_flags();
+
+ my @expression_results = ();
+
+ for my $feature_id (keys %{$self->_annotation_file->features})
+ {
+ my $alignment_slice = $self->_alignment_slice_protocol_class->new(
+ filename => $self->_corrected_sequence_filename,
+ total_mapped_reads => $total_mapped_reads,
+ feature => $self->_annotation_file->features->{$feature_id},
+ filters => $self->filters,
+ protocol => $self->protocol,
+ samtools_exec => $self->samtools_exec,
+ window_margin => $self->window_margin
+ );
+ my $alignment_slice_results = $alignment_slice->rpkm_values;
+
+ $alignment_slice_results->{gene_id} = $feature_id;
+ $alignment_slice_results->{seq_id} = $self->_annotation_file->features->{$feature_id}->seq_id;
+ push(@expression_results, $alignment_slice_results);
+ }
+
+ if(defined($self->intergenic_regions) && $self->intergenic_regions == 1)
+ {
+ $self->_calculate_values_for_intergenic_regions(\@expression_results,$total_mapped_reads );
+ }
+
+ return \@expression_results;
+}
+
+sub _calculate_values_for_intergenic_regions
+{
+ my ($self, $expression_results, $total_mapped_reads) = @_;
+ # get intergenic regions
+ my $intergenic_regions = Pathogens::RNASeq::IntergenicRegions->new(
+ features => $self->_annotation_file->features,
+ window_margin => $self->window_margin,
+ minimum_size => $self->minimum_intergenic_size,
+ sequence_lengths => $self->_annotation_file->sequence_lengths
+ );
+
+ # print out the features into a tab file for loading into Artemis
+ my $tab_file_results = Pathogens::RNASeq::FeaturesTabFile->new(
+ output_filename => $self->_corrected_sequence_filename.".intergenic",
+ features => $intergenic_regions->intergenic_features,
+ sequence_names => $intergenic_regions->sequence_names
+ );
+ $tab_file_results->create_files;
+
+ for my $feature_id (keys %{$intergenic_regions->intergenic_features})
+ {
+ my $alignment_slice = $self->_alignment_slice_protocol_class->new(
+ filename => $self->_corrected_sequence_filename,
+ total_mapped_reads => $total_mapped_reads,
+ feature => $intergenic_regions->intergenic_features->{$feature_id},
+ filters => $self->filters,
+ protocol => $self->protocol,
+ samtools_exec => $self->samtools_exec,
+ window_margin => 0,
+ );
+ my $alignment_slice_results = $alignment_slice->rpkm_values;
+
+ $alignment_slice_results->{gene_id} = $feature_id;
+ $alignment_slice_results->{seq_id} = $intergenic_regions->intergenic_features->{$feature_id}->seq_id;
+ push(@{$expression_results}, $alignment_slice_results);
+ }
+
+ return $expression_results;
+}
+
+
+sub _build__alignment_slice_protocol_class
+{
+ my ($self) = @_;
+ my $alignment_slice_protocol_class = "Pathogens::RNASeq::".$self->protocol."::AlignmentSlice";
+ eval("use $alignment_slice_protocol_class");
+ return $alignment_slice_protocol_class;
+}
+
+sub output_spreadsheet
+{
+ my ($self) = @_;
+ for my $expression_result (@{$self->_expression_results})
+ {
+ $self->_results_spreadsheet->add_result($expression_result);
+ }
+ $self->_results_spreadsheet->build_and_close();
+ return 1;
+}
+
+
+1;
+
103 modules/Pathogens/RNASeq/ExpressionStatsSpreadsheet.pm
View
@@ -0,0 +1,103 @@
+=head1 NAME
+
+ExpressionStatsSpreadsheet.pm - Builds a spreadsheet of expression results
+
+=head1 SYNOPSIS
+
+use Pathogens::RNASeq::ExpressionStatsSpreadsheet;
+my $expression_results = Pathogens::RNASeq::ExpressionStatsSpreadsheet->new(
+ output_filename => '/abc/my_results.csv',
+ );
+$expression_results->add_result($my_rpkm_values);
+$expression_results->add_result($more_rpkm_values);
+$expression_results->build_and_close();
+
+
+=cut
+package Pathogens::RNASeq::ExpressionStatsSpreadsheet;
+use Moose;
+use Text::CSV;
+
+
+has 'output_filename' => ( is => 'rw', isa => 'Str', required => 1 );
+has '_output_file_handle' => ( is => 'rw', lazy_build => 1 );
+has '_results' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
+has 'protocol' => ( is => 'rw', isa => 'Str', default => 'StandardProtocol' );
+
+sub _build__output_file_handle
+{
+ my ($self) = @_;
+ my $output_file_handle;
+ open($output_file_handle, ">:encoding(utf8)", $self->output_filename ) || Pathogens::RNASeq::Exceptions::FailedToOpenExpressionResultsSpreadsheetForWriting->throw( error => "Cant open ".$self->output_filename." for writing");
+ return $output_file_handle;
+}
+
+sub _build__results
+{
+ my ($self) = @_;
+ my @results = () ;
+ return \@results;
+}
+
+sub _result_rows
+{
+ my ($self) = @_;
+ my @denormalised_results;
+ for my $result_set (@{$self->_results})
+ {
+ push(@denormalised_results,
+ [
+ $result_set->{seq_id},
+ $result_set->{gene_id},
+ $result_set->{mapped_reads_sense},
+ $result_set->{rpkm_sense},
+ $result_set->{mapped_reads_antisense},
+ $result_set->{rpkm_antisense}
+ ]);
+ }
+ return \@denormalised_results;
+}
+
+sub _header
+{
+ my ($self) = @_;
+ my @header;
+ if($self->protocol eq "StrandSpecificProtocol")
+ {
+ @header = ["Seq ID","GeneID","Reads Mapping", "RPKM", "Antisense Reads Mapping", "Antisense RPKM"];
+ }
+ else
+ {
+ @header = ["Seq ID","GeneID", "Antisense Reads Mapping", "Antisense RPKM","Reads Mapping", "RPKM"];
+ }
+ return \@header;
+}