diff --git a/PerlLib/Exons_to_geneobj.pm b/PerlLib/Exons_to_geneobj.pm index 2025b16..c026b9b 100755 --- a/PerlLib/Exons_to_geneobj.pm +++ b/PerlLib/Exons_to_geneobj.pm @@ -27,9 +27,13 @@ sub create_gene_obj { ## exons_ref should be end5's keyed to end3's for all exons. my ($gene_struct_mod, $cdna_seq) = &get_cdna_seq ($exons_href, $sequence_ref); + + my $cdna_seq_length = length $cdna_seq; my $long_orf_obj = new Longest_orf(); + #print STDERR "CDNA_SEQ: [$cdna_seq], length: $cdna_seq_length\n"; + # establish long orf finding parameters. $long_orf_obj->forward_strand_only(); if ($partial_info_href->{"5prime"}) { @@ -43,24 +47,28 @@ sub create_gene_obj { $long_orf_obj->get_longest_orf($cdna_seq); my ($end5, $end3) = $long_orf_obj->get_end5_end3(); - print "CDS: $end5, $end3\n" if $SEE; + #print STDERR "*** CDS: $end5, $end3\n";# if $SEE; my $gene_obj = &create_gene ($gene_struct_mod, $end5, $end3); + #print STDERR $gene_obj->toString(); + $gene_obj->create_all_sequence_types($sequence_ref); my $protein = $gene_obj->get_protein_sequence(); - + my $recons_cds = $gene_obj->get_CDS_sequence(); + #print STDERR "reconsCDS: $recons_cds\n"; + ## check partiality if ($protein) { # it is possible that we won't have any cds structure if ($protein !~ /^M/) { # this would require that we allowed for 5prime partials unless ($partial_info_href->{"5prime"}) { - confess "Error, have 5' partial protein when 5prime partials weren't allowed!\n$protein\n"; + confess "Error, have 5' partial protein when 5prime partials weren't allowed!\n$protein\n$cdna_seq\n"; } } if ($protein !~ /\*$/) { # this would require that we allowed for 3prime partials unless ($partial_info_href->{"3prime"}) { - confess "Error, have 3' partial protein when 3prime partials weren't allowed!\n$protein\n"; + confess "Error, have 3' partial protein when 3prime partials weren't allowed!\n$protein\n$cdna_seq\n"; } } @@ -80,6 +88,9 @@ sub create_gene_obj { #### sub get_cdna_seq { my ($gene_struct, $assembly_seq_ref) = @_; + + my $seq_length = length($$assembly_seq_ref); + my (@end5s) = sort {$a<=>$b} keys %$gene_struct; my $strand = "?"; foreach my $end5 (@end5s) { @@ -90,7 +101,7 @@ sub get_cdna_seq { } if ($strand eq "?") { print Dumper ($gene_struct); - die "ERROR: I can't determine what orientation the cDNA is in!\n"; + confess "ERROR: I can't determine what orientation the cDNA is in!\n"; } print NOTES "strand: $strand\n"; my $cdna_seq; @@ -99,6 +110,11 @@ sub get_cdna_seq { foreach my $end5 (@end5s) { #print $end5; my $end3 = $gene_struct->{$end5}; + + if ($end5 > $seq_length || $end3 > $seq_length) { + confess "Error, coords are out of bounds of sequence length: $seq_length:\n" . Dumper(\$gene_struct); + } + my ($coord1, $coord2) = sort {$a<=>$b} ($end5, $end3); my $exon_seq = substr ($$assembly_seq_ref, $coord1 - 1, ($coord2 - $coord1 + 1)); $cdna_seq .= $exon_seq; @@ -114,8 +130,12 @@ sub get_cdna_seq { #### sub create_gene { my ($gene_struct_mod, $cds_pointer_lend, $cds_pointer_rend) = @_; + + #use Data::Dumper; + #print STDERR Dumper($gene_struct_mod) . "CDS: $cds_pointer_lend, $cds_pointer_rend\n"; + my $strand = $gene_struct_mod->{strand}; - my @exons = @{$gene_struct_mod->{exons}}; + my @exons = sort {$a->[0]<=>$b->[0]} @{$gene_struct_mod->{exons}}; if ($strand eq '-') { @exons = reverse (@exons); } @@ -123,12 +143,13 @@ sub create_gene { my $mRNA_pointer_rend = 0; my $gene_obj = new Gene_obj(); foreach my $coordset_ref (@exons) { - my ($coord1, $coord2) = @$coordset_ref; + my ($coord1, $coord2) = sort {$a<=>$b} @$coordset_ref; my ($end5, $end3) = ($strand eq '+') ? ($coord1, $coord2) : ($coord2, $coord1); my $exon_obj = new mRNA_exon_obj($end5, $end3); my $exon_length = ($coord2 - $coord1 + 1); $mRNA_pointer_rend = $mRNA_pointer_lend + $exon_length - 1; ## see if cds is within current cDNA range. + #print STDERR "mRNA coords: $mRNA_pointer_lend-$mRNA_pointer_rend\n"; if ( $cds_pointer_rend >= $mRNA_pointer_lend && $cds_pointer_lend <= $mRNA_pointer_rend) { #overlap my $diff = $cds_pointer_lend - $mRNA_pointer_lend; my $delta_lend = ($diff >0) ? $diff : 0; diff --git a/PerlLib/Fastq_reader.pm b/PerlLib/Fastq_reader.pm index c2011ee..e6e9b02 100755 --- a/PerlLib/Fastq_reader.pm +++ b/PerlLib/Fastq_reader.pm @@ -3,8 +3,6 @@ package Fastq_reader; use strict; use warnings; -use PerlIO::gzip; - sub new { my ($packagename, $fastqFile) = @_; @@ -24,7 +22,11 @@ sub new { } else { if ( $fastqFile =~ /\.gz$/ ) { - open ($filehandle, "<:gzip", $fastqFile) or die "Error: Couldn't open compressed $fastqFile\n"; + open ($filehandle, "gunzip -c $fastqFile | ") or die "Error: Couldn't open compressed $fastqFile\n"; + } + elsif ($fastqFile =~ /\.bz2$/) { + open ($filehandle, "bunzip2 -c $fastqFile | ") or die "Error, couldn't open compressed $fastqFile $!"; + } else { open ($filehandle, $fastqFile) or die "Error: Couldn't open $fastqFile\n"; } diff --git a/PerlLib/GFF3_alignment_utils.pm b/PerlLib/GFF3_alignment_utils.pm index 4bf321c..35f6755 100755 --- a/PerlLib/GFF3_alignment_utils.pm +++ b/PerlLib/GFF3_alignment_utils.pm @@ -1,3 +1,5 @@ +#!/usr/bin/env perl + package GFF3_alignment_utils; use strict; @@ -5,10 +7,14 @@ use warnings; use Carp; use Gene_obj; +use Gene_obj_indexer; use CDNA::Alignment_segment; use CDNA::CDNA_alignment; +use File::Basename; + +__run_test() unless caller; -sub index_GFF3_alignment_objs { +sub index_alignment_objs { my ($gff3_alignment_file, $genome_alignment_indexer_href) = @_; unless ($gff3_alignment_file && -s $gff3_alignment_file) { @@ -17,10 +23,11 @@ sub index_GFF3_alignment_objs { unless (ref $genome_alignment_indexer_href) { confess "Error, need genome indexer href as param "; } - - + my %genome_trans_to_alignment_segments; + my %trans_to_gene_id; + open (my $fh, $gff3_alignment_file) or die "Error, cannot open file $gff3_alignment_file"; while (<$fh>) { @@ -69,17 +76,19 @@ sub index_GFF3_alignment_objs { my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend); - $info =~ /Target=\S+ (\d+) (\d+) ([\+\-])/ or die "Error, cannot extract match coordinates from info: $info"; + $info =~ /Target=\S+ (\d+) (\d+)/ or die "Error, cannot extract match coordinates from info: $info"; my $cdna_seg_lend = $1; my $cdna_seg_rend = $2; - my $cdna_orient = $3; # always set to + in pasa + + ($cdna_seg_lend, $cdna_seg_rend) = sort {$a<=>$b} ($cdna_seg_lend, $cdna_seg_rend); # always + orient for transcript coords. + my $alignment_segment = new CDNA::Alignment_segment($end5, $end3, $cdna_seg_lend, $cdna_seg_rend, $per_id); - push (@{$genome_trans_to_alignment_segments{$scaff}->{$gene_id}}, $alignment_segment); + push (@{$genome_trans_to_alignment_segments{$scaff}->{$trans_id}}, $alignment_segment); - + $trans_to_gene_id{$trans_id} = $gene_id; } @@ -109,8 +118,21 @@ sub index_GFF3_alignment_objs { $cdna_alignment_obj->set_acc($alignment_acc); $cdna_alignment_obj->{genome_acc} = $scaff; - $genome_alignment_indexer_href->{$alignment_acc} = $cdna_alignment_obj; + my $gene_id = $trans_to_gene_id{$alignment_acc} or confess "Error no gene_id for acc: $alignment_acc"; + + $cdna_alignment_obj->{gene_id} = $gene_id; + + + $cdna_alignment_obj->{source} = basename($gff3_alignment_file); + if (ref $genome_alignment_indexer_href eq "Gene_obj_indexer") { + $genome_alignment_indexer_href->store_gene($alignment_acc, $cdna_alignment_obj); + + } + else { + + $genome_alignment_indexer_href->{$alignment_acc} = $cdna_alignment_obj; + } push (@{$scaff_to_align_list{$scaff}}, $alignment_acc); } } @@ -118,5 +140,47 @@ sub index_GFF3_alignment_objs { return(%scaff_to_align_list); } + + +################# +## Testing +################# + + +sub __run_test { + + my $usage = "usage: $0 file.alignment.gff3\n\n"; + + my $gff3_file = $ARGV[0] or die $usage; + + my $indexer = {}; + my %scaff_to_alignments = &index_alignment_objs($gff3_file, $indexer); + + + foreach my $scaffold (keys %scaff_to_alignments) { + + my @align_ids = @{$scaff_to_alignments{$scaffold}}; + + foreach my $align_id (@align_ids) { + my $cdna_obj = $indexer->{$align_id}; + + print $cdna_obj->toString(); + } + } + + + + exit(0); + + + +} + + + + + + + 1; #EOM diff --git a/PerlLib/GFF3_utils.pm b/PerlLib/GFF3_utils.pm index aa77d12..d3d4b60 100644 --- a/PerlLib/GFF3_utils.pm +++ b/PerlLib/GFF3_utils.pm @@ -66,7 +66,7 @@ sub index_GFF3_gene_objs { unless ($feat_type) { die "Error, $_, no feat_type: line\[$_\]"; } - unless ($feat_type =~ /^(gene|mRNA|CDS|exon)$/) { next;} + unless ($feat_type =~ /^(gene|mRNA|transcript|CDS|exon)$/) { next;} $gene_info = uri_unescape($gene_info); @@ -107,7 +107,7 @@ sub index_GFF3_gene_objs { # print "id: $id, parent: $parent\n"; - if ($feat_type eq 'mRNA') { + if ($feat_type eq 'mRNA' || $feat_type eq 'transcript') { ## just get the identifier info $transcript_to_gene{$id} = $parent; next; diff --git a/PerlLib/GTF_alignment_utils.pm b/PerlLib/GTF_alignment_utils.pm new file mode 100755 index 0000000..5985174 --- /dev/null +++ b/PerlLib/GTF_alignment_utils.pm @@ -0,0 +1,187 @@ +#!/usr/bin/env perl + +package GTF_alignment_utils; + +use strict; +use warnings; +use Carp; + +use Gene_obj; +use Gene_obj_indexer; +use CDNA::Alignment_segment; +use CDNA::CDNA_alignment; +use File::Basename; + +__run_test() unless caller; + + + +sub index_alignment_objs { + my ($gtf_alignment_file, $genome_alignment_indexer_href) = @_; + + unless ($gtf_alignment_file && -s $gtf_alignment_file) { + confess "Error, cannot find or open file $gtf_alignment_file"; + } + unless (ref $genome_alignment_indexer_href) { + confess "Error, need genome indexer href as param "; + } + + + + my %genome_trans_to_alignment_segments; + + my %trans_to_gene_id; + + open (my $fh, $gtf_alignment_file) or die "Error, cannot open file $gtf_alignment_file"; + while (<$fh>) { + chomp; + if (/^\#/) { next; } + unless (/\w/) { next; } + + my @x = split(/\t/); + + unless (scalar (@x) >= 8 && $x[8] =~ /transcript_id/) { + print STDERR "ignoring line: $_\n"; + next; + } + + my $scaff = $x[0]; + my $type = $x[2]; + my $lend = $x[3]; + my $rend = $x[4]; + my $per_id = $x[5]; + + unless ($type eq 'exon') { next; } + + + if ($per_id eq ".") { $per_id = 100; } # making an assumption here. + + my $orient = $x[6]; + + my $info = $x[8]; + + my @parts = split(/;/, $info); + my %atts; + foreach my $part (@parts) { + $part =~ s/^\s+|\s+$//; + $part =~ s/\"//g; + my ($att, $val) = split(/\s+/, $part); + + if (exists $atts{$att}) { + die "Error, already defined attribute $att in $_"; + } + + $atts{$att} = $val; + } + + my $gene_id = $atts{gene_id} or die "Error, no gene_id at $_"; + my $trans_id = $atts{transcript_id} or die "Error, no trans_id at $_"; + + + $trans_to_gene_id{$trans_id} = $gene_id; + + push (@{$genome_trans_to_alignment_segments{$scaff}->{$trans_id}}, [$lend, $rend, $orient]); + + } + + + my %scaff_to_align_list; + + + ## Output genes in gtf format: + + foreach my $scaff (sort keys %genome_trans_to_alignment_segments) { + + my @alignment_accs = keys %{$genome_trans_to_alignment_segments{$scaff}}; + + foreach my $alignment_acc (@alignment_accs) { + + my @segments = @{$genome_trans_to_alignment_segments{$scaff}->{$alignment_acc}}; + @segments = sort {$a->[0]<=>$b->[0]} @segments; + my $orient = $segments[0]->[2]; + if ($orient eq '-') { + @segments = reverse @segments; + } + my @cdna_align_segs; + my @coords; + my $curr_cdna_len = 0; + foreach my $segment (@segments) { + my ($lend, $rend, $orient) = @$segment; + my ($m_lend, $m_rend) = ($curr_cdna_len + 1, $curr_cdna_len + abs($rend - $lend) + 1); + $curr_cdna_len = $m_rend; + if ($orient eq '-') { + ($m_lend, $m_rend) = ($m_rend, $m_lend); + } + my $alignment_segment = new CDNA::Alignment_segment($lend, $rend, $m_lend, $m_rend, 100); + push (@cdna_align_segs, $alignment_segment); + + } + + my $cdna_alignment_obj = new CDNA::CDNA_alignment($curr_cdna_len, \@cdna_align_segs); + $cdna_alignment_obj->set_acc($alignment_acc); + $cdna_alignment_obj->{genome_acc} = $scaff; + + $cdna_alignment_obj->{gene_id} = $trans_to_gene_id{$alignment_acc}; + + my $spliced_orient = $orient; + if ($spliced_orient !~ /^[\+\-]$/) { + $spliced_orient = '?'; + } + $cdna_alignment_obj->set_spliced_orientation($spliced_orient); + + $cdna_alignment_obj->{source} = basename($gtf_alignment_file); + + if (ref $genome_alignment_indexer_href eq "Gene_obj_indexer") { + $genome_alignment_indexer_href->store_gene($alignment_acc, $cdna_alignment_obj); + } + else { + $genome_alignment_indexer_href->{$alignment_acc} = $cdna_alignment_obj; + } + + push (@{$scaff_to_align_list{$scaff}}, $alignment_acc); + } + } + + return(%scaff_to_align_list); +} + + +########### +# Testing +########## + +sub __run_test { + + my $usage = "usage: $0 file.gtf\n\n"; + + my $gtf_file = $ARGV[0] or die $usage; + + my $indexer = {}; + my %scaff_to_alignments = &index_alignment_objs($gtf_file, $indexer); + + + foreach my $scaffold (keys %scaff_to_alignments) { + + my @align_ids = @{$scaff_to_alignments{$scaffold}}; + + foreach my $align_id (@align_ids) { + my $cdna_obj = $indexer->{$align_id}; + + print $cdna_obj->toString(); + } + } + + + + exit(0); + + + +} + + + + + +1; #EOM + diff --git a/PerlLib/GTF_utils.pm b/PerlLib/GTF_utils.pm index 5c8e030..1ae3888 100644 --- a/PerlLib/GTF_utils.pm +++ b/PerlLib/GTF_utils.pm @@ -35,8 +35,12 @@ sub index_GTF_gene_objs_from_GTF { my $gene_id = $gene_obj->{TU_feat_name}; + if ($seen{$gene_id}) { - confess "Error, already processed gene: $gene_id\nearlier: " . $seen{$gene_id}->toString() . " but now as: " . $gene_obj->toString(); + confess "Error, already processed gene: $gene_id\n" + . " here: " . $gene_obj->toString() . "\n" + . " and earlier: " . $seen{$gene_id}->toString(); + } $seen{$gene_id} = $gene_obj; @@ -71,6 +75,7 @@ sub GTF_to_gene_objs { my %gene_id_to_name; my %gene_id_to_seq_name; + my %gene_id_to_gene_name; open (my $fh, $gtf_filename) or die "Error, cannot open $gtf_filename"; while (<$fh>) { @@ -94,24 +99,25 @@ sub GTF_to_gene_objs { $gene_id_to_source{$gene_id} = $source; - my $transcript_id; - if ($annot =~ /transcript_id \"([^\"]+)\"/) { - $transcript_id = $1; - } - else { - #print STDERR "\nWARNING: cannot get transcript_id from $annot of line\n$_\n"; - next; - } + $annot =~ /transcript_id \"([^\"]+)\"/ or confess "Error, cannot get transcript_id from $annot of line\n$_"; + my $transcript_id = $1; if ($annot =~ /name \"([^\"]+)\"/) { my $name = $1; $gene_id_to_name{$gene_id} = $name; } + + my $gene_name = ""; + if ($annot =~ /gene_name \"([^\"]+)\"/) { + $gene_name = $1; + $gene_id_to_gene_name{$gene_id} = $gene_name; + } + # print "gene_id: $gene_id, transcrpt_id: $transcript_id, $type\n"; if ($type eq 'transcript' || $type eq 'gene') { next; } # capture by exon coordinates - + if ($type eq 'CDS' || $type eq 'stop_codon' || $type eq 'start_codon') { push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{CDS}}, [$end5, $end3] ); @@ -120,11 +126,11 @@ sub GTF_to_gene_objs { elsif ($type eq "exon" || $type =~ /UTR/) { push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] ); } - elsif ($type =~ /rna/i) { + else { ## assuming noncoding feature push (@{$noncoding_features{$seqname}->{$type}->{$gene_id}->{$transcript_id}}, [$end5, $end3] ); } - + } close $fh; @@ -188,6 +194,9 @@ sub GTF_to_gene_objs { else { $gene_obj->{com_name} = $transcript_id; } + if (my $gene_name = $gene_id_to_gene_name{$gene_id}) { + $gene_obj->{gene_name} = $gene_name; + } $gene_obj->{asmbl_id} = $seqname; $gene_obj->{source} = $source; @@ -203,7 +212,6 @@ sub GTF_to_gene_objs { foreach my $other_gene_obj (@gene_objs) { $template_gene_obj->add_isoform($other_gene_obj); } - $template_gene_obj->refine_gene_object(); push (@top_gene_objs, $template_gene_obj); # print $template_gene_obj->toString(); diff --git a/PerlLib/Pipeliner.pm b/PerlLib/Pipeliner.pm new file mode 100644 index 0000000..03e7f22 --- /dev/null +++ b/PerlLib/Pipeliner.pm @@ -0,0 +1,141 @@ +package Pipeliner; + +use strict; +use warnings; +use Carp; + +################################ +## Verbose levels: +## 1: see CMD string +## 2: see stderr during process +my $VERBOSE = 0; +################################ + + +#### +sub new { + my $packagename = shift; + my %params = @_; + + if ($params{-verbose}) { + $VERBOSE = $params{-verbose}; + } + + my $self = { + cmd_objs => [], + }; + + bless ($self, $packagename); + + return($self); +} + + +sub add_commands { + my $self = shift; + my @cmds = @_; + + foreach my $cmd (@cmds) { + unless (ref($cmd) =~ /Command/) { + confess "Error, need Command object as param"; + } + push (@{$self->{cmd_objs}}, $cmd); + } + + return $self; + +} + +sub run { + my $self = shift; + + foreach my $cmd_obj ($self->_get_commands()) { + + my $cmdstr = $cmd_obj->get_cmdstr(); + my $checkpoint_file = $cmd_obj->get_checkpoint_file(); + + if (-e $checkpoint_file) { + print STDERR "-- Skipping CMD: $cmdstr, checkpoint exists.\n" if $VERBOSE; + } + else { + print STDERR "* Running CMD: $cmdstr\n" if $VERBOSE; + + my $tmp_stderr = "tmp.$$.stderr"; + if (-e $tmp_stderr) { + unlink($tmp_stderr); + } + unless ($VERBOSE == 2) { + $cmdstr .= " 2>$tmp_stderr"; + } + + my $ret = system($cmdstr); + if ($ret) { + + if (-e $tmp_stderr) { + system("cat $tmp_stderr"); + unlink($tmp_stderr); + } + + confess "Error, cmd: $cmdstr died with ret $ret"; + } + else { + `touch $checkpoint_file`; + if ($?) { + + confess "Error creating checkpoint file: $checkpoint_file"; + } + } + + if (-e $tmp_stderr) { + unlink($tmp_stderr); + } + } + } + + return; +} + +sub _get_commands { + my $self = shift; + + return(@{$self->{cmd_objs}}); +} + +package Command; +use strict; +use warnings; +use Carp; + +sub new { + my $packagename = shift; + + my ($cmdstr, $checkpoint_file) = @_; + + unless ($cmdstr && $checkpoint_file) { + confess "Error, need cmdstr and checkpoint filename as params"; + } + + my $self = { cmdstr => $cmdstr, + checkpoint_file => $checkpoint_file, + }; + + bless ($self, $packagename); + + return($self); +} + +#### +sub get_cmdstr { + my $self = shift; + return($self->{cmdstr}); +} + +#### +sub get_checkpoint_file { + my $self = shift; + return($self->{checkpoint_file}); +} + + + +1; #EOM diff --git a/PerlLib/SingleLinkageClusterer.pm b/PerlLib/SingleLinkageClusterer.pm index a3a1fa3..0a32a95 100755 --- a/PerlLib/SingleLinkageClusterer.pm +++ b/PerlLib/SingleLinkageClusterer.pm @@ -1,3 +1,5 @@ +#!/usr/bin/env perl + package main; our $CLUSTERPATH; @@ -12,39 +14,45 @@ package SingleLinkageClusterer; ## return ([1,2,3] , [6,7,8], ...) use strict; +use warnings; + +__run_test() unless caller; sub build_clusters { my @pairs = @_; - my $pairfile = "/tmp/$$.pairs"; + + my $uniq_stamp = "$$." . time() . "." . rand(); + + my $pairfile = "/tmp/$uniq_stamp.pairs"; #must do mapping because cluster program doesn't like word chars, just ints. my %map_id_to_feat; my %map_feat_to_id; my $id = 1; - + open (PAIRLIST, ">$pairfile") or die "Can't write $pairfile to /tmp"; foreach my $pair (@pairs) { - my ($a, $b) = @$pair; - unless ($map_feat_to_id{$a}) { - $map_feat_to_id{$a} = $id; - $map_id_to_feat{$id} = $a; - $id++; - } - unless ($map_feat_to_id{$b}) { - $map_feat_to_id{$b} = $id; - $map_id_to_feat{$id} = $b; - $id++; - } - - print PAIRLIST "$map_feat_to_id{$a} $map_feat_to_id{$b}\n"; + my ($a, $b) = @$pair; + unless ($map_feat_to_id{$a}) { + $map_feat_to_id{$a} = $id; + $map_id_to_feat{$id} = $a; + $id++; + } + unless ($map_feat_to_id{$b}) { + $map_feat_to_id{$b} = $id; + $map_id_to_feat{$id} = $b; + $id++; + } + + print PAIRLIST "$map_feat_to_id{$a} $map_feat_to_id{$b}\n"; } close PAIRLIST; - my $clusterfile = "/tmp/$$.clusters"; - + my $clusterfile = "/tmp/$uniq_stamp.clusters"; + my $cluster_prog = "slclust"; if ($CLUSTERPATH) { - $cluster_prog = $CLUSTERPATH; + $cluster_prog = $CLUSTERPATH; } system "touch $clusterfile"; @@ -52,20 +60,20 @@ sub build_clusters { my $cmd = "$cluster_prog < $pairfile > $clusterfile"; my $ret = system ($cmd); if ($ret) { - die "ERROR: Couldn't run cluster properly via path: $cluster_prog.\ncmd: $cmd"; + die "ERROR: Couldn't run cluster properly via path: $cluster_prog.\ncmd: $cmd"; } - + my @clusters; open (CLUSTERS, $clusterfile); while (my $line = ) { - my @elements; - while ($line =~ /(\d+)\s?/g) { - push (@elements, $map_id_to_feat{$1}); - } - if (@elements) { - push (@clusters, [@elements]); - } + my @elements; + while ($line =~ /(\d+)\s?/g) { + push (@elements, $map_id_to_feat{$1}); + } + if (@elements) { + push (@clusters, [@elements]); + } } close CLUSTERS; @@ -77,4 +85,23 @@ sub build_clusters { } +############ +## Testing +########### + +sub __run_test { + + my @pairs = ( [1,2], [2,3], [4,5] ); + + my @clusters = &SingleLinkageClusterer::build_clusters(@pairs); + + use Data::Dumper; + + print "Input: " . Dumper(\@pairs); + print "Output: " . Dumper(\@clusters); + + exit(0); +} + + 1; diff --git a/PerlLib/Thread_helper.pm b/PerlLib/Thread_helper.pm index 72e4636..bb43dcb 100644 --- a/PerlLib/Thread_helper.pm +++ b/PerlLib/Thread_helper.pm @@ -202,7 +202,10 @@ sub wait_for_all_threads_to_complete { $self->_add_error_thread($thread); $status = "ERROR"; } - $self->report_thread_info($thread_id, $status); + $self->{thread_id_timing}->{end}->{$thread_id} = time(); + if ($THREAD_MONITORING) { + $self->report_thread_info($thread_id, $status); + } } @{$self->{current_threads}} = (); # clear them out. diff --git a/SAMPLE_HOOKS/GFF3/GFF3_annot_retriever.pm b/SAMPLE_HOOKS/GFF3/GFF3_annot_retriever.pm index 7d5182b..30cbe97 100644 --- a/SAMPLE_HOOKS/GFF3/GFF3_annot_retriever.pm +++ b/SAMPLE_HOOKS/GFF3/GFF3_annot_retriever.pm @@ -24,7 +24,7 @@ sub get_annot_retriever { confess "Error, cannot locate gff3 file $gff3_filename"; } - my $gene_obj_indexer = new Gene_obj_indexer( { create => "$gff3_filename.inx"} ); + my $gene_obj_indexer = new Gene_obj_indexer( { create => "$gff3_filename.$$.inx"} ); &GFF3_utils::index_GFF3_gene_objs($gff3_filename, $gene_obj_indexer); diff --git a/misc_utilities/gff3_gene_to_transcript_gff3.pl b/misc_utilities/deprecated/old.gff3_gene_to_transcript_gff3.pl similarity index 100% rename from misc_utilities/gff3_gene_to_transcript_gff3.pl rename to misc_utilities/deprecated/old.gff3_gene_to_transcript_gff3.pl diff --git a/misc_utilities/gff3_alignment_to_gtf_format.pl b/misc_utilities/gff3_alignment_to_gtf_format.pl new file mode 100755 index 0000000..df4d944 --- /dev/null +++ b/misc_utilities/gff3_alignment_to_gtf_format.pl @@ -0,0 +1,41 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Carp; + +use FindBin; +use lib ("$FindBin::Bin/../PerlLib"); +use GFF3_alignment_utils; +use Fasta_retriever; + +my $usage = "usage: $0 trans_align.gff3\n\n"; + +my $trans_align_gff3 = $ARGV[0] or die $usage; + + +main: { + + my %cdna_alignments; + + print STDERR "-parsing $trans_align_gff3\n"; + my %scaff_to_cdna_ids = &GFF3_alignment_utils::index_alignment_objs($trans_align_gff3, \%cdna_alignments); + + + print STDERR "-outputting GTF format\n"; + foreach my $scaff (keys %scaff_to_cdna_ids) { + + foreach my $cdna_id (@{$scaff_to_cdna_ids{$scaff}}) { + + my $cdna_obj = $cdna_alignments{$cdna_id}; + + print $cdna_obj->to_GTF_format( gene_id => $cdna_obj->{gene_id} ) . "\n"; + + } + } + + + exit(0); +} + + diff --git a/misc_utilities/gff3_file_toString.pl b/misc_utilities/gff3_file_toString.pl index ea20ff3..f59e825 100755 --- a/misc_utilities/gff3_file_toString.pl +++ b/misc_utilities/gff3_file_toString.pl @@ -24,7 +24,8 @@ foreach my $gene_id (@gene_ids) { my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; - + + print "// $asmbl_id, gene: $gene_id\n"; print $gene_obj_ref->toString(); } diff --git a/misc_utilities/gtf_file_to_proteins.pl b/misc_utilities/gtf_file_to_proteins.pl index 07fcdeb..53cc51f 100755 --- a/misc_utilities/gtf_file_to_proteins.pl +++ b/misc_utilities/gtf_file_to_proteins.pl @@ -84,6 +84,8 @@ foreach my $isoform ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) { my $isoform_id = $isoform->{Model_feat_name}; + my $gene_id = $isoform->{TU_feat_name}; + my $seq = ""; @@ -105,7 +107,16 @@ my $com_name = $isoform->{com_name} || ""; - print ">$isoform_id\n$seq\n"; + my $gene_name = $isoform->{gene_name}; + my $header = ">$isoform_id $gene_id"; + if ($gene_name) { + $header .= " $gene_name "; + } + if ($com_name) { + $gene_name .= " $com_name"; + } + + print "$header\n$seq\n"; } } } diff --git a/pasa_conf/pasa.CONFIG.template b/pasa_conf/pasa.CONFIG.template index 1c693f8..7b22f48 100755 --- a/pasa_conf/pasa.CONFIG.template +++ b/pasa_conf/pasa.CONFIG.template @@ -6,6 +6,7 @@ # server actively running MySQL MYSQLSERVER=localhost # or server.com +# Pass socket connections through Perl DBI syntax e.g. MYSQLSERVER=mysql_socket=/tmp/mysql.sock # read-write username and password MYSQL_RW_USER=xxxxxx diff --git a/pasa_conf/sample_test.conf b/pasa_conf/sample_test.conf index 3c2fe9c..62729d8 100755 --- a/pasa_conf/sample_test.conf +++ b/pasa_conf/sample_test.conf @@ -20,11 +20,11 @@ USE_PASA_DB_SETUP_HOOK=false ##################################### # server actively running MySQL -MYSQLSERVER=bhaas-lx +MYSQLSERVER=localhost # read-only username and password -MYSQL_RO_USER=access -MYSQL_RO_PASSWORD=access +MYSQL_RO_USER=pasa_access +MYSQL_RO_PASSWORD=pasa_access # read-write username and password MYSQL_RW_USER=access @@ -35,7 +35,7 @@ MYSQL_RW_PASSWORD=access # Web browser navigation settings: ######### ############################################ -BASE_PASA_URL=http://bhaas-lx:8080/cgi-bin/ +BASE_PASA_URL=http://localhost:8080/cgi-bin/where_you_installed_pasa_cgi_root ############################################# diff --git a/pasa_cpp/pasa b/pasa_cpp/pasa index 1012ac8..4d6ae13 100755 Binary files a/pasa_cpp/pasa and b/pasa_cpp/pasa differ diff --git a/schema/cdna_alignment_mysqlschema b/schema/cdna_alignment_mysqlschema index 3076867..8d3077c 100644 --- a/schema/cdna_alignment_mysqlschema +++ b/schema/cdna_alignment_mysqlschema @@ -110,7 +110,7 @@ CREATE TABLE alt_splice_tokens ( CREATE TABLE annotation_admin ( version_id int(11) NOT NULL auto_increment, - date datetime NOT NULL default '0000-00-00 00:00:00', + date datetime NOT NULL, PRIMARY KEY (version_id) ) ; @@ -120,7 +120,7 @@ CREATE TABLE annotation_admin ( CREATE TABLE annotation_compare ( compare_id int(11) NOT NULL auto_increment, - date datetime NOT NULL default '0000-00-00 00:00:00', + date datetime NOT NULL, annotation_version int(10) unsigned NOT NULL default '0', PRIMARY KEY (compare_id) ) ; diff --git a/schema/notes b/schema/notes index edda33a..9930a80 100755 --- a/schema/notes +++ b/schema/notes @@ -16,7 +16,8 @@ Using the backticks instead of single quotes in the db name pattern is mandatory ------ create user 'pasa_access' identified by 'pasa_access'; -grant select on *.* to 'pasa_access@%'; -grant all on *.* to 'pasa_write@%'; +create user 'pasa_write' identified by '...'; + +grant select on *.* to 'pasa_access'; +grant all on *.* to 'pasa_write'; -grant all on *.* to 'pasa_write'@'localhost' identified by '...'; diff --git a/scripts/Launch_PASA_pipeline.pl b/scripts/Launch_PASA_pipeline.pl index 205b5d5..7bdec0f 100755 --- a/scripts/Launch_PASA_pipeline.pl +++ b/scripts/Launch_PASA_pipeline.pl @@ -133,7 +133,7 @@ # # # // Jump-starting or prematurely terminating -# -x flag, print cmds only, don\'t process anything. (useful to get indices for -x or -e opts below) +# -x flag, print cmds only, don\'t process anything. (useful to get indices for -s or -e opts below) # -s pipeline index to start running at (avoid rerunning searches). # -e pipeline index where to stop running, and do not execute this entry. # @@ -419,18 +419,6 @@ } - ###################################################### - ## Optimize the database after uploading all that data - ###################################################### - - push (@cmds, { prog => "$UTILDIR/optimize_tables.dbi", - params => " -c $configfile ", - input => undef, - output => undef, - } ); - - - ############################## ## done aligning transcripts. ############################## diff --git a/scripts/build_comprehensive_transcriptome.dbi b/scripts/build_comprehensive_transcriptome.dbi index d3da7c5..ff85a7a 100755 --- a/scripts/build_comprehensive_transcriptome.dbi +++ b/scripts/build_comprehensive_transcriptome.dbi @@ -351,6 +351,13 @@ sub get_TDN_component { elsif ($cdna_acc =~ /^(c\d+_g\d+)/) { $component = $1; } + elsif ($cdna_acc =~ /^(TR\d+\|c\d+_g\d+)/) { + # trinity de novo 2.0.0+ style + $component = $1; + } + elsif ($cdna_acc =~ /^(TRINITY\S+_c\d+_g\d+)/) { + $component = $1; + } else { confess "Error, cannot extract component info from trinity de novo assembly: $cdna_acc"; } diff --git a/scripts/cDNA_annotation_comparer.dbi b/scripts/cDNA_annotation_comparer.dbi index 6c7fdf0..bfdaf24 100755 --- a/scripts/cDNA_annotation_comparer.dbi +++ b/scripts/cDNA_annotation_comparer.dbi @@ -34,6 +34,9 @@ use vars qw ($opt_h $opt_p $DEBUG $opt_M $opt_G $MIN_PERCENT_OVERLAP $MIN_PERCEN $STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE $MIN_PERCENT_ALIGN_LENGTH $MIN_PERCENT_LENGTH_FL_COMPARE $TRUST_FL_STATUS $CLUSTER_ID_ONLY); + +my $RESTRICT_SINGLE_CONTIG = undef; + my $CPU = 2; &GetOptions ('M=s' => \$opt_M, @@ -62,6 +65,8 @@ my $CPU = 2; ## hidden opts for debugging purposes 'CLUSTER_ID_ONLY=i' => \$CLUSTER_ID_ONLY, # the only cluster ID to process. + 'RESTRICT_SINGLE_CONTIG=s' => \$RESTRICT_SINGLE_CONTIG, + ); @@ -140,6 +145,12 @@ _EOH_ ; +### Hidden options +### +## --RESTRICT_SINGLE_CONTIG genomic contig to restrict analysis to +### + + if ($opt_h) {die $usage;} @@ -291,7 +302,9 @@ main: { print "### There are $num_contigs genomic contigs which will be examined.\n"; foreach my $result (@results) { my $asmbl_id = $result->[0]; - + + if ($RESTRICT_SINGLE_CONTIG && $asmbl_id ne $RESTRICT_SINGLE_CONTIG) { next; } + $thread_helper->wait_for_open_thread(); my $thread = threads->create('process_contig', $asmbl_id); @@ -308,7 +321,17 @@ main: { if (@failures) { ## examine them... these are the same threads created above, use use the threads api to access info about them ## such as error messages - die "Error, there were " . scalar(@failures) . " threads (contig jobs) that failed... See error messages above in order to troubleshoot further"; + + print STDERR "Error, there were " . scalar(@failures) . " threads (contig jobs) that failed... See error messages above in order to troubleshoot further"; + + foreach my $failed_thread (@failures) { + my $thread_id = $failed_thread->tid; + print STDERR "Failed thread ($thread_id) info:\n"; + $thread_helper->report_thread_info($thread_id, "FAILED"); + } + + exit(1); + } else { ## all good! @@ -1444,7 +1467,7 @@ sub get_annotated_gene_models_on_contig { my @models; # get the annotated working models for that chromosome: - my $query = "select a.gene_id, min(a.lend), max(a.rend), a.orient from annotation_store a where a.annotdb_asmbl_id = ? and annotation_version = ? group by a.gene_id\n"; + my $query = "select a.gene_id, min(a.lend), max(a.rend), a.orient from annotation_store a where a.annotdb_asmbl_id = ? and annotation_version = ? group by a.gene_id, a.orient\n"; my @results = &Mysql_connect::do_sql_2D($dbproc, $query, $asmbl_id, $annot_version); foreach my $result (@results) { diff --git a/scripts/create_mysql_cdnaassembly_db.dbi b/scripts/create_mysql_cdnaassembly_db.dbi index 2742ed3..2b3435b 100755 --- a/scripts/create_mysql_cdnaassembly_db.dbi +++ b/scripts/create_mysql_cdnaassembly_db.dbi @@ -14,7 +14,7 @@ use ConfigFileReader; use vars qw ($opt_c $opt_p $opt_d $opt_h $opt_S $opt_r); -&getopts ('c:p:S:r'); +&getopts ('c:p:S:rd'); $|=1; @@ -54,7 +54,10 @@ my $mysql_rw_password = &Pasa_conf::getParam("MYSQL_RW_PASSWORD"); ## Create the database if needed my $dbproc = &Mysql_connect::connect_to_db($mysql_server,"",$mysql_rw_user,$mysql_rw_password); -&Mysql_connect::RunMod($dbproc,"drop database $mysql_db") if $opt_r; +eval { + &Mysql_connect::RunMod($dbproc,"drop database $mysql_db") if $opt_r; +}; + my $query = "create database $mysql_db"; &Mysql_connect::RunMod($dbproc, $query); $dbproc->disconnect; @@ -70,11 +73,22 @@ if ($mysql_server=~s/:(\d+)$//){ # if $mysql_server includes :port then remove and add as port $mysql_options .= " --port=$1"; } -$mysql_options .= " -h$mysql_server"; + +# check if mysql server is configured with host or socket location i.e. +# the conf.txt contains MYSQLSERVER=mysql_socket= +if (index($mysql_server, "mysql_socket") != -1) { + #Drop the msyql_socket= from the string + #The regex drop everything before and including the '=' character + $mysql_server =~ s/^\s*\S+=//; + $mysql_options .= " -S$mysql_server"; +} else { + $mysql_options .= " -h$mysql_server"; +} ## Populate the database structure and static data. my $cmd = "$mysql_exec $mysql_options -D$mysql_db -e 'source $schemafile'"; +print STDERR "CMD: $cmd\n" if $DEBUG; my $result = system ($cmd); die "CMD: $cmd failed.\n" if $result; diff --git a/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl b/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl index c396a28..0c66c79 100755 --- a/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl +++ b/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl @@ -88,7 +88,14 @@ my $cmd = "gmap -D $genomeBaseDir -d $genomeDir $transcriptDB -f $format -n $num_gmap_top_hits -x 50 -t $CPU -B 5 "; if ($max_intron) { - $cmd .= " --intronlength=$max_intron "; + my $gmapv=`gmap --version 2>&1 | head -1`; + chomp($gmapv); + $gmapv=~/GMAP\s+version\s+(\d{4})-(\d{2})-.*/; + if( ($1 == 2016 && $2 >= 5) || $1 > 2016){ + $cmd .= " --max-intronlength-middle=$max_intron --max-intronlength-ends=$max_intron "; + }else{ + $cmd .= " --intronlength=$max_intron "; + } } &process_cmd($cmd);