diff --git a/PerlLib/Exons_to_geneobj.pm b/PerlLib/Exons_to_geneobj.pm index 2025b16..c026b9b 100755 --- a/PerlLib/Exons_to_geneobj.pm +++ b/PerlLib/Exons_to_geneobj.pm @@ -27,9 +27,13 @@ sub create_gene_obj { ## exons_ref should be end5's keyed to end3's for all exons. my ($gene_struct_mod, $cdna_seq) = &get_cdna_seq ($exons_href, $sequence_ref); + + my $cdna_seq_length = length $cdna_seq; my $long_orf_obj = new Longest_orf(); + #print STDERR "CDNA_SEQ: [$cdna_seq], length: $cdna_seq_length\n"; + # establish long orf finding parameters. $long_orf_obj->forward_strand_only(); if ($partial_info_href->{"5prime"}) { @@ -43,24 +47,28 @@ sub create_gene_obj { $long_orf_obj->get_longest_orf($cdna_seq); my ($end5, $end3) = $long_orf_obj->get_end5_end3(); - print "CDS: $end5, $end3\n" if $SEE; + #print STDERR "*** CDS: $end5, $end3\n";# if $SEE; my $gene_obj = &create_gene ($gene_struct_mod, $end5, $end3); + #print STDERR $gene_obj->toString(); + $gene_obj->create_all_sequence_types($sequence_ref); my $protein = $gene_obj->get_protein_sequence(); - + my $recons_cds = $gene_obj->get_CDS_sequence(); + #print STDERR "reconsCDS: $recons_cds\n"; + ## check partiality if ($protein) { # it is possible that we won't have any cds structure if ($protein !~ /^M/) { # this would require that we allowed for 5prime partials unless ($partial_info_href->{"5prime"}) { - confess "Error, have 5' partial protein when 5prime partials weren't allowed!\n$protein\n"; + confess "Error, have 5' partial protein when 5prime partials weren't allowed!\n$protein\n$cdna_seq\n"; } } if ($protein !~ /\*$/) { # this would require that we allowed for 3prime partials unless ($partial_info_href->{"3prime"}) { - confess "Error, have 3' partial protein when 3prime partials weren't allowed!\n$protein\n"; + confess "Error, have 3' partial protein when 3prime partials weren't allowed!\n$protein\n$cdna_seq\n"; } } @@ -80,6 +88,9 @@ sub create_gene_obj { #### sub get_cdna_seq { my ($gene_struct, $assembly_seq_ref) = @_; + + my $seq_length = length($$assembly_seq_ref); + my (@end5s) = sort {$a<=>$b} keys %$gene_struct; my $strand = "?"; foreach my $end5 (@end5s) { @@ -90,7 +101,7 @@ sub get_cdna_seq { } if ($strand eq "?") { print Dumper ($gene_struct); - die "ERROR: I can't determine what orientation the cDNA is in!\n"; + confess "ERROR: I can't determine what orientation the cDNA is in!\n"; } print NOTES "strand: $strand\n"; my $cdna_seq; @@ -99,6 +110,11 @@ sub get_cdna_seq { foreach my $end5 (@end5s) { #print $end5; my $end3 = $gene_struct->{$end5}; + + if ($end5 > $seq_length || $end3 > $seq_length) { + confess "Error, coords are out of bounds of sequence length: $seq_length:\n" . Dumper(\$gene_struct); + } + my ($coord1, $coord2) = sort {$a<=>$b} ($end5, $end3); my $exon_seq = substr ($$assembly_seq_ref, $coord1 - 1, ($coord2 - $coord1 + 1)); $cdna_seq .= $exon_seq; @@ -114,8 +130,12 @@ sub get_cdna_seq { #### sub create_gene { my ($gene_struct_mod, $cds_pointer_lend, $cds_pointer_rend) = @_; + + #use Data::Dumper; + #print STDERR Dumper($gene_struct_mod) . "CDS: $cds_pointer_lend, $cds_pointer_rend\n"; + my $strand = $gene_struct_mod->{strand}; - my @exons = @{$gene_struct_mod->{exons}}; + my @exons = sort {$a->[0]<=>$b->[0]} @{$gene_struct_mod->{exons}}; if ($strand eq '-') { @exons = reverse (@exons); } @@ -123,12 +143,13 @@ sub create_gene { my $mRNA_pointer_rend = 0; my $gene_obj = new Gene_obj(); foreach my $coordset_ref (@exons) { - my ($coord1, $coord2) = @$coordset_ref; + my ($coord1, $coord2) = sort {$a<=>$b} @$coordset_ref; my ($end5, $end3) = ($strand eq '+') ? ($coord1, $coord2) : ($coord2, $coord1); my $exon_obj = new mRNA_exon_obj($end5, $end3); my $exon_length = ($coord2 - $coord1 + 1); $mRNA_pointer_rend = $mRNA_pointer_lend + $exon_length - 1; ## see if cds is within current cDNA range. + #print STDERR "mRNA coords: $mRNA_pointer_lend-$mRNA_pointer_rend\n"; if ( $cds_pointer_rend >= $mRNA_pointer_lend && $cds_pointer_lend <= $mRNA_pointer_rend) { #overlap my $diff = $cds_pointer_lend - $mRNA_pointer_lend; my $delta_lend = ($diff >0) ? $diff : 0; diff --git a/PerlLib/GTF_utils.pm b/PerlLib/GTF_utils.pm index 5c8e030..1ae3888 100644 --- a/PerlLib/GTF_utils.pm +++ b/PerlLib/GTF_utils.pm @@ -35,8 +35,12 @@ sub index_GTF_gene_objs_from_GTF { my $gene_id = $gene_obj->{TU_feat_name}; + if ($seen{$gene_id}) { - confess "Error, already processed gene: $gene_id\nearlier: " . $seen{$gene_id}->toString() . " but now as: " . $gene_obj->toString(); + confess "Error, already processed gene: $gene_id\n" + . " here: " . $gene_obj->toString() . "\n" + . " and earlier: " . $seen{$gene_id}->toString(); + } $seen{$gene_id} = $gene_obj; @@ -71,6 +75,7 @@ sub GTF_to_gene_objs { my %gene_id_to_name; my %gene_id_to_seq_name; + my %gene_id_to_gene_name; open (my $fh, $gtf_filename) or die "Error, cannot open $gtf_filename"; while (<$fh>) { @@ -94,24 +99,25 @@ sub GTF_to_gene_objs { $gene_id_to_source{$gene_id} = $source; - my $transcript_id; - if ($annot =~ /transcript_id \"([^\"]+)\"/) { - $transcript_id = $1; - } - else { - #print STDERR "\nWARNING: cannot get transcript_id from $annot of line\n$_\n"; - next; - } + $annot =~ /transcript_id \"([^\"]+)\"/ or confess "Error, cannot get transcript_id from $annot of line\n$_"; + my $transcript_id = $1; if ($annot =~ /name \"([^\"]+)\"/) { my $name = $1; $gene_id_to_name{$gene_id} = $name; } + + my $gene_name = ""; + if ($annot =~ /gene_name \"([^\"]+)\"/) { + $gene_name = $1; + $gene_id_to_gene_name{$gene_id} = $gene_name; + } + # print "gene_id: $gene_id, transcrpt_id: $transcript_id, $type\n"; if ($type eq 'transcript' || $type eq 'gene') { next; } # capture by exon coordinates - + if ($type eq 'CDS' || $type eq 'stop_codon' || $type eq 'start_codon') { push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{CDS}}, [$end5, $end3] ); @@ -120,11 +126,11 @@ sub GTF_to_gene_objs { elsif ($type eq "exon" || $type =~ /UTR/) { push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] ); } - elsif ($type =~ /rna/i) { + else { ## assuming noncoding feature push (@{$noncoding_features{$seqname}->{$type}->{$gene_id}->{$transcript_id}}, [$end5, $end3] ); } - + } close $fh; @@ -188,6 +194,9 @@ sub GTF_to_gene_objs { else { $gene_obj->{com_name} = $transcript_id; } + if (my $gene_name = $gene_id_to_gene_name{$gene_id}) { + $gene_obj->{gene_name} = $gene_name; + } $gene_obj->{asmbl_id} = $seqname; $gene_obj->{source} = $source; @@ -203,7 +212,6 @@ sub GTF_to_gene_objs { foreach my $other_gene_obj (@gene_objs) { $template_gene_obj->add_isoform($other_gene_obj); } - $template_gene_obj->refine_gene_object(); push (@top_gene_objs, $template_gene_obj); # print $template_gene_obj->toString(); diff --git a/PerlLib/Thread_helper.pm b/PerlLib/Thread_helper.pm index 72e4636..bb43dcb 100644 --- a/PerlLib/Thread_helper.pm +++ b/PerlLib/Thread_helper.pm @@ -202,7 +202,10 @@ sub wait_for_all_threads_to_complete { $self->_add_error_thread($thread); $status = "ERROR"; } - $self->report_thread_info($thread_id, $status); + $self->{thread_id_timing}->{end}->{$thread_id} = time(); + if ($THREAD_MONITORING) { + $self->report_thread_info($thread_id, $status); + } } @{$self->{current_threads}} = (); # clear them out. diff --git a/misc_utilities/gff3_file_toString.pl b/misc_utilities/gff3_file_toString.pl index ea20ff3..f59e825 100755 --- a/misc_utilities/gff3_file_toString.pl +++ b/misc_utilities/gff3_file_toString.pl @@ -24,7 +24,8 @@ foreach my $gene_id (@gene_ids) { my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; - + + print "// $asmbl_id, gene: $gene_id\n"; print $gene_obj_ref->toString(); } diff --git a/misc_utilities/gtf_file_to_proteins.pl b/misc_utilities/gtf_file_to_proteins.pl index 07fcdeb..53cc51f 100755 --- a/misc_utilities/gtf_file_to_proteins.pl +++ b/misc_utilities/gtf_file_to_proteins.pl @@ -84,6 +84,8 @@ foreach my $isoform ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) { my $isoform_id = $isoform->{Model_feat_name}; + my $gene_id = $isoform->{TU_feat_name}; + my $seq = ""; @@ -105,7 +107,16 @@ my $com_name = $isoform->{com_name} || ""; - print ">$isoform_id\n$seq\n"; + my $gene_name = $isoform->{gene_name}; + my $header = ">$isoform_id $gene_id"; + if ($gene_name) { + $header .= " $gene_name "; + } + if ($com_name) { + $gene_name .= " $com_name"; + } + + print "$header\n$seq\n"; } } } diff --git a/pasa_conf/pasa.CONFIG.template b/pasa_conf/pasa.CONFIG.template index 1c693f8..7b22f48 100755 --- a/pasa_conf/pasa.CONFIG.template +++ b/pasa_conf/pasa.CONFIG.template @@ -6,6 +6,7 @@ # server actively running MySQL MYSQLSERVER=localhost # or server.com +# Pass socket connections through Perl DBI syntax e.g. MYSQLSERVER=mysql_socket=/tmp/mysql.sock # read-write username and password MYSQL_RW_USER=xxxxxx diff --git a/pasa_conf/sample_test.conf b/pasa_conf/sample_test.conf index 3c2fe9c..62729d8 100755 --- a/pasa_conf/sample_test.conf +++ b/pasa_conf/sample_test.conf @@ -20,11 +20,11 @@ USE_PASA_DB_SETUP_HOOK=false ##################################### # server actively running MySQL -MYSQLSERVER=bhaas-lx +MYSQLSERVER=localhost # read-only username and password -MYSQL_RO_USER=access -MYSQL_RO_PASSWORD=access +MYSQL_RO_USER=pasa_access +MYSQL_RO_PASSWORD=pasa_access # read-write username and password MYSQL_RW_USER=access @@ -35,7 +35,7 @@ MYSQL_RW_PASSWORD=access # Web browser navigation settings: ######### ############################################ -BASE_PASA_URL=http://bhaas-lx:8080/cgi-bin/ +BASE_PASA_URL=http://localhost:8080/cgi-bin/where_you_installed_pasa_cgi_root ############################################# diff --git a/pasa_cpp/pasa b/pasa_cpp/pasa index 1012ac8..4d6ae13 100755 Binary files a/pasa_cpp/pasa and b/pasa_cpp/pasa differ diff --git a/schema/cdna_alignment_mysqlschema b/schema/cdna_alignment_mysqlschema index 3076867..8d3077c 100644 --- a/schema/cdna_alignment_mysqlschema +++ b/schema/cdna_alignment_mysqlschema @@ -110,7 +110,7 @@ CREATE TABLE alt_splice_tokens ( CREATE TABLE annotation_admin ( version_id int(11) NOT NULL auto_increment, - date datetime NOT NULL default '0000-00-00 00:00:00', + date datetime NOT NULL, PRIMARY KEY (version_id) ) ; @@ -120,7 +120,7 @@ CREATE TABLE annotation_admin ( CREATE TABLE annotation_compare ( compare_id int(11) NOT NULL auto_increment, - date datetime NOT NULL default '0000-00-00 00:00:00', + date datetime NOT NULL, annotation_version int(10) unsigned NOT NULL default '0', PRIMARY KEY (compare_id) ) ; diff --git a/schema/notes b/schema/notes index edda33a..9930a80 100755 --- a/schema/notes +++ b/schema/notes @@ -16,7 +16,8 @@ Using the backticks instead of single quotes in the db name pattern is mandatory ------ create user 'pasa_access' identified by 'pasa_access'; -grant select on *.* to 'pasa_access@%'; -grant all on *.* to 'pasa_write@%'; +create user 'pasa_write' identified by '...'; + +grant select on *.* to 'pasa_access'; +grant all on *.* to 'pasa_write'; -grant all on *.* to 'pasa_write'@'localhost' identified by '...'; diff --git a/scripts/Launch_PASA_pipeline.pl b/scripts/Launch_PASA_pipeline.pl index 500f172..7bdec0f 100755 --- a/scripts/Launch_PASA_pipeline.pl +++ b/scripts/Launch_PASA_pipeline.pl @@ -133,7 +133,7 @@ # # # // Jump-starting or prematurely terminating -# -x flag, print cmds only, don\'t process anything. (useful to get indices for -x or -e opts below) +# -x flag, print cmds only, don\'t process anything. (useful to get indices for -s or -e opts below) # -s pipeline index to start running at (avoid rerunning searches). # -e pipeline index where to stop running, and do not execute this entry. # diff --git a/scripts/build_comprehensive_transcriptome.dbi b/scripts/build_comprehensive_transcriptome.dbi index aff0414..ff85a7a 100755 --- a/scripts/build_comprehensive_transcriptome.dbi +++ b/scripts/build_comprehensive_transcriptome.dbi @@ -355,6 +355,9 @@ sub get_TDN_component { # trinity de novo 2.0.0+ style $component = $1; } + elsif ($cdna_acc =~ /^(TRINITY\S+_c\d+_g\d+)/) { + $component = $1; + } else { confess "Error, cannot extract component info from trinity de novo assembly: $cdna_acc"; } diff --git a/scripts/cDNA_annotation_comparer.dbi b/scripts/cDNA_annotation_comparer.dbi index 6c7fdf0..bfdaf24 100755 --- a/scripts/cDNA_annotation_comparer.dbi +++ b/scripts/cDNA_annotation_comparer.dbi @@ -34,6 +34,9 @@ use vars qw ($opt_h $opt_p $DEBUG $opt_M $opt_G $MIN_PERCENT_OVERLAP $MIN_PERCEN $STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE $MIN_PERCENT_ALIGN_LENGTH $MIN_PERCENT_LENGTH_FL_COMPARE $TRUST_FL_STATUS $CLUSTER_ID_ONLY); + +my $RESTRICT_SINGLE_CONTIG = undef; + my $CPU = 2; &GetOptions ('M=s' => \$opt_M, @@ -62,6 +65,8 @@ my $CPU = 2; ## hidden opts for debugging purposes 'CLUSTER_ID_ONLY=i' => \$CLUSTER_ID_ONLY, # the only cluster ID to process. + 'RESTRICT_SINGLE_CONTIG=s' => \$RESTRICT_SINGLE_CONTIG, + ); @@ -140,6 +145,12 @@ _EOH_ ; +### Hidden options +### +## --RESTRICT_SINGLE_CONTIG genomic contig to restrict analysis to +### + + if ($opt_h) {die $usage;} @@ -291,7 +302,9 @@ main: { print "### There are $num_contigs genomic contigs which will be examined.\n"; foreach my $result (@results) { my $asmbl_id = $result->[0]; - + + if ($RESTRICT_SINGLE_CONTIG && $asmbl_id ne $RESTRICT_SINGLE_CONTIG) { next; } + $thread_helper->wait_for_open_thread(); my $thread = threads->create('process_contig', $asmbl_id); @@ -308,7 +321,17 @@ main: { if (@failures) { ## examine them... these are the same threads created above, use use the threads api to access info about them ## such as error messages - die "Error, there were " . scalar(@failures) . " threads (contig jobs) that failed... See error messages above in order to troubleshoot further"; + + print STDERR "Error, there were " . scalar(@failures) . " threads (contig jobs) that failed... See error messages above in order to troubleshoot further"; + + foreach my $failed_thread (@failures) { + my $thread_id = $failed_thread->tid; + print STDERR "Failed thread ($thread_id) info:\n"; + $thread_helper->report_thread_info($thread_id, "FAILED"); + } + + exit(1); + } else { ## all good! @@ -1444,7 +1467,7 @@ sub get_annotated_gene_models_on_contig { my @models; # get the annotated working models for that chromosome: - my $query = "select a.gene_id, min(a.lend), max(a.rend), a.orient from annotation_store a where a.annotdb_asmbl_id = ? and annotation_version = ? group by a.gene_id\n"; + my $query = "select a.gene_id, min(a.lend), max(a.rend), a.orient from annotation_store a where a.annotdb_asmbl_id = ? and annotation_version = ? group by a.gene_id, a.orient\n"; my @results = &Mysql_connect::do_sql_2D($dbproc, $query, $asmbl_id, $annot_version); foreach my $result (@results) { diff --git a/scripts/create_mysql_cdnaassembly_db.dbi b/scripts/create_mysql_cdnaassembly_db.dbi index e491e10..2b3435b 100755 --- a/scripts/create_mysql_cdnaassembly_db.dbi +++ b/scripts/create_mysql_cdnaassembly_db.dbi @@ -14,7 +14,7 @@ use ConfigFileReader; use vars qw ($opt_c $opt_p $opt_d $opt_h $opt_S $opt_r); -&getopts ('c:p:S:r'); +&getopts ('c:p:S:rd'); $|=1; @@ -73,11 +73,22 @@ if ($mysql_server=~s/:(\d+)$//){ # if $mysql_server includes :port then remove and add as port $mysql_options .= " --port=$1"; } -$mysql_options .= " -h$mysql_server"; + +# check if mysql server is configured with host or socket location i.e. +# the conf.txt contains MYSQLSERVER=mysql_socket= +if (index($mysql_server, "mysql_socket") != -1) { + #Drop the msyql_socket= from the string + #The regex drop everything before and including the '=' character + $mysql_server =~ s/^\s*\S+=//; + $mysql_options .= " -S$mysql_server"; +} else { + $mysql_options .= " -h$mysql_server"; +} ## Populate the database structure and static data. my $cmd = "$mysql_exec $mysql_options -D$mysql_db -e 'source $schemafile'"; +print STDERR "CMD: $cmd\n" if $DEBUG; my $result = system ($cmd); die "CMD: $cmd failed.\n" if $result; diff --git a/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl b/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl index c396a28..0c66c79 100755 --- a/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl +++ b/scripts/process_GMAP_alignments_gff3_chimeras_ok.pl @@ -88,7 +88,14 @@ my $cmd = "gmap -D $genomeBaseDir -d $genomeDir $transcriptDB -f $format -n $num_gmap_top_hits -x 50 -t $CPU -B 5 "; if ($max_intron) { - $cmd .= " --intronlength=$max_intron "; + my $gmapv=`gmap --version 2>&1 | head -1`; + chomp($gmapv); + $gmapv=~/GMAP\s+version\s+(\d{4})-(\d{2})-.*/; + if( ($1 == 2016 && $2 >= 5) || $1 > 2016){ + $cmd .= " --max-intronlength-middle=$max_intron --max-intronlength-ends=$max_intron "; + }else{ + $cmd .= " --intronlength=$max_intron "; + } } &process_cmd($cmd);