Permalink
...
Comparing changes
Open a pull request
- 15 commits
- 15 files changed
- 0 commit comments
- 4 contributors
Commits on Jun 11, 2015
|
|
brianjohnhaas |
92d87d0
|
|||
|
|
brianjohnhaas |
74418d3
|
Commits on Jun 13, 2015
|
|
brianjohnhaas |
7fe6b78
|
|||
|
|
brianjohnhaas |
3c8e88a
|
Commits on Aug 10, 2015
|
|
mimarsh2 |
7d5dc8a
|
|||
|
|
mimarsh2 |
11dff71
|
Commits on Aug 11, 2015
|
|
brianjohnhaas |
7d6e2e6
|
Commits on Dec 07, 2015
|
|
brianjohnhaas |
69927de
|
Commits on Jan 20, 2016
|
|
corburn |
a30894d
|
Commits on Oct 17, 2016
|
|
brianjohnhaas |
ea25011
|
Commits on Dec 27, 2016
|
|
yuragal |
440ec0e
|
Commits on Dec 28, 2016
|
|
brianjohnhaas |
6f6f2e9
|
Commits on Feb 15, 2017
|
|
brianjohnhaas |
2bf11be
|
Commits on Apr 21, 2017
|
|
brianjohnhaas |
752ce7f
|
Commits on Jul 11, 2017
|
|
brianjohnhaas |
8d31504
|
Unified
Split
Showing
with
129 additions
and 39 deletions.
- +28 −7 PerlLib/Exons_to_geneobj.pm
- +21 −13 PerlLib/GTF_utils.pm
- +4 −1 PerlLib/Thread_helper.pm
- +2 −1 misc_utilities/gff3_file_toString.pl
- +12 −1 misc_utilities/gtf_file_to_proteins.pl
- +1 −0 pasa_conf/pasa.CONFIG.template
- +4 −4 pasa_conf/sample_test.conf
- BIN pasa_cpp/pasa
- +2 −2 schema/cdna_alignment_mysqlschema
- +4 −3 schema/notes
- +1 −1 scripts/Launch_PASA_pipeline.pl
- +3 −0 scripts/build_comprehensive_transcriptome.dbi
- +26 −3 scripts/cDNA_annotation_comparer.dbi
- +13 −2 scripts/create_mysql_cdnaassembly_db.dbi
- +8 −1 scripts/process_GMAP_alignments_gff3_chimeras_ok.pl
View
35
PerlLib/Exons_to_geneobj.pm
| @@ -27,9 +27,13 @@ sub create_gene_obj { | ||
| ## exons_ref should be end5's keyed to end3's for all exons. | ||
| my ($gene_struct_mod, $cdna_seq) = &get_cdna_seq ($exons_href, $sequence_ref); | ||
| + | ||
| + | ||
| my $cdna_seq_length = length $cdna_seq; | ||
| my $long_orf_obj = new Longest_orf(); | ||
| + #print STDERR "CDNA_SEQ: [$cdna_seq], length: $cdna_seq_length\n"; | ||
| + | ||
| # establish long orf finding parameters. | ||
| $long_orf_obj->forward_strand_only(); | ||
| if ($partial_info_href->{"5prime"}) { | ||
| @@ -43,24 +47,28 @@ sub create_gene_obj { | ||
| $long_orf_obj->get_longest_orf($cdna_seq); | ||
| my ($end5, $end3) = $long_orf_obj->get_end5_end3(); | ||
| - print "CDS: $end5, $end3\n" if $SEE; | ||
| + #print STDERR "*** CDS: $end5, $end3\n";# if $SEE; | ||
| my $gene_obj = &create_gene ($gene_struct_mod, $end5, $end3); | ||
| + #print STDERR $gene_obj->toString(); | ||
| + | ||
| $gene_obj->create_all_sequence_types($sequence_ref); | ||
| my $protein = $gene_obj->get_protein_sequence(); | ||
| - | ||
| + my $recons_cds = $gene_obj->get_CDS_sequence(); | ||
| + #print STDERR "reconsCDS: $recons_cds\n"; | ||
| + | ||
| ## check partiality | ||
| if ($protein) { # it is possible that we won't have any cds structure | ||
| if ($protein !~ /^M/) { | ||
| # this would require that we allowed for 5prime partials | ||
| unless ($partial_info_href->{"5prime"}) { | ||
| - confess "Error, have 5' partial protein when 5prime partials weren't allowed!\n$protein\n"; | ||
| + confess "Error, have 5' partial protein when 5prime partials weren't allowed!\n$protein\n$cdna_seq\n"; | ||
| } | ||
| } | ||
| if ($protein !~ /\*$/) { | ||
| # this would require that we allowed for 3prime partials | ||
| unless ($partial_info_href->{"3prime"}) { | ||
| - confess "Error, have 3' partial protein when 3prime partials weren't allowed!\n$protein\n"; | ||
| + confess "Error, have 3' partial protein when 3prime partials weren't allowed!\n$protein\n$cdna_seq\n"; | ||
| } | ||
| } | ||
| @@ -80,6 +88,9 @@ sub create_gene_obj { | ||
| #### | ||
| sub get_cdna_seq { | ||
| my ($gene_struct, $assembly_seq_ref) = @_; | ||
| + | ||
| + my $seq_length = length($$assembly_seq_ref); | ||
| + | ||
| my (@end5s) = sort {$a<=>$b} keys %$gene_struct; | ||
| my $strand = "?"; | ||
| foreach my $end5 (@end5s) { | ||
| @@ -90,7 +101,7 @@ sub get_cdna_seq { | ||
| } | ||
| if ($strand eq "?") { | ||
| print Dumper ($gene_struct); | ||
| - die "ERROR: I can't determine what orientation the cDNA is in!\n"; | ||
| + confess "ERROR: I can't determine what orientation the cDNA is in!\n"; | ||
| } | ||
| print NOTES "strand: $strand\n"; | ||
| my $cdna_seq; | ||
| @@ -99,6 +110,11 @@ sub get_cdna_seq { | ||
| foreach my $end5 (@end5s) { | ||
| #print $end5; | ||
| my $end3 = $gene_struct->{$end5}; | ||
| + | ||
| + if ($end5 > $seq_length || $end3 > $seq_length) { | ||
| + confess "Error, coords are out of bounds of sequence length: $seq_length:\n" . Dumper(\$gene_struct); | ||
| + } | ||
| + | ||
| my ($coord1, $coord2) = sort {$a<=>$b} ($end5, $end3); | ||
| my $exon_seq = substr ($$assembly_seq_ref, $coord1 - 1, ($coord2 - $coord1 + 1)); | ||
| $cdna_seq .= $exon_seq; | ||
| @@ -114,21 +130,26 @@ sub get_cdna_seq { | ||
| #### | ||
| sub create_gene { | ||
| my ($gene_struct_mod, $cds_pointer_lend, $cds_pointer_rend) = @_; | ||
| + | ||
| + #use Data::Dumper; | ||
| + #print STDERR Dumper($gene_struct_mod) . "CDS: $cds_pointer_lend, $cds_pointer_rend\n"; | ||
| + | ||
| my $strand = $gene_struct_mod->{strand}; | ||
| - my @exons = @{$gene_struct_mod->{exons}}; | ||
| + my @exons = sort {$a->[0]<=>$b->[0]} @{$gene_struct_mod->{exons}}; | ||
| if ($strand eq '-') { | ||
| @exons = reverse (@exons); | ||
| } | ||
| my $mRNA_pointer_lend = 1; | ||
| my $mRNA_pointer_rend = 0; | ||
| my $gene_obj = new Gene_obj(); | ||
| foreach my $coordset_ref (@exons) { | ||
| - my ($coord1, $coord2) = @$coordset_ref; | ||
| + my ($coord1, $coord2) = sort {$a<=>$b} @$coordset_ref; | ||
| my ($end5, $end3) = ($strand eq '+') ? ($coord1, $coord2) : ($coord2, $coord1); | ||
| my $exon_obj = new mRNA_exon_obj($end5, $end3); | ||
| my $exon_length = ($coord2 - $coord1 + 1); | ||
| $mRNA_pointer_rend = $mRNA_pointer_lend + $exon_length - 1; | ||
| ## see if cds is within current cDNA range. | ||
| + #print STDERR "mRNA coords: $mRNA_pointer_lend-$mRNA_pointer_rend\n"; | ||
| if ( $cds_pointer_rend >= $mRNA_pointer_lend && $cds_pointer_lend <= $mRNA_pointer_rend) { #overlap | ||
| my $diff = $cds_pointer_lend - $mRNA_pointer_lend; | ||
| my $delta_lend = ($diff >0) ? $diff : 0; | ||
View
34
PerlLib/GTF_utils.pm
| @@ -35,8 +35,12 @@ sub index_GTF_gene_objs_from_GTF { | ||
| my $gene_id = $gene_obj->{TU_feat_name}; | ||
| + | ||
| if ($seen{$gene_id}) { | ||
| - confess "Error, already processed gene: $gene_id\nearlier: " . $seen{$gene_id}->toString() . " but now as: " . $gene_obj->toString(); | ||
| + confess "Error, already processed gene: $gene_id\n" | ||
| + . " here: " . $gene_obj->toString() . "\n" | ||
| + . " and earlier: " . $seen{$gene_id}->toString(); | ||
| + | ||
| } | ||
| $seen{$gene_id} = $gene_obj; | ||
| @@ -71,6 +75,7 @@ sub GTF_to_gene_objs { | ||
| my %gene_id_to_name; | ||
| my %gene_id_to_seq_name; | ||
| + my %gene_id_to_gene_name; | ||
| open (my $fh, $gtf_filename) or die "Error, cannot open $gtf_filename"; | ||
| while (<$fh>) { | ||
| @@ -94,24 +99,25 @@ sub GTF_to_gene_objs { | ||
| $gene_id_to_source{$gene_id} = $source; | ||
| - my $transcript_id; | ||
| - if ($annot =~ /transcript_id \"([^\"]+)\"/) { | ||
| - $transcript_id = $1; | ||
| - } | ||
| - else { | ||
| - #print STDERR "\nWARNING: cannot get transcript_id from $annot of line\n$_\n"; | ||
| - next; | ||
| - } | ||
| + $annot =~ /transcript_id \"([^\"]+)\"/ or confess "Error, cannot get transcript_id from $annot of line\n$_"; | ||
| + my $transcript_id = $1; | ||
| if ($annot =~ /name \"([^\"]+)\"/) { | ||
| my $name = $1; | ||
| $gene_id_to_name{$gene_id} = $name; | ||
| } | ||
| + | ||
| + my $gene_name = ""; | ||
| + if ($annot =~ /gene_name \"([^\"]+)\"/) { | ||
| + $gene_name = $1; | ||
| + $gene_id_to_gene_name{$gene_id} = $gene_name; | ||
| + } | ||
| + | ||
| # print "gene_id: $gene_id, transcrpt_id: $transcript_id, $type\n"; | ||
| if ($type eq 'transcript' || $type eq 'gene') { next; } # capture by exon coordinates | ||
| - | ||
| + | ||
| if ($type eq 'CDS' || $type eq 'stop_codon' || $type eq 'start_codon') { | ||
| push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{CDS}}, [$end5, $end3] ); | ||
| @@ -120,11 +126,11 @@ sub GTF_to_gene_objs { | ||
| elsif ($type eq "exon" || $type =~ /UTR/) { | ||
| push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] ); | ||
| } | ||
| - elsif ($type =~ /rna/i) { | ||
| + else { | ||
| ## assuming noncoding feature | ||
| push (@{$noncoding_features{$seqname}->{$type}->{$gene_id}->{$transcript_id}}, [$end5, $end3] ); | ||
| } | ||
| - | ||
| + | ||
| } | ||
| close $fh; | ||
| @@ -188,6 +194,9 @@ sub GTF_to_gene_objs { | ||
| else { | ||
| $gene_obj->{com_name} = $transcript_id; | ||
| } | ||
| + if (my $gene_name = $gene_id_to_gene_name{$gene_id}) { | ||
| + $gene_obj->{gene_name} = $gene_name; | ||
| + } | ||
| $gene_obj->{asmbl_id} = $seqname; | ||
| $gene_obj->{source} = $source; | ||
| @@ -203,7 +212,6 @@ sub GTF_to_gene_objs { | ||
| foreach my $other_gene_obj (@gene_objs) { | ||
| $template_gene_obj->add_isoform($other_gene_obj); | ||
| } | ||
| - $template_gene_obj->refine_gene_object(); | ||
| push (@top_gene_objs, $template_gene_obj); | ||
| # print $template_gene_obj->toString(); | ||
View
5
PerlLib/Thread_helper.pm
| @@ -202,7 +202,10 @@ sub wait_for_all_threads_to_complete { | ||
| $self->_add_error_thread($thread); | ||
| $status = "ERROR"; | ||
| } | ||
| - $self->report_thread_info($thread_id, $status); | ||
| + $self->{thread_id_timing}->{end}->{$thread_id} = time(); | ||
| + if ($THREAD_MONITORING) { | ||
| + $self->report_thread_info($thread_id, $status); | ||
| + } | ||
| } | ||
| @{$self->{current_threads}} = (); # clear them out. | ||
View
3
misc_utilities/gff3_file_toString.pl
| @@ -24,7 +24,8 @@ | ||
| foreach my $gene_id (@gene_ids) { | ||
| my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; | ||
| - | ||
| + | ||
| + print "// $asmbl_id, gene: $gene_id\n"; | ||
| print $gene_obj_ref->toString(); | ||
| } | ||
View
13
misc_utilities/gtf_file_to_proteins.pl
| @@ -84,6 +84,8 @@ | ||
| foreach my $isoform ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) { | ||
| my $isoform_id = $isoform->{Model_feat_name}; | ||
| + my $gene_id = $isoform->{TU_feat_name}; | ||
| + | ||
| my $seq = ""; | ||
| @@ -105,7 +107,16 @@ | ||
| my $com_name = $isoform->{com_name} || ""; | ||
| - print ">$isoform_id\n$seq\n"; | ||
| + my $gene_name = $isoform->{gene_name}; | ||
| + my $header = ">$isoform_id $gene_id"; | ||
| + if ($gene_name) { | ||
| + $header .= " $gene_name "; | ||
| + } | ||
| + if ($com_name) { | ||
| + $gene_name .= " $com_name"; | ||
| + } | ||
| + | ||
| + print "$header\n$seq\n"; | ||
| } | ||
| } | ||
| } | ||
View
1
pasa_conf/pasa.CONFIG.template
| @@ -6,6 +6,7 @@ | ||
| # server actively running MySQL | ||
| MYSQLSERVER=localhost # or server.com | ||
| +# Pass socket connections through Perl DBI syntax e.g. MYSQLSERVER=mysql_socket=/tmp/mysql.sock | ||
| # read-write username and password | ||
| MYSQL_RW_USER=xxxxxx | ||
View
8
pasa_conf/sample_test.conf
| @@ -20,11 +20,11 @@ USE_PASA_DB_SETUP_HOOK=false | ||
| ##################################### | ||
| # server actively running MySQL | ||
| -MYSQLSERVER=bhaas-lx | ||
| +MYSQLSERVER=localhost | ||
| # read-only username and password | ||
| -MYSQL_RO_USER=access | ||
| -MYSQL_RO_PASSWORD=access | ||
| +MYSQL_RO_USER=pasa_access | ||
| +MYSQL_RO_PASSWORD=pasa_access | ||
| # read-write username and password | ||
| MYSQL_RW_USER=access | ||
| @@ -35,7 +35,7 @@ MYSQL_RW_PASSWORD=access | ||
| # Web browser navigation settings: ######### | ||
| ############################################ | ||
| -BASE_PASA_URL=http://bhaas-lx:8080/cgi-bin/ | ||
| +BASE_PASA_URL=http://localhost:8080/cgi-bin/where_you_installed_pasa_cgi_root | ||
| ############################################# | ||
View
BIN
pasa_cpp/pasa
Binary file not shown.
View
4
schema/cdna_alignment_mysqlschema
| @@ -110,7 +110,7 @@ CREATE TABLE alt_splice_tokens ( | ||
| CREATE TABLE annotation_admin ( | ||
| version_id int(11) NOT NULL auto_increment, | ||
| - date datetime NOT NULL default '0000-00-00 00:00:00', | ||
| + date datetime NOT NULL, | ||
| PRIMARY KEY (version_id) | ||
| ) ; | ||
| @@ -120,7 +120,7 @@ CREATE TABLE annotation_admin ( | ||
| CREATE TABLE annotation_compare ( | ||
| compare_id int(11) NOT NULL auto_increment, | ||
| - date datetime NOT NULL default '0000-00-00 00:00:00', | ||
| + date datetime NOT NULL, | ||
| annotation_version int(10) unsigned NOT NULL default '0', | ||
| PRIMARY KEY (compare_id) | ||
| ) ; | ||
View
7
schema/notes
| @@ -16,7 +16,8 @@ Using the backticks instead of single quotes in the db name pattern is mandatory | ||
| ------ | ||
| create user 'pasa_access' identified by 'pasa_access'; | ||
| -grant select on *.* to 'pasa_access@%'; | ||
| -grant all on *.* to 'pasa_write@%'; | ||
| +create user 'pasa_write' identified by '...'; | ||
| + | ||
| +grant select on *.* to 'pasa_access'; | ||
| +grant all on *.* to 'pasa_write'; | ||
| -grant all on *.* to 'pasa_write'@'localhost' identified by '...'; | ||
View
2
scripts/Launch_PASA_pipeline.pl
| @@ -133,7 +133,7 @@ | ||
| # | ||
| # | ||
| # // Jump-starting or prematurely terminating | ||
| -# -x flag, print cmds only, don\'t process anything. (useful to get indices for -x or -e opts below) | ||
| +# -x flag, print cmds only, don\'t process anything. (useful to get indices for -s or -e opts below) | ||
| # -s <int> pipeline index to start running at (avoid rerunning searches). | ||
| # -e <int> pipeline index where to stop running, and do not execute this entry. | ||
| # | ||
View
3
scripts/build_comprehensive_transcriptome.dbi
| @@ -355,6 +355,9 @@ sub get_TDN_component { | ||
| # trinity de novo 2.0.0+ style | ||
| $component = $1; | ||
| } | ||
| + elsif ($cdna_acc =~ /^(TRINITY\S+_c\d+_g\d+)/) { | ||
| + $component = $1; | ||
| + } | ||
| else { | ||
| confess "Error, cannot extract component info from trinity de novo assembly: $cdna_acc"; | ||
| } | ||
View
29
scripts/cDNA_annotation_comparer.dbi
| @@ -34,6 +34,9 @@ use vars qw ($opt_h $opt_p $DEBUG $opt_M $opt_G $MIN_PERCENT_OVERLAP $MIN_PERCEN | ||
| $STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE $MIN_PERCENT_ALIGN_LENGTH | ||
| $MIN_PERCENT_LENGTH_FL_COMPARE $TRUST_FL_STATUS $CLUSTER_ID_ONLY); | ||
| + | ||
| +my $RESTRICT_SINGLE_CONTIG = undef; | ||
| + | ||
| my $CPU = 2; | ||
| &GetOptions ('M=s' => \$opt_M, | ||
| @@ -62,6 +65,8 @@ my $CPU = 2; | ||
| ## hidden opts for debugging purposes | ||
| 'CLUSTER_ID_ONLY=i' => \$CLUSTER_ID_ONLY, # the only cluster ID to process. | ||
| + 'RESTRICT_SINGLE_CONTIG=s' => \$RESTRICT_SINGLE_CONTIG, | ||
| + | ||
| ); | ||
| @@ -140,6 +145,12 @@ _EOH_ | ||
| ; | ||
| +### Hidden options | ||
| +### | ||
| +## --RESTRICT_SINGLE_CONTIG <string> genomic contig to restrict analysis to | ||
| +### | ||
| + | ||
| + | ||
| if ($opt_h) {die $usage;} | ||
| @@ -291,7 +302,9 @@ main: { | ||
| print "### There are $num_contigs genomic contigs which will be examined.\n"; | ||
| foreach my $result (@results) { | ||
| my $asmbl_id = $result->[0]; | ||
| - | ||
| + | ||
| + if ($RESTRICT_SINGLE_CONTIG && $asmbl_id ne $RESTRICT_SINGLE_CONTIG) { next; } | ||
| + | ||
| $thread_helper->wait_for_open_thread(); | ||
| my $thread = threads->create('process_contig', $asmbl_id); | ||
| @@ -308,7 +321,17 @@ main: { | ||
| if (@failures) { | ||
| ## examine them... these are the same threads created above, use use the threads api to access info about them | ||
| ## such as error messages | ||
| - die "Error, there were " . scalar(@failures) . " threads (contig jobs) that failed... See error messages above in order to troubleshoot further"; | ||
| + | ||
| + print STDERR "Error, there were " . scalar(@failures) . " threads (contig jobs) that failed... See error messages above in order to troubleshoot further"; | ||
| + | ||
| + foreach my $failed_thread (@failures) { | ||
| + my $thread_id = $failed_thread->tid; | ||
| + print STDERR "Failed thread ($thread_id) info:\n"; | ||
| + $thread_helper->report_thread_info($thread_id, "FAILED"); | ||
| + } | ||
| + | ||
| + exit(1); | ||
| + | ||
| } | ||
| else { | ||
| ## all good! | ||
| @@ -1444,7 +1467,7 @@ sub get_annotated_gene_models_on_contig { | ||
| my @models; | ||
| # get the annotated working models for that chromosome: | ||
| - my $query = "select a.gene_id, min(a.lend), max(a.rend), a.orient from annotation_store a where a.annotdb_asmbl_id = ? and annotation_version = ? group by a.gene_id\n"; | ||
| + my $query = "select a.gene_id, min(a.lend), max(a.rend), a.orient from annotation_store a where a.annotdb_asmbl_id = ? and annotation_version = ? group by a.gene_id, a.orient\n"; | ||
| my @results = &Mysql_connect::do_sql_2D($dbproc, $query, $asmbl_id, $annot_version); | ||
| foreach my $result (@results) { | ||
Oops, something went wrong.