/
refold.pl
executable file
·328 lines (267 loc) · 8.47 KB
/
refold.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#!/usr/bin/perl -w
# -*-Perl-*-
# Last changed Time-stamp: <2006-08-15 22:04:27 ivo>
# after predicting a conensus structure using RNAalifold,
# refold the indivual sequences, using the consensus structure as constraint
# read a clustal fromat alignment, plus either the standart output of
# RNAalifold or a dot plot produced by "RNAalifold -p"
# usage refold.pl [-t thresh] clustal.aln alidot.ps
# or refold.pl clustal.aln clustal.alifold
use Getopt::Long;
#use POSIX;
#use GD;
use strict;
my $thresh = 0.9;
my $help = 0;
my $TURN = 3;
GetOptions ('-t=f' => \$thresh,
'turn' => \$TURN,
"help"=>\$help,
"h"=>\$help,
);
if ($help){
print "\ refold.pl [-t threshold] myseqs.aln alidot.ps | RNAfold -C\n";
print "or refold.pl [--turn looplength] myseqs.aln myseqs.alifold | RNAfold -C\n";
print " -t ... use only pairs with p>thresh (default: 0.9)\n";
print " --turn ... use only pairs closing hairpins with length > looplength (default: 3)\n\n";
exit(1);
}
my $aln = readClustal();
my $len = length($aln->[0]->{seq});
$_ = <>;
#print STDERR "$_\n";
my $constraint = (/^%\!PS/) ? read_dot() : read_alifold();
#print STDERR "$constraint\n";
foreach my $a (@$aln) {
print "> $a->{name}\n";
# remove gaps
my $seq = $a->{seq};
my $cons = $constraint;
my @pt = make_pair_table($cons);
my %pair = ("AU" => 5,
"GC" => 1,
"CG" => 2,
"UA" => 6,
"GU" => 3,
"GT" => 3,
"TG" => 4,
"UG" => 4,
"AT" => 5,
"TA" => 6);
for my $p (0..length($seq)-1) {
# remove non-compatible pairs as well as pairs to a gap position
my $c = substr($seq,$p,1);
if ($c eq '-') {
substr($cons,$p,1) = 'x'; # mark for removal
substr($cons,$pt[$p],1) = '.'
if $pt[$p]>0 && substr($cons,$pt[$p],1) ne 'x'; # open pair
}
elsif ($pt[$p]>$p) {
substr($cons,$p,1) = substr($cons,$pt[$p],1) = '.'
unless exists $pair{$c . substr($seq,$pt[$p],1)};
}
}
# print STDERR length($seq), length($cons), "\n";
$cons =~ s/x//g;
$seq =~ s/-//g;
# remove all hairpins with length < TURN
@pt = make_pair_table($cons);
for my $p (0..length($seq)-1) {
next if $p > $pt[$p];
substr($cons,$p,1) = substr($cons,$pt[$p],1) = '.'
if $pt[$p] - $p - 1 < $TURN;
}
# print STDERR length($seq), length($cons), "\n";
print "$seq\n$cons\n";
}
sub make_pair_table {
#indices start at 0 in this version!
my $structure = shift;
my (@olist, @table);
my ($hx,$i) = (0,0);
foreach my $c (split(//,$structure)) {
if ($c eq '.') {
$table[$i]= -1;
} elsif ($c eq '(') {
$olist[$hx++]=$i;
} elsif ($c eq ')') {
my $j = $olist[--$hx];
die ("unbalanced brackets in make_pair_table") if ($hx<0);
$table[$i]=$j;
$table[$j]=$i;
}
$i++;
}
die ("too few closed brackets in make_pair_table") if ($hx!=0);
return @table;
}
sub read_alifold {
while (<>) {
return $1 if /^([(.)]+)/;
}
}
sub read_dot {
my $cons = '.' x $len;
while(<>) {
next if /^%/;
next unless /lbox$/;
my @F = split;
next if $F[5] < $thresh;
substr($cons,$F[3]-1,1) = '(';
substr($cons,$F[4]-1,1) = ')';
}
return $cons;
}
######################################################################
#
# readClustal(filehandle)
#
# Reads Clustal W formatted alignment file and returns it in list of
# hash references with keys "name" and "seq" for the name and the sequence,
# resp.
#
# Fixme: the code below assumes all uppercase sequences
######################################################################
sub readClustal{
# my $fh=shift;
my @out=();
my (%order, $order, %alignments);
while(<>) {
next if ( /^\s+$/ );
my ($seqname, $aln_line) = ('', '');
if( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) {
# clustal 1.4 format
($seqname,$aln_line) = ("$1/$2-$3",$4);
} elsif( /^(\S+)\s+([A-Z\-]+)\s*$/ ) {
($seqname,$aln_line) = ($1,$2);
} else {
next;
}
if( !exists $order{$seqname} ) {
$order{$seqname} = $order++;
}
$alignments{$seqname} .= $aln_line;
last if eof;
}
foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
if( $name =~ /(\S+):(\d+)-(\d+)/ ) {
(my $sname,my $start, my $end) = ($1,$2,$3);
} else {
(my $sname, my $start) = ($name,1);
my $str = $alignments{$name};
$str =~ s/[^A-Za-z]//g;
my $end = length($str);
}
my $seq=$alignments{$name};
push @out, {name=>$name,seq=>$seq};
}
return [@out];
}
######################################################################
#
# getPairs(\@aln alnref, $ss string)
#
# Evalutates the pairing of an alignment according to a given
# consensus secondary structure
#
# Returns list of all base pairs which is a hash with the following
# keys:
#
# open ... column in the alignment of the first base in the pair "("
# close ... column in the alignment of the second base in the pair ")"
# all ... list of all basepairs in the different sequences in the alignment
# pairing ... list of all different pairing basepairs
# nonpairing ... list of all incompatible basepairs
#
######################################################################
sub getPairs{
my @inputAln=@{$_[0]};
my $ss=$_[1];
# return nothing if there are no pairs
if (!($ss=~tr/(/(/)){
return ();
}
my @aln=();
foreach my $row (@inputAln){
my $seq=$row->{seq};
$seq=uc($seq);
$seq=~s/T/U/g;
my @tmp=split(//,$seq);
push @aln,\@tmp;
}
my @ss=split(//,$ss);
my @pairs=();
my @stack=();
foreach my $column (0..$#ss){
my $currChar=$ss[$column];
if ($currChar eq '('){
push @stack,$column;
}
if ($currChar eq ')'){
my $openedCol=pop @stack;
push @pairs,{open=>$openedCol,close=>$column};
}
}
@pairs=sort {$a->{open} <=> $b->{open}} @pairs;
foreach my $i (0..$#pairs){
#print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n";
my @all=();
my @pairing=();
my @nonpairing=();
for my $j (0..$#aln){
my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}];
push @all,$currPair;
}
for my $pair (@all){
if (($pair eq 'AU') or
($pair eq 'UA') or
($pair eq 'GC') or
($pair eq 'CG') or
($pair eq 'UG') or
($pair eq 'GU')){
push @pairing, $pair;
} elsif ($pair eq "--"){
# do nothing
} else {
push @nonpairing,$pair;
}
}
undef my %saw;
my @uniquePairing = grep(!$saw{$_}++, @pairing);
$pairs[$i]->{all}=[@all];
$pairs[$i]->{pairing}=[@uniquePairing];
$pairs[$i]->{nonpairing}=[@nonpairing];
}
return @pairs;
}
=head1 NAME
refold.pl - refold using consensus structure as constraint
=head1 SYNOPSIS
refold.pl [-t thresh] file.aln alidot.ps
refold.pl [--turn length] file.aln file.alifold
=head1 DESCRIPTION
refold.pl reads an alignment in CLUSTAL format, and a consensus
secondary structure, either in the form of the standard output of
C<RNAalifold>, or in the form of a dot plot as produced by
C<RNAalifold -p>. For each sequence in the alignment it writes the
name, sequence, and constraint structure to stdout in a format
suitable for piping into C<RNAfold -C>.
The constraint string is produced by removing from the consensus
structure all gaps, as well as all pairs not compatible with the
particular sequence. If the structure is read from a dot plot file,
only pairs with probability > then some threshold (default 0.9) are
used.
=head1 OPTIONS
=over 4
=item B<-t> I<thresh>
use only pairs with p>thresh as constraint. Only applicable when
reading consensus structure from a dot plot.
=item B<--turn> I<length>
use only pairs closing a hairpin with looplength >length. This option
allows for setting minimal loop length of hairpin loops (default: 3)
=back
=head1 EXAMPLE
RNAalifold test.aln > test.alifold
refold.pl test.aln test.alifold > test.cfold
RNAfold -C < test.cfold
=cut