/
orderchr
968 lines (788 loc) · 29.3 KB
/
orderchr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
#!/bin/env perl
=pod
=head1 NAME
orderchr - determine an ideogram order that minimizes the cross-over of links
=head1 SYNOPSIS
orderchr -links linkfile.txt
-karyotype karyotype.txt
{ -shuffle_file chrs_to_shuffle.txt | -shuffle LIST | -shuffle_rx REGEX_LIST }
{-static LIST} {-static_rx REGEX_LIST}
{-init_order LIST} {-init_order_rx REGEX_LIST}
=head1 DESCRIPTION
By examining the frequencies of ideogram relationships
defined in the link file, this script suggests a new order for
ideograms that results in fewer cross-overs between links. Simulated
annealing is used to optimize the ideogram order (read below about
parameters). Run the simulation a couple of times to check convergence
(finding a global minimum is not guaranteed).
For a visual explanation of what is happening, see Tutorial 9.2 on the
Circos web site (mkweb.bcgsc.ca/circos).
=head1 DEFINING WHICH IDEOGRAMS TO SHUFFLE
The set of ideograms to shuffle is specified by either (a) link
data, whereby all ideograms that have links are subject to shuffling
(b) -shuffle_file, whereby only those ideograms listed in the file
are shuffled (c) -shuffle, whereby the set of ideograms to shuffle
is specified by a list, and (d) -shuffle_rx, whereby only those
ideograms that match a regular expression are shuffled. Any
ideogram that is not identified by one of these methods does not
participate in the shuffling process. Specifically, any links to/from
such ideograms are not considered when minimizing link crossing.
=head2 MODE 1 - shuffle set defined by link data
> orderchr -links linkfile.txt
All ideograms mentioned in the -links file will be subject to
reordering. The initial order will be taken from order of appearance
in the karyotype file.
=head2 MODE 2 - shuffle set defined by file
> orderchr -links linkfile.txt -shuffle_file chrs_to_shuffle.txt
The set of ideograms to shuffle is given in the -shuffle_file file,
which contains one ideogram per line. For example
> cat chrs_to_shuffle.txt
chr1
chr5
chr12
chr17
...
The initial order will be taken from order of appearance in the file.
=head2 MODE 3 - shuffle set defined by regular expression
> orderchr -links linkfile.txt -shuffle_rx chr1
Same as MODE 1, except that ideogram list will be filtered using the
regular expression and only those ideograms that match the regular
expression are shuffled.
In this example, ideograms matching "chr1" will be shuffled (e.g. chr1, chr10, chr11, etc).
=head2 MODE 4 - shuffle set defined by list
> orderchr -links linkfile.txt -shuffle chr1,chr2,chr6,chr7,chr10
Same as MODE 3, except that ideograms are specified by a list.
In this example, ideograms chr1,chr2,chr6,chr7, and chr10 will be shuffled.
=head2 MODE 5 - multimode
You can combine -shuffle_file, -shuffle_rx and -shuffle to additively define the shuffle list.
=head1 DEFINING INITIAL ORDER
The initial order of ideograms can be defined in two ways. First,
the method that is used to specify which ideograms to shuffle will
dictate the initial order. Modes 1 and 2 (see above) use the order of ideograms
as they appear in the karyotype file. Mode 3 (see above) uses the order from the
shuffle file.
You can override the initial order using the -init_order
parameter. The value of this parameter is expected to a
comma-delimited list of ideograms, which may be the full set or a
subset of ideograms.
For example, if the entire set of ideograms to shuffle is chr1..chr5, then you can specify the initial order which explicitly orders each ideogram
-init_order chr2,chr5,chr1,chr3,chr4
or just a subset
-init_order chr2,chr5
In the latter case, the final order will be
{ chr2,chr5 } , { chr1,chr3,chr4 }
comprised of two order groups: leading group of ideograms as ordered
by -init_order and a group of remaining ideograms, in order of
appearance as set by parameters in the section DEFINING WHICH
IDEOGRAMS TO SHUFFLE.
If a ideogram mentioned in -init_order is not a candidate for
shuffling, its mention in the order string will be ignored.
The option -init_order_rx works just like -init_order, except that the list a list of regular expressions rather than chromsome names. For example,
-init_order chr1,chr2
is equivalent to
-init_order { chrs matching /chr1/ },{ chrs matching /chr2/ }
and for the canonical human genome with standard order this would be
-init_order chr1,chr10,chr11,chr2,chr20,chr21,chr22
Since this is a subset of ideograms, the final initial order will be automatically completed by ideograms from the karyotype file that were not explicitly ordered
chr1, chr10, chr11, chr2, chr20, chr21, chr22, chr3..chr9, chr12..chr19, chrX, chrY
If both -init_order_rx and -init_order are defined, order is initially defined by -init_order_rx and then refined using -init_order. Thus
-init_order chr10,chr20,chrx -init_order_rx chr1,chr2
will result in
chr10, chr20, chrx, chr1, chr11, chr2, chr21, chr22, chr3..chr9, chr12..chr19 ,chrY
=head1 DEFINING WHICH IDEOGRAMS TO REMAIN ANCHORED
After the set of ideograms to shuffle has been defined, and the
initial order has been set, you can define a subset of ideograms to
remain in the same order (static) throughout the shuffling process.
The difference between a ideogram (a) not being part of a shuffle
set and (b) being part of a shuffle set, but remain static, is that in
the former, links to ideograms do not play a role in the ordering
process whereas in the latter case links to these ideograms
contribute to the shuffle score. Thus, ideograms which are static
have all non-static ideograms shuffled around them in order to
minimize link crossover.
Defining static ideograms is done by a comma-delimited list of regular expressions
> orderchr -links linkfile.txt -static_rx chr1
In this example, all ideograms matching the regular expression chr1
will not have their order adjusted. Any links to/from these
ideograms will contribute to the total link crossing score, but the
ideograms themselves will not be moved. For example, if the original order of ideograms is
chr1,chr2,chr3,chr10,chr11,chr20,chr21
then any shuffle solution will have the order
chr1,-,-,chr10,chr11,-,-
with chr1, chr10 and chr11 remaining fixed.
To define multiple regular expressions, use a list of regular expressions.
> orderchr -links linkfile.txt -static_rx chr1,x,y
Like with -init_order, you can use the ideogram names to define static entries using -static.
-static chr1,chr2,chr3
will keep ideograms chr1, chr2 and chr3 always in the same position. You can combine -static_rx and -static
-static_rx chr1,chr2 -static chrx,chry
in which case all ideograms that match either the regular
expressions defined by -static_rx or the names defined by -static will
be kept in the same position during shuffling.
=head1 CONTROLING THE OPTIMIZATION
The order optimization process comprises one or more rounds. Each round is defined by a <round> block in the <simulation> block
<simulation>
<round>
# settings for round 1
</round>
<round>
# settings for round 2
</round>
...
</simulation>
A round can be either a warmup (read below), or a full simulated
annealing process (read below). The outcome of the warmup is
deterministic, and thus the warmup should only be used as the first
(optional) round.
=head2 Warmup Round
During the warmup round, the initial order of the ideograms is
defined based on the degree of connectivity between ideogram
pairs.
This warmup is most suited for data sets in which most relationships
between ideograms are many-to-one (e.g., ideogram A has links to
many ideograms B,C,D,... but each of B,C,D generally only links to
A). Many-to-one data sets are common for alignments (e.g. chr A
corresponds to the ideogram whereas and points to ctg A, B, C,
... all sequence contigs that map to disjoint regions on chr A).
The warmup algorithm is as follows. The ideogram (chrA) with the
most links is selected first and used to initialize the new order. A
list of all links to chrA is created, grouped by ideogram, and
sorted based on the average position of the link on chrA. Ideograms
are added to the new order based on descending order of grouped link
position. Once all ideograms are placed, the next unplaced
ideogram with the most links is selected and the process continues
until all ideograms are placed.
The warmup is deterministic - it will result in the same order each
time. It is insensitive to the initial order, or values of -static and
-static_rx.
<round>
warmup = yes
</round>
=head2 Stochastic Optimization Round
After the optional warmup round, all other rounds should be of
stochastic type (this is the default round type, if warmup=yes is not
set).
Parameters for the round are defined as follows
<round>
iterations = 1000
max_flips = 10
min_flips = 2
temp0 = 0.01
<round>
For the details of each parameter, read the section below. You can set parameter values to be relative to values of the previous round by prefixing the parameter with "r". For example,
<round>
iterations = 1000
...
temp0 = 0.01
<round>
<round>
iterations = r2
...
temp0 = r0.5
<round>
<round>
iterations = r2
...
temp0 = r0.5
<round>
defines three rounds. The first round has 1000 iterations with
temp0=0.01. The second round has 2x iterations (2000) and a value of
temp0 of 0.5*0.01=0.005. The third round has again 2x iterations
(4000) and temp0 of 0.5*0.005=0.0025.
Relative parameter values are very useful for additional rounds when
the transition probability is decreased (temp0 is lowered). You can
decrease temp0 in relative steps, without needing to remember what the
previous value was. This allows you to create a multi-round
optimization schedule with all parameter defined in a single place
(first round).
The solution at the end of a round is used as the initial order for the next round.
=head1 SIMULATED ANNEALING
This method is an optimization method that encourages the discovery of
a global minimum by traversing the space of solutions with a small
(and decreasing as simulation runs) chance of visiting less desirable
solutions.
There are three parameters that control the optimization.
=head2 iterations
The number of iterations to perform. At each iteration, the current
solution is randomly modified and either accepted or rejected.
=head2 max_flips, min_flips
The optimization run is split into max_flips-min_flips+1 equal-sized
intervals. During each iterval, the number of random
ideogram pair swaps in the solution is given by
min_flips + (max_flips-min_flips)*(1-t)
where t is a relative round completion time t=0..1 at the current iteration.
For example, if max_flips is 5 and min_flips is 2 and iterations=1000. Then the number of random pair swaps is
iteration 1-249 5
iteration 250-499 4
iteration 500-749 3
iteration 750-1000 2
I suggest starting with a value that corresponds to 5% of the
ideograms. For example, if you have 100 ideograms, use max_flips=5
to start. It's also a good idea to set min_flips=1 for the last round
to avoid abandoning the solution (remember that in simulated annealing
it is possible to discard a solution for a worse solution).
=head2 temp0
This parameter determines the probability of a transition to a less desirable solution. The transition probability is
p(dE) = temp0*exp( - dE/t )
where t=1..0 over the length of the simulation and dE is the relative change in the desirability of two solutions.
If temp0=1, then the probability of accepting a solution that is 10% worse (e.g. dE=0.1) is
p(0.1) = exp (-0.1/1) = 90% at start of simulation
= exp (-0.1/0.5) = 82% half way through simulation
= exp (-0.1/0.1) = 37% 90% of the way through simulation
By lowering temp0, you lower the probability of transition to a less desirable solution.
Do not adjust temp0 unless you feel that the simulation is (a) not
traversing the solution space sufficiently - in which case make temp0
larger or (b) too many low-quality solutions are accepted - in which
case make temp0 smaller.
=head2 optimize = minimize|maximize
Most of the time you'll want to adjust the ideogram order in a way
to minimize the number of crossing links. However, you can set to maximize the number of crossing links by setting
optimize = maximize
=head1 SIMULATION SCHEDULE
=head2 No Warmup, Single Stochastic Round
<simulation>
<round>
iterations = 1000
max_flips = 10 # or set this to ~5% of your ideograms
min_flips = 1
temp0 = 0.01
</round>
</simulation>
=head2 No Warmup, Multiple Stochastic Rounds
The purpose of rounds 2 and 3 is to successively decrease the
transition probability to worse solutions and also decrease the degree
to which successive candidate solutions vary from the current
solution. In these rounds, a more careful search is carried out around
the solution provided in round 1.
<simulation>
<round>
iterations = 1000
max_flips = 10 # or set this to ~5% of your ideograms
min_flips = 1
temp0 = 0.01
</round>
</simulation>
<simulation>
<round>
iterations = r2 # 2000
max_flips = 2
min_flips = 1
temp0 = r0.5 # 0.005
</round>
<round>
iterations = r2 # 4000
max_flips = 1
min_flips = 1
temp0 = r0.1 # 0.0005
</round>
</simulation>
=head2 Warmup, Multiple Stochastic Rounds
<simulation>
<round>
warmup = yes
</round>
<round>
iterations = 1000
max_flips = 10 # or set this to ~5% of your ideograms
min_flips = 1
temp0 = 0.01
</round>
</simulation>
<simulation>
<round>
iterations = r2 # 2000
max_flips = 2
min_flips = 1
temp0 = r0.5 # 0.005
</round>
<round>
iterations = r2 # 4000
max_flips = 1
min_flips = 1
temp0 = r0.1 # 0.0005
</round>
</simulation>
=head1 HISTORY
=over
=item * 3 Feb 2009 v0.11
Minor adjustments in documentation.
=item * 14 July 2008
Expanded documentation and added _rx parameters.
=item * 8 July 2008
Started and versioned.
=back
=head1 BUGS
=head1 AUTHOR
Martin Krzywinski
=head1 CONTACT
Martin Krzywinski
Genome Sciences Centre
Vancouver BC Canada
www.bcgsc.ca
martink@bcgsc.ca
=cut
################################################################
#
# Copyright 2002-2008 Martin Krzywinski
#
# This file is part of the Genome Sciences Centre Perl code base.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this script; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
################################################################
################################################################
# Martin Krzywinski (martink@bcgsc.ca)
# 2008
################################################################
use strict;
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use IO::File;
use List::MoreUtils qw(uniq);
use Math::VecStat qw(sum min max average);
use Memoize;
use Math::Round qw(round);
#use Devel::DProf;
use Set::IntSpan;
use Pod::Usage;
use Time::HiRes qw(gettimeofday tv_interval);
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);
################################################################
#
# *** YOUR MODULE IMPORTS HERE
#
################################################################
#memoize("get_num_links");
GetOptions(\%OPT,
"links=s",
"shuffle_file=s",
"shuffle_rx=s",
"optimize=s",
"karyotype=s",
"init_order=s",
"init_order_rx=s",
"static=s",
"static_rx=s",
"configfile=s","help","man","debug+");
pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
validateconfiguration();
if($CONF{debug} > 1) {
$Data::Dumper::Pad = "debug parameters";
$Data::Dumper::Indent = 1;
$Data::Dumper::Quotekeys = 0;
$Data::Dumper::Terse = 1;
print Dumper(\%CONF);
}
my $kar = read_karyotype($CONF{karyotype});
my $links = parse_links($CONF{links});
# make sure that each chr in the link file is in the karyotype
for my $c (keys %{$links->{c}}) {
die "chromosome $c appears in the link file but is not in the karyotype file" unless exists $kar->{$c};
}
# determine the set to shuffle - first assume entire set from links
my $chrs_to_order = [ sort {$kar->{$a}{idx} <=> $kar->{$b}{idx}} keys %{$links->{c}} ];
# collect any shuffle names from file, rx or list
my $chrs_to_order_hash;
if($CONF{shuffle_file}) {
# chromosomes from file
map { $chrs_to_order_hash->{$_}++ } read_chrs($CONF{shuffle_file});
}
if($CONF{shuffle_rx}) {
# filter chromosomes
(my $rx = $CONF{shuffle_rx}) =~ s/\s*,\s*/|/g;
map { $chrs_to_order_hash->{$_}++ } grep($_ =~ /$rx/i, @$chrs_to_order);
}
if ($CONF{shuffle}) {
for my $c (split(/\s*,\s*/,$CONF{shuffle})) {
my ($d) = grep(lc $_ eq lc $c, @$chrs_to_order);
$chrs_to_order_hash->{ $d }++ if $d;
}
}
# if specific shuffle names were mention, filter the set to shuffle for those names
if($chrs_to_order_hash) {
$chrs_to_order = [ grep($chrs_to_order_hash->{$_}, @$chrs_to_order) ]
}
$CONF{debug} && printdebug("shuffle set",@$chrs_to_order);
# refine the initial order of chromosomes to shuffle
if($CONF{init_order_rx}) {
my $new_order;
for my $rx (split(/\s*,\s*/,$CONF{init_order_rx})) {
for my $c (grep($_ =~ /$rx/i, @$chrs_to_order)) {
push @$new_order, $c if ! grep($_ eq $c, @$new_order);
}
}
for my $c (@$chrs_to_order) {
push @$new_order, $c if ! grep($_ eq $c, @$new_order);
}
$chrs_to_order = [@$new_order];
}
if($CONF{init_order}) {
my $new_order;
for my $c (split(/\s*,\s*/,$CONF{init_order})) {
push @$new_order, $c if grep($_ eq $c, @$chrs_to_order);
}
for my $c (@$chrs_to_order) {
push @$new_order, $c if ! grep($_ eq $c, @$new_order);
}
$chrs_to_order = [@$new_order];
}
$CONF{debug} && printdebug("initial order",@$chrs_to_order);
my $order = [@$chrs_to_order];
my ($round_idx,$prev_round_param) = (0,{});
my $score_init = calculate_overlap($order);
my $score = { init=>$score_init,
final=>$score_init };
for my $round_param ( ref($CONF{simulation}{round}) eq "ARRAY" ?
@{$CONF{simulation}{round}} : ($CONF{simulation}{round}) ) {
next if ! $round_param->{use} && ! $CONF{simulation}{default_round_use};
for my $var (qw(iterations max_flips min_flips temp0)) {
$round_param->{$var} = $round_param->{$var} || $CONF{simulation}{$var};
if($round_param->{$var} =~ /^r(.*)/) {
if($prev_round_param) {
$round_param->{$var} = $1 * $prev_round_param->{$var};
} else {
die "you cannot define a variable ($var) as a relative value in the first round";
}
}
}
$round_param->{idx} = $round_idx;
printinfo(sprintf("calculating round %d",$round_idx));
if($round_param->{warmup}) {
my $link_count = count_links($links->{list});
$order = warmup($order,$link_count,$links->{list},$round_param);
$score->{final} = calculate_overlap($order);
} else {
if($round_param->{min_flips} < 0) {
die "cannot have negative number of flips [min_flips $round_param->{min_flips}";
}
if($round_param->{max_flips} < 0) {
die "cannot have negative number of flips [min_flips $round_param->{min_flips}";
}
if($round_param->{max_flips} < $round_param->{min_flips}) {
die "max_flips smaller than min_flips [min_flips $round_param->{min_flips} max_flips $round_param->{max_flips}";
}
($order,$score) = anneal($order,$round_param);
}
printinfo(sprintf("report round %d $CONF{optimize} init %d final %d change %.2f%%",
$round_idx,
$score->{init},
$score->{final},
defined $score->{init} ? 100*abs($score->{init}-$score->{final})/$score->{init} : 0));
$round_idx++;
$prev_round_param = $round_param;
}
printinfo(sprintf("scorereport init %d final %d change %.2f%%",
$score_init,$score->{final},
100*($score_init-$score->{final})/$score_init));
printinfo("chromosomes_order =",join(",",@$order));
################################################################
#
# order chromosomes based on degree of connectivity - this is
# mean to be a warmup phase
#
sub warmup {
my ($order,$link_count,$links,$round_param) = @_;
my @neworder;
my $ordered = {};
my @preorder;
if($CONF{init_order}) {
for my $c (split(/\s*,\s*/,$CONF{init_order})) {
push @preorder, $c if grep($_ eq $c, @$order) && ! grep($_ eq $c, @preorder);
}
}
if($CONF{init_order_rx}) {
for my $rx (split(/\s*,\s*/,$CONF{init_order_rx})) {
for my $c (grep($_ =~ /$rx/i, @$order)) {
push @preorder, $c if ! grep($_ eq $c, @preorder);
}
}
}
if($round_param->{sort_init} && @preorder) {
@preorder = sort { $link_count->{$a}{tot} <=> $link_count->{$b}{tot} } @preorder;
}
if(@preorder) {
map { $ordered->{$_}++ } @preorder;
push @neworder, @preorder;
}
my $processed;
for my $c ((reverse @preorder), (sort { $link_count->{$b}{tot} <=> $link_count->{$a}{tot} } @$order)) {
next if $processed->{$c}++;
next unless $link_count->{$c};
push @neworder, $c if ! $ordered->{$c}++;
my @connections = sort {$link_count->{$c}{chr}{$b} <=> $link_count->{$c}{chr}{$a}}
keys %{$link_count->{$c}{chr}};
my $center;
for my $d (@connections) {
my @p = map { $_->{p1} } grep($_->{c1} eq $c && $_->{c2} eq $d , @$links);
$center->{$d} = average(@p);
}
for my $d ( sort {$center->{$b} <=> $center->{$a}} @connections) {
push @neworder, $d if ! $ordered->{$d}++;
}
}
return \@neworder;
}
################################################################
# one round of simulated annealing using
# parameters in the $params hash - which is expected to contain
#
# iterations, max_flips, temp0, {min_flips}
#
sub anneal {
my ($order,$params) = @_;
# grab initial score
my $score = calculate_overlap($order);
my $score_init = $score;
my ($score_best,$order_best) = ($score,$order);
for my $curr_iter ( 0 .. $params->{iterations} - 1 ) {
my @neworder = @$order;
# fractional simulation progress from 1 ... epsilon
my $curr_iter_f = ($params->{iterations} - $curr_iter)/$params->{iterations};
my $nflips = $params->{min_flips} + round($curr_iter_f*($params->{max_flips}-$params->{min_flips}));
@neworder = flip_chromosomes($nflips,@neworder);
my $time = [gettimeofday];
my $newscore = calculate_overlap(\@neworder);
$time = tv_interval($time);
my $dscore = $score ? ($newscore - $score) / $score : 1;
my $temp = $params->{temp0}*$curr_iter_f;
my $accept;
if( ($CONF{optimize} eq "minimize" && $dscore < 0) ||
($CONF{optimize} eq "maximize" && $dscore > 0) ) {
$accept = 1;
} else {
my $p = exp( - abs($dscore)/$temp );
$accept = 1 if rand() < $p;
}
if($accept) {
$order = [@neworder];
$score = $newscore;
if(! defined $score_best ||
( ($CONF{optimize} eq "minimize" && $score < $score_best) ||
($CONF{optimize} eq "maximize" && $score > $score_best) )) {
$score_best = $score;
$order_best = [@$order];
}
}
$CONF{debug} && printinfo(sprintf("round %d %s t0 %.3f t %.2e flips %d %s %d/%d curr %d best %d d %.2f%% time %.2e",
$params->{idx},
substr($CONF{optimize},0,3),
$params->{temp0},
$temp,
$nflips,
$accept?"+":".",
$curr_iter,$params->{iterations},
$newscore,$score_best,
100*abs($score_init-$score_best)/$score_init,
$time));
}
return ($order_best,
{ init => $score_init,
final => $score_best });
}
sub flip_chromosomes {
my $nflips = shift;
my @order = @_;
my @neworder = @order;
my @flippable = (0..@order-1);
my @static;
if($CONF{static_rx}) {
(my $rx = $CONF{static_rx}) =~ s/,/|/g;
push @static, grep($neworder[$_] =~ /$rx/i, @flippable);
@flippable = grep($neworder[$_] !~ /$rx/i, @flippable);
}
if($CONF{static}) {
my %h;
map { $h{$_}++ } split(/\s*,\s*/,$CONF{static});
push @static, grep($h{ $neworder[$_]}, @flippable);
@flippable = grep(! $h{ $neworder[$_]}, @flippable);
}
$CONF{debug} >1 && printdebug("flippable idx",@flippable);
$CONF{debug} >1 && printdebug("flippable chr",@order[@flippable]);
$CONF{debug} >1 && printdebug("static idx",@static);
$CONF{debug} >1 && printdebug("static chr",@order[@static]);
for my $flip (1..$nflips) {
my ($i,$j);
do {
($i,$j) = ($flippable[rand(@flippable)],$flippable[rand(@flippable)]);
} while ($i==$j);
@neworder[$i,$j] = @neworder[$j,$i];
}
return @neworder;
}
sub calculate_overlap {
my $order = shift;
my $n = @$order;
my $score = 0;
my %chr2idx;
for my $i (0..@$order-1) {
$chr2idx{ $order->[$i] } = $i;
}
for my $i ( 0 .. @{$links->{list}} - 1 ) {
my $li = $links->{list}[$i];
my ($ichridx1,$ichridx2) = @chr2idx{ @{$li}{qw(c1 c2)} };
my $ip1 = $ichridx1 + $li->{p1};
my $ip2 = $ichridx2 + $li->{p2};
($ip1,$ip2) = ($ip2,$ip1) if ($ip1>$ip2);
for my $j ( $i+1 .. @{$links->{list}} - 1 ) {
my $lj = $links->{list}[$j];
my ($jchridx1,$jchridx2) = @chr2idx{ @{$lj}{qw(c1 c2)} };
my $jp1 = $jchridx1 + $lj->{p1};
my $jp2 = $jchridx2 + $lj->{p2};
($jp1,$jp2) = ($jp2,$jp1) if ($jp1>$jp2);
my $xlink = 0;
if( $jp1 > $ip1 && $jp1 < $ip2 && ! ($jp2 > $ip1 && $jp2 < $ip2)
or
$jp2 > $ip1 && $jp2 < $ip2 && ! ($jp1 > $ip1 && $jp1 < $ip2) ) {
$xlink = 1;
}
#printinfo($xlink,map {sprintf("%7.3f",$_) } ($ip1,$ip2,$jp1,$jp2));
$score += $xlink;
}
}
return $score;
}
sub read_karyotype {
my $file = shift;
return undef if ! -e $file;
open(F,$file);
my $kar = {};
while(<F>) {
chomp;
if(/^chr/) {
my @tok = split;
my $chr = $tok[2];
$kar->{$chr} = {chr => $chr,
idx => int(keys %$kar),
set => Set::IntSpan->new(sprintf("%d-%d",@tok[4,5])),
tok => \@tok};
}
}
return $kar;
}
sub count_links {
my $links = shift;
my $link_count;
for my $l (@$links) {
$link_count->{ $l->{c1} }{chr}{ $l->{c2} }++;
$link_count->{ $l->{c2} }{chr}{ $l->{c1} }++;
$link_count->{ $l->{c2} }{tot}++;
$link_count->{ $l->{c1} }{tot}++;
}
return $link_count;
}
sub parse_links {
my $file = shift;
open(F,$file);
my $links;
while(<F>) {
chomp;
my @tok1 = split;
my $line2 = <F>;
last unless $line2;
chomp $line2;
my @tok2 = split(" ",$line2);
my $pos1 = ($tok1[2]+$tok1[3])/2;
my $pos2 = ($tok2[2]+$tok2[3])/2;
my ($chr1,$chr2) = ($tok1[1],$tok2[1]);
die "chromosome $chr1 not in karyotype file" unless $kar->{$chr1};
die "chromosome $chr2 not in karyotype file" unless $kar->{$chr2};
my $f1 = ($pos1-$kar->{$chr1}{set}->min)/$kar->{$chr1}{set}->cardinality;
my $f2 = ($pos2-$kar->{$chr2}{set}->min)/$kar->{$chr2}{set}->cardinality;
push @{$links->{list}}, { c1=>$chr1, c2=>$chr2, p1=>$f1, p2=>$f2 };
$links->{c1}{$chr1}++;
$links->{c2}{$chr2}++;
$links->{c}{$chr1}++;
$links->{c}{$chr2}++;
}
return $links;
}
sub read_chrs {
my $file = shift;
open(F,$file);
my @elems;
while(<F>) {
chomp;
s/^\s*//g;
s/\s*$//g;
push @elems, $_;
}
return @elems;
}
sub validateconfiguration {
$CONF{optimize} ||= "minimize";
}
################################################################
#
# *** DO NOT EDIT BELOW THIS LINE ***
#
################################################################
sub populateconfiguration {
foreach my $key (keys %OPT) {
$CONF{$key} = $OPT{$key};
}
# any configuration fields of the form __XXX__ are parsed and replaced with eval(XXX). The configuration
# can therefore depend on itself.
#
# flag = 10
# note = __2*$CONF{flag}__ # would become 2*10 = 20
for my $key (keys %CONF) {
my $value = $CONF{$key};
while($value =~ /__([^_].+?)__/g) {
my $source = "__" . $1 . "__";
my $target = eval $1;
$value =~ s/\Q$source\E/$target/g;
#printinfo($source,$target,$value);
}
$CONF{$key} = $value;
}
}
sub loadconfiguration {
my $file = shift;
my ($scriptname) = fileparse($0);
if(-e $file && -r _) {
# great the file exists
} elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) {
$file = "/home/$ENV{LOGNAME}/.$scriptname.conf";
} elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) {
$file = "$FindBin::RealBin/$scriptname.conf";
} elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) {
$file = "$FindBin::RealBin/etc/$scriptname.conf";
} elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) {
$file = "$FindBin::RealBin/../etc/$scriptname.conf";
} else {
return undef;
}
$OPT{configfile} = $file;
my $conf = new Config::General(-ConfigFile=>$file,
-AllowMultiOptions=>"yes",
-LowerCaseNames=>1,
-AutoTrue=>1);
%CONF = $conf->getall;
}
sub printdebug {
printinfo("debug",@_) if $CONF{debug};
}
sub printinfo {
printf("%s\n",join(" ",@_));
}