-
Notifications
You must be signed in to change notification settings - Fork 0
/
conserve-fpkm.pl
executable file
·94 lines (83 loc) · 2.68 KB
/
conserve-fpkm.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/perl
use strict;
my $MIN_FPKM=1;
my $MIN_SAMPLE_SIZE=30;
my $BASE="/home/bmajoros/1000G/assembly";
my $CRYPSKIP="$BASE/crypskip.txt";
#my $RNA="$BASE/rna.txt";
my $RNA="$BASE/crypskip-counts.txt";
my (%rna,%rnaByTranscript,%sampleSize,%seen);
open(IN,$RNA) || die $RNA;
while(<IN>) {
chomp; my @fields=split; next unless @fields>=7;
my ($indiv,$allele,$gene,$transcript,$cov,$FPKM,$TPM,$nmd)=@fields;
next if $indiv eq "indiv"; # header
my $key="$indiv $allele";
next if $transcript eq ".";
if($transcript=~/(ALT\d+_\S+)_\d+/) { $transcript=$1 }
my $baseID=$transcript;
if($transcript=~/ALT\d+_(\S+)/) { $baseID=$1 }
next if $seen{$baseID}>=100; ###
$seen{$baseID}++; ###
$rna{$key}->{$transcript}=$FPKM;
# $rnaByTranscript{$transcript}+=$FPKM;
# ++$sampleSize{$transcript};
#print "$key $transcript $FPKM\n";
}
close(IN);
open(IN,"$BASE/rna.txt") || die;
while(<IN>) {
chomp; my @fields=split; next unless @fields>=7;
my ($indiv,$allele,$gene,$transcript,$cov,$FPKM,$TPM,$nmd)=@fields;
next if $indiv eq "indiv"; # header
my $key="$indiv $allele";
if($transcript=~/(ALT\d+_\S+)_\d+/) { $transcript=$1 }
$rnaByTranscript{$transcript}+=$FPKM;
++$sampleSize{$transcript};
}
close(IN);
my (%crypticCounts,%exonSkipping,%baseIDs);
open(IN,$CRYPSKIP) || die $CRYPSKIP;
while(<IN>) {
chomp; my @fields=split; next unless @fields>=6;
my ($indiv,$hap,$gene,$transcript,$event,$dist)=@fields;
$transcript=~/(\S+)_\d+$/ || die $transcript;
$transcript=$1;
my $baseID=$transcript;
if($transcript=~/ALT\d+_(\S+)/) { $baseID=$1 }
$baseIDs{$baseID}=1;
my $key="$indiv $hap";
if($event eq "exon-skipping") {next unless $rna{$key}->{$transcript}>0} ###
if($event eq "exon-skipping") { $exonSkipping{$key}->{$baseID}=$transcript }
else { ++$crypticCounts{$key}->{$baseID} }
}
close(IN);
my %meanFPKM;
my @baseIDs=keys %rnaByTranscript;
my $numBaseIDs=@baseIDs;
for(my $i=0 ; $i<$numBaseIDs ; ++$i) {
my $baseID=$baseIDs[$i];
my $sum=$rnaByTranscript{$baseID};
my $N=$sampleSize{$baseID};
next unless $sum>$MIN_FPKM && $N>$MIN_SAMPLE_SIZE;
$meanFPKM{$baseID}=$sum/$N;
}
my @keys=keys %exonSkipping;
my $numKeys=@keys;
for(my $i=0 ; $i<$numKeys ; ++$i) {
my $key=$keys[$i];
my $hash=$exonSkipping{$key};
die unless $hash;
my @baseIDs=keys %$hash;
foreach my $baseID (@baseIDs) {
my $numCryptic=defined($crypticCounts{$key}) ?
0+$crypticCounts{$key}->{$baseID} : 0;
my $skippingID=$hash->{$baseID};
my $fpkm=0+$rna{$key}->{$skippingID};
my $mean=$meanFPKM{$baseID};
next unless $mean>$MIN_FPKM;
next if $baseID=~/ENST00000421308.2/; # outlier: common allele
#$fpkm/=$mean;
print "$baseID\t$numCryptic\t$fpkm\n";
}
}