-
Notifications
You must be signed in to change notification settings - Fork 0
/
tRNAscan_to_GFF3.pl
executable file
·107 lines (91 loc) · 5.16 KB
/
tRNAscan_to_GFF3.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/perl
## Pombert Lab, IIT, 2020
my $name = 'tRNAscan_to_GFF3.pl';
my $version = '0.8e';
my $updated = '2021-04-09';
use strict; use warnings; use File::Basename; use Getopt::Long qw(GetOptions);
my $usage = <<"OPTIONS";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS Converts the output of tRNAscan-SE (in tabular format) to the proper GFF3 format for loading into Apollo.
USAGE ${name} \\
-t *.tRNAs \
-d tRNA/
OPTIONS:
-t (--tRNA) tRNA file(s) to convert to GFF3
-d (--dir) Output directory (Optional)
OPTIONS
die "$usage\n" unless @ARGV;
my @tRNA;
my $odir = './';
GetOptions(
't|tRNA=s@{1,}' => \@tRNA,
'd|dir=s' => \$odir
);
## Checking output directory
unless (-d $odir){
mkdir ($odir,0755) or die "Can't create $odir: $!\n";
}
print "\nOutput files will be located in directory $odir\n";
my $tRNA = 0;
while (my $file = shift@tRNA){
open TRNA, "<", "$file" or die "Can't read $file: $!\n";
my ($RNA, $dir) = fileparse($file);
print "Working on file $RNA located in $dir\n\n";
open GFF, ">", "$odir/$RNA.gff" or die "Can't create $odir/$RNA.gff: $!\n";
open GFF3, ">", "$odir/$RNA.gff3" or die "Can't create $odir/$RNA.gff3: $!\n";
## Converting tRNAs to GFF3
while (my $line = <TRNA>){
chomp $line;
if ($line =~ /^(Sequence|Name|--------)/){ next; }
else {
my @cols = split("\t", $line);
my $location = $cols[0]; $location =~ s/\s+$//; ## Removing padding spaces, if any
my $tnum = $cols[1]; $tnum =~ s/\s+$//;
my $start = $cols[2]; $start =~ s/\s+$//;
my $end = $cols[3]; $end =~ s/\s+$//;
my $aa = $cols[4]; $aa =~ s/\s+$//;
my $ac = $cols[5]; $ac =~ s/\s+$//;
my $int1 = $cols[6]; $int1 =~ s/\s+$//; ## intron boundaries, if any
my $int2 = $cols[7]; $int2 =~ s/\s+$//; ## intron boundaries, if any
my $cove = $cols[8]; $cove =~ s/\s+$//;
my $codon = lc($ac);
$codon =~ tr/Tt/Uu/;
$tRNA++;
## Forward strand
if ($start < $end){
if (($int1 == 0) && ($int2 == 0)){
print GFF "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$start"."\t"."$end"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."gene"."\t"."$start"."\t"."$end"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$start"."\t"."$end"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
}
else {
my $boundary1 = $int1 - 1;
my $boundary2 = $int2 + 1;
print GFF "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$start"."\t"."$boundary1"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$boundary2"."\t"."$end"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."gene"."\t"."$start"."\t"."$end"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$start"."\t"."$boundary1"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$boundary2"."\t"."$end"."\t"."$cove"."\t".'+'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
}
}
elsif ($start > $end){
if (($int1 == 0) && ($int2 == 0)){
print GFF "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$end"."\t"."$start"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."gene"."\t"."$end"."\t"."$start"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$end"."\t"."$start"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
}
else {
my $boundary1 = $int2 - 1;
my $boundary2 = $int1 + 1;
print GFF "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$boundary2"."\t"."$start"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$end"."\t"."$boundary1"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."gene"."\t"."$end"."\t"."$start"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$boundary2"."\t"."$start"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
print GFF3 "$location"."\t"."TRNASCAN"."\t"."tRNA"."\t"."$end"."\t"."$boundary1"."\t"."$cove"."\t".'-'."\t".'.'."\t"."ID=tRNA$tRNA".';'."Name=tRNA$tRNA".';'."Note=tRNA-$aa($codon)"."\n";
}
}
}
}
}