-
Notifications
You must be signed in to change notification settings - Fork 0
/
BLAST_tabularparser.py
executable file
·285 lines (244 loc) · 12.2 KB
/
BLAST_tabularparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#!/usr/bin/env python
# encoding: utf-8
# ================== Filtering the tabular output file of BLAST+ =================
# Script to parse the tabular output of BLAST+, to produce a tab file that
# satisfy a number of criteria. One of them includes to adjust the parsing to
# the output of Doug's script parseBlastBestHitLoc.pl. The output is printed
# in a new file.
# Based on script BLASTbasicFilter.py done for Lineus project
# TODO: Refine BLAST searches, by merging hits that actually belong to the
# same contig. From Simão et al. 2015 SOM: "Neighbouring highscoring segment
# pairs (HSPs) from the tBLASTn searches are merged if located within 50 Kb of
# each other, thus defining the span of the genomic regions to be evaluated."
# http://pymotw.com/2/argparse/
# ==================================================
# Sandra Lorena Ament Velasquez
# Johannesson Lab, Evolutionary Biology Center, Uppsala University, Sweden
# 2014/07/09-11
# +++++++++++++++++++++++++++++++++++++++++++++++++
#
# ------------------------------------------------------
import re # For the sorting function
import os # For the input name
import argparse # For the fancy options
from collections import defaultdict #This tells python that the dictionary contains a list so you can freely append things to the value of each key
# ------------------------------------------------------
version = 2.21
versiondisplay = "{0:.3f}".format(version)
# Make a nice menu for the user
parser = argparse.ArgumentParser(description="* Parse the output of a BLAST search with different criteria *", epilog="It's VERY important that you verify if the input file has a header or not!!!\nNormally the output of BLAST doesn't have a header, while the output of parseBlastBestHitLoc.pl does.\nIf you're missing the first hit, the header might be the reason!") # Create the object using class argparse
# Add options
parser.add_argument('tabfile', help="TABULAR file(s) obtained after a BLAST search")
parser.add_argument("--outputname", "-o", help="Output name set by the user")
parser.add_argument("--HSPminlen", "-l", help="Minimum HSP length for a hit to be kept (default 0)", default="0", dest="hsp", type=int) # nargs='+' All, and at least one, argument
parser.add_argument("--hitfrac", "-f", help="Use a minimum of 'hit fraction': HSPlength/querylength (default 0)", default="0", type=float) # nargs='+' All, and at least one, argument
parser.add_argument("--identity", "-i", help="Percentage of identity between hit and subject (only for normal BLAST output!)", default="0", type=float) # nargs='+' All, and at least one, argument
parser.add_argument("--evalue", "-e", help="Minimum e-value for keeping a hit (default 10)", type=float, default=10)
parser.add_argument("--Nhits", "-n", help="Number of hits to be recovered after filtering (default all)", type=int)
parser.add_argument("--filterstring", "-y", help="Keep hits with given string (only valid with -s)")
parser.add_argument("--postdoug", "-s", help="The input is the file obtained after using the script parseBlastBestHitLoc.pl", default=False, action='store_true')
parser.add_argument("--headerinput", "-w", help="Turnoff the header for parseBlastBestHitLoc.pl output, but turn it on for normal BLAST", default=True, action='store_false')
parser.add_argument('--verbose', '-b', help="Give some extra information", default=False, action='store_true')
parser.add_argument('--version', "-v", action='version', version='%(prog)s ' + versiondisplay)
try:
# ArgumentParser parses arguments through the parse_args() method You can
# parse the command line by passing a sequence of argument strings to
# parse_args(). By default, the arguments are taken from sys.argv[1:]
args = parser.parse_args()
tabopen = open(args.tabfile, 'r')
except IOError as msg: # Check that the file exists
parser.error(str(msg))
parser.print_help()
if args.verbose:
print
print("Input tab file: " + args.tabfile)
# ------------------------------------------------------
def print_table( list_thing, output_name="output.txt", header='' ):
ofile = open(output_name, 'w')
if header != '':
ofile.write((header+'\n'))
# else:
# print("No header")
for line in list_thing:
hitstring = ''
for tab in line:
hitstring += ''.join(str(tab)) + "\t"
ofile.write((hitstring.rstrip("\t") + '\n'))
ofile.close()
# Got these functions from http://stackoverflow.com/questions/5967500/how-to-correctly-sort-a-string-with-a-number-inside by unutbu user
def atoi(text):
return int(text) if text.isdigit() else text
def natural_keys(text):
'''
alist.sort(key=natural_keys) sorts in human order
http://nedbatchelder.com/blog/200712/human_sorting.html
(See Toothy's implementation in the comments)
'''
return [ atoi(c) for c in re.split('(\\d+)', text) ]
# ------------------------------------------------------
# ------------------------------------------------------
# Read file into list
# ------------------------------------------------------
tabs = [line.rstrip("\n").split("\t") for line in tabopen] # Read tab file into a list
badwords = ["hypothetical", "predic", "putative"]
# ------------------------------------------------------
# Does it have a header?
# ------------------------------------------------------
# Depending on the type of intput file
# Normal Doug (header): T T | -s
if args.postdoug and args.headerinput:
headercondition = True
# Normal BLAST F T |
elif not args.postdoug and args.headerinput:
headercondition = False
# BLAST with header: F F | -w
elif not args.postdoug and not args.headerinput:
headercondition = True
# Doug without header: T F | -s -w
elif args.postdoug and not args.headerinput:
headercondition = False
# The start depends on the type of file
if headercondition:
startingpoint = 1
if args.verbose:
print("** Header active **")
else:
startingpoint = 0
# ------------------------------------------------------
# Filter the BLAST tab file using a minimum HSP length
# ------------------------------------------------------
nohitcounter = 0
filteredtab = []
if not args.postdoug:
# heads = "query_id subject_id percent_identity alignment_length N_mismatches N_gaps query_start query_end subject_start subject_end evalue bit_score"
# filteredtab = [line for line in tabs[startingpoint:] if int(line[3]) >= args.hsp] # Filter the tab file with hsp_length
for line in tabs[startingpoint:]:
if line == ['']: continue # Ignore empty lines
if int(line[3]) >= args.hsp:
filteredtab.append(line)
else:
# heads = "QUERY ACC QUERY DESCRIPTION HIT NUM HIT NAME HIT DESCRIPTION SCORE HIT SIGNIFICANCE HIT E-VALUE FRACTION IDENTITY QUERY LENGTH HSP LENGTH QUERY STRAND QUERY START QUERY END HSP STRAND HSP START HSP END NUM GAPS"
# filteredtab = [line for line in tabs[startingpoint:] if int(line[10]) >= args.hsp] # Filter the tab file with hsp_length
for line in tabs[startingpoint:]:
if "No hit" in line:
# print("Query with No hit found")
nohitcounter += 1
filteredtab.append(line)
else:
if int(line[10]) >= args.hsp:
filteredtab.append(line)
# Report it
if args.hsp != 0 and args.verbose:
print(" Minimum HSP length set to %d bp" % args.hsp)
# ------------------------------------------------------
# Make a dictionary with the tab file for each query
# ------------------------------------------------------
hits = defaultdict(list) #This tells python that the dictionary contains a list so you can freely append things to the value of each key
for line in filteredtab:
hits[line[0]].append(line[1:])
# ------------------------------------------------------
# Sort the names of the queries
# ------------------------------------------------------
sorthitskeys = sorted(hits.keys(), key=natural_keys) # Use the function defined in key= to sort
# ------------------------------------------------------
# Keep only some of the hits
# ------------------------------------------------------
hypo = 0
if args.Nhits or args.hitfrac or (args.evalue < 10) or args.filterstring:
filteredtab = [] # Restart the filtered list of hits
# Establish the position of the e-value
if not args.postdoug: # If BLAST
eposition = 9
# HSPlength/querylength
hspposition = 2
queryposition = 6
perident = 1 # Percentage of identity
else: # If post Doug's script
eposition = 6
hspposition = 9
queryposition = 8
perident = 7 # Percentage of identity
nhits_counter = 0
# Do the filtering
for key in sorthitskeys: # For each query sequence
# Define a limit of the number of hits recovered
if args.Nhits:
limit = args.Nhits
else:
limit = len(hits[key])
for hit in hits[key][:]: # for each hit of every query
if args.postdoug and hit[2] == "No hit": # If you have No hits (and are working with Doug's script)
newline = hit
newline.insert(0, key) # Put the name of the query at the beginning
filteredtab.append(hit) # Just append them as they are
elif float(hit[eposition]) <= args.evalue: # It's below the e-value threshold?
fraction = float(hit[hspposition])/float(hit[queryposition])
if fraction >= args.hitfrac: # If the hit fraction is above the threshold
if args.postdoug and args.filterstring: # If the we're dealing with Doug's script output and a string to filter was given
if args.filterstring in hit[3]: # Check if the string is present
newline = hit # If yes, take the hit
newline.insert(0, key) # Put the name of the query at the beginning
filteredtab.append(newline) # Add this new line
nhits_counter += 1
# else: # If working with normal output or it's Dougs but there was no string to filter with
elif float(hit[perident]) >= args.identity:
newline = hit # Take the hit
newline.insert(0, key) # Put the name of the query at the beginning
filteredtab.append(newline) # Add this new line
nhits_counter += 1
if args.postdoug:
for word in badwords:
if word in hit[4]:
hypo += 1
# Limit the number of hits recovered
if nhits_counter >= limit:
nhits_counter = 0
break
# Report some figures if required
if args.verbose:
if args.Nhits: print(" Max No. of hits kept per query: %d" % args.Nhits)
if (args.evalue < 10): print(" Minimum e-value set to %f" % args.evalue)
if args.hitfrac: print(" Minimum hit fraction set to %f" % args.hitfrac)
if (args.identity > 0): print(" Minimum percentage of identity set to %.2f" % args.identity)
# ------------------------------------------------------
# Print filtered tab file
# ------------------------------------------------------
input_name = os.path.splitext(args.tabfile)[0]
# Get the header out of the first line of the input file
with open(args.tabfile, 'r') as f:
header = f.readline().rstrip("\n")
# Check if the outputname has been assigned by the user
if not args.outputname:
output_name_tab = input_name+"_filtered.txt"
else:
output_name_tab = args.outputname
# Print the list in a tab format
if headercondition:
print_table(filteredtab, output_name_tab, header)
else:
print_table(filteredtab, output_name_tab)
# ------------------------------------------------------
# Some numbers
# ------------------------------------------------------
if args.verbose:
# Input
hitsinput = defaultdict(list) #This tells python that the dictionary contains a list so you can freely append things to the value of each key
for line in tabs[startingpoint:]:
hitsinput[line[0]].append(line[1:])
print("\tNumber of input query sequences: " + str(len(hitsinput.keys())))
print("\tNumber of input hits: " + str(len(tabs) - startingpoint))
print
# Output
hitsoutput = defaultdict(list) #This tells python that the dictionary contains a list so you can freely append things to the value of each key
for line in filteredtab:
hitsoutput[line[0]].append(line[1:])
print("\tNumber of output query sequences: ", len(hitsoutput.keys()))
print("\tNumber of output true hits: ", len(filteredtab) - nohitcounter) # True in oposition of having no hit at all but still being in the file
print("\tNumber of queries without any hits: ", nohitcounter)
if args.postdoug and (args.Nhits or args.hitfrac or (args.evalue < 10)):
print("\tNumber of hits with %s in description: %i" % (badwords, hypo))
# ------------------------------------------------------
# The end
# ------------------------------------------------------
print("Output tab file: " + output_name_tab)
print