-
Notifications
You must be signed in to change notification settings - Fork 2
/
FASTA_divide_by_ambiguous.py
46 lines (39 loc) · 1.28 KB
/
FASTA_divide_by_ambiguous.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
__author__ = 'Wietrack 2019'
import sys
import operator
import hashlib
# Only these four bases count as unambiguous; any other symbol (N, R, Y,
# gaps, whitespace inside a line, ...) makes the sequence "ambiguous".
UNAMBIGUOUS_BASES = frozenset("ACGT")


def is_unambiguous(seq):
    """Return True when *seq* consists solely of the characters A, C, G, T.

    The check is case-sensitive; callers are expected to upper-case first.
    An empty string is considered unambiguous.
    """
    return set(seq) <= UNAMBIGUOUS_BASES


def read_fasta(lines):
    """Yield ``(title, sequence)`` pairs from an iterable of FASTA lines.

    * ``title`` is the header line without the leading '>' and without
      surrounding whitespace.
    * ``sequence`` is the upper-cased concatenation of every non-blank line
      that follows the header up to the next header (or the end of input),
      so sequences wrapped over several lines are read in full.
    * Lines appearing before the first header are ignored.
    * A header that is not followed by any sequence data is skipped.
    """
    title = None
    chunks = []
    for line in lines:
        if line.startswith('>'):
            if title is not None and chunks:
                yield title, ''.join(chunks)
            title = line[1:].strip()
            chunks = []
        elif title is not None:
            piece = line.strip().upper()
            if piece:
                chunks.append(piece)
    # Flush the final record: there is no trailing header to trigger it.
    if title is not None and chunks:
        yield title, ''.join(chunks)


def divide_fasta(fasta_file):
    """Split *fasta_file* into unambiguous and ambiguous sequences.

    Two files are written next to the input:
    ``<fasta_file>_correct.fa`` receives sequences made only of A/C/G/T and
    ``<fasta_file>_ambiguous.fa`` receives all the others. Every record is
    written as a header line followed by the sequence on a single line.

    Returns a tuple ``(correct_count, ambiguous_count)``.
    """
    correct_count = 0
    ambiguous_count = 0
    # Context managers guarantee all three handles are closed, even when
    # reading or writing fails half-way through.
    with open(fasta_file) as in_file, \
            open(fasta_file + "_correct.fa", 'w') as out_file_correct, \
            open(fasta_file + "_ambiguous.fa", 'w') as out_file_ambiguous:
        for title, seq in read_fasta(in_file):
            if is_unambiguous(seq):
                target = out_file_correct
                correct_count += 1
            else:
                target = out_file_ambiguous
                ambiguous_count += 1
            target.write(">" + title + '\n')
            target.write(seq + '\n')
    return correct_count, ambiguous_count


def main(argv=None):
    """Command-line entry point: ``FASTA_divide_by_ambiguous.py <fasta_file>``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("usage: " + argv[0] + " <fasta_file>")
    i, n = divide_fasta(argv[1])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(i + n) + " sequences processed by ambiguosity - " + str(i) + " correct sequences vs " + str(n) + " ambiguous sequences")


if __name__ == "__main__":
    main()