-
Notifications
You must be signed in to change notification settings - Fork 2
/
rereplicate_FASTA.py
45 lines (36 loc) · 985 Bytes
/
rereplicate_FASTA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
__author__ = 'Wietrack 2016'
import sys
import operator
def _load_fasta(fasta_path):
    """Read a FASTA file into a dict mapping sequence ID to sequence.

    The ID is the header text after '>' up to the first '|' (surrounding
    whitespace stripped). All sequence lines that follow a header are
    concatenated, so wrapped (multi-line) records are kept whole. Blank
    lines and any lines that appear before the first header are ignored.
    A later record with the same ID overwrites the earlier one.

    Returns a ``(sequences, loaded)`` tuple, where ``loaded`` counts every
    record that had a header and at least one sequence line (duplicates
    included).
    """
    sequences = {}
    loaded = 0
    seq_id = None
    chunks = []
    with open(fasta_path) as handle:
        for line in handle:
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if seq_id is not None and chunks:
                    sequences[seq_id] = ''.join(chunks)
                    loaded += 1
                seq_id = line[1:].strip().split("|")[0]
                chunks = []
            elif seq_id is not None:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    # The last record has no following header to close it.
    if seq_id is not None and chunks:
        sequences[seq_id] = ''.join(chunks)
        loaded += 1
    return sequences, loaded


def _write_rereplicated(sequences, table_path, out_path):
    """Write one FASTA record per table row whose ID is in ``sequences``.

    Each table row is tab-separated: the first column is looked up in
    ``sequences`` and the second column becomes the header of the record
    written to ``out_path``. Rows whose ID is unknown, or that have fewer
    than two columns, are counted as not found instead of raising.

    Returns a ``(found, missing)`` tuple of row counts.
    """
    found = 0
    missing = 0
    # The table is opened first so that a missing table does not leave a
    # freshly truncated output file behind.
    with open(table_path) as table, open(out_path, 'w') as out:
        for line in table:
            fields = line.strip().split("\t")
            if len(fields) >= 2 and fields[0] in sequences:
                out.write('>' + fields[1] + '\n')
                out.write(sequences[fields[0]] + '\n')
                found += 1
            else:
                missing += 1
    return found, missing


def main(argv):
    """Run the script: ``argv[1:4]`` are the FASTA, table and output paths.

    Prints the number of sequences loaded and a final summary, and returns
    the ``(found, missing)`` row counts. Exits with a usage message when
    fewer than three paths are supplied.
    """
    if len(argv) < 4:
        raise SystemExit(
            'usage: rereplicate_FASTA.py '
            '<derep_fasta> <derep_table> <out_rerep_fasta>')
    derep_fasta, derep_table, out_rerep_fasta = argv[1:4]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    fasta, loaded = _load_fasta(derep_fasta)
    print(str(loaded) + " sequences loaded")

    found, missing = _write_rereplicated(fasta, derep_table, out_rerep_fasta)
    # NOTE(review): the message says "Dereplication" although this script
    # rereplicates; wording kept unchanged in case the output is parsed.
    print('Dereplication is done - ' + str(found)
          + ' sequences rereplicated / ' + str(missing)
          + ' sequences were not found...')
    return found, missing


if __name__ == '__main__':
    main(sys.argv)