-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta_tk.py
172 lines (145 loc) · 5.63 KB
/
fasta_tk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# FASTA Toolkit
#
# This module contains FASTA file analysis tools.
#
# Classes included in this file:
#
#
# https://www.stephendoescomp.bio
# Stephen Cheney © 2023
import pandas as pd
from collections import defaultdict
from structs import *
from sequenceBuilder import *
from bio_seq import *
from aa_seq import *
from datetime import datetime
def dictFromFASTA(fasta_File):
"""
Given a FASTA file, return a dict of sequence names and their respective sequence
\n Notes: Does not discriminate between DNA, RNA, or Nucleotide sequences
\n\tReturns empty {} if no properly formatted FASTA sequences
\n<- fasta_File: FASTA formatted file
\n-> dict
"""
lines = readFile(fasta_File)
fastaDict = {}
seqLabel = ''
for line in lines:
if line[0] == '>':
seqLabel = line[1:].rstrip()
fastaDict[seqLabel] = ''
else:
fastaDict[seqLabel] += line.rstrip()
# Remove empty values
fastaDict = {k:v for k,v in fastaDict.items() if v != ''}
return fastaDict
def percentGC_fasta(seq):
"""
Return GC percentage of a DNA sequence in % form
\n<- bio_seq obj
\n-> float
"""
temp_seq = bio_seq(seq)
bases = len(seq)
return float(temp_seq.nucFrequencyDict()['G']/bases * 100) + float(temp_seq.nucFrequencyDict()['C']/bases * 100)
def gcContentFromFASTA(fasta_File):
"""
Given a FASTA file, return a dict of sequence names and their respective GC content
\n<- fasta_File: FASTA formatted file
\n-> dict
"""
seqDict = dictFromFASTA(fasta_File)
gcDict = {k:percentGC_fasta(v) for k,v in seqDict.items()}
return gcDict
def getMaxGCFromFASTA(fasta_File):
"""
Given a FASTA file, return the sequence with the largest GC content
\n<- fasta_File: FASTA formatted file
\n-> dict
"""
gcDict = gcContentFromFASTA(fasta_File)
maxKey = max(gcDict, key=gcDict.get)
return {maxKey:gcDict[maxKey]}
def parseFASTA(fasta_File, labels = [], seq_type = 'AA'):
"""
Parse a FASTA file into bio_seq or aa_seq objects
\nNotes: labels empty by default returns all sequences, otherwise returns list of bio_seq objects that match label in labels
\n\tseq_type specifies what sequence type the function should interpret the FASTA file as. Default to amino acid/protein sequence.
\n<- fasta_File: FASTA formatted file, labels: str[],
\n-> bio_seq[] || aa_seq[]
"""
seqDict = dictFromFASTA(fasta_File)
if not bool(seqDict):
return [] # Empty dict, no seqs
if not bool(labels):
if seq_type == 'AA':
seqs = [aa_seq(v,k) for k,v in seqDict.items()]
else:
seqs = [bio_seq(v,seq_type,k) for k,v in seqDict.items()]
else:
if seq_type == 'AA':
seqs = [aa_seq(v,k) for k,v in seqDict.items() if k in labels]
else:
seqs = [bio_seq(v,seq_type,k) for k,v in seqDict.items() if k in labels]
return seqs
def dfFromFASTA(fasta_File, seq_type = 'AA', cols = []):
"""
Parse a FASTA file into a dataframe
\nNotes: cols empty by default returns dataframe with all bio_seq/aa_seq attributes, otherwise only returns df with spec'd cols
\n\tseq_type specifies what sequence type the function should interpret the FASTA file as. Default to amino acid/protein sequence.
\n<- fasta_File: FASTA formatted file, cols: str[],
\n-> bio_seq[]
"""
seqList = parseFASTA(fasta_File, [], seq_type)
df = pd.DataFrame([d.__dict__ for d in seqList])
if not cols: out = df
else: out = df[cols]
return out
def ancestryToFASTA(ancestry_file, outfile_name = 'output_'+ datetime.now().strftime("%Y%m%d_%H%M%S")+'.fasta', al = 0):
"""
Parse a standard Ancestry DNA .txt file into FASTA format
\nNotes: Will parse through file and disregard until the "rsid chromosome position allele1 allele2" line of text is found.
\n\tBy default, al = 0 will return a fasta file with allele 1 appended by allele 2 in singular file, 1 for allele 1, 2 for allele 2
\n<- (AncestryDNA.txt): default Ancestry DNA .txt file from download , outfile_name: str, al: int
\n-> outfile_name.fasta
"""
assert al == 0 or al == 1 or al == 2
lines = readFile(ancestry_file)
for line in lines[:]:
if line[0] == '#':
lines.remove(line) # remove beginning comments
if line[0] != '#':
print(line)
lines.remove(line) # remove the header line and break
break
allele1 = defaultdict(list)
allele2 = defaultdict(list)
count = 0
for line in lines:
stripped = line.strip()
line_arr = stripped.split('\t')
chromosome = line_arr[1]
if chromosome not in allele1.keys():
count = 0 # reset wrapping for new chr/label
count += 1
allele1[chromosome].append(line_arr[3].replace('0', '-')) # Ancestry deletions are represented by 0
allele2[chromosome].append(line_arr[4].replace('0', '-'))
if count == 60:
allele1[chromosome].append('\n')
allele2[chromosome].append('\n')
count = 0
allele1_out = ""
for chr, seq in allele1.items():
allele1_out += ">allele_1_chr_" + chr + '\n'
allele1_out += ''.join(seq) + '\n'
allele2_out = ""
for chr, seq in allele2.items():
allele2_out += ">allele_2_chr_" + chr + '\n'
allele2_out += ''.join(seq) + '\n'
if al == 0:
writeFile([allele1_out, allele2_out], outfile_name)
elif al == 1:
writeFile(allele1_out, outfile_name)
elif al == 2:
writeFile(allele2_out, outfile_name)