Skip to content

Commit

Permalink
Merge pull request #100 from VDBWRAIR/mapps-fix
Browse files Browse the repository at this point in the history
So I think this looks good
  • Loading branch information
necrolyte2 committed Mar 16, 2016
2 parents f7cc855 + 1c8f6f6 commit 17936f8
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 24 deletions.
54 changes: 31 additions & 23 deletions bio_bits/ctleptop.py
@@ -1,8 +1,6 @@
#!/usr/bin/env python
# encoding: utf-8
"""
ctleptop.py -i [FASTA FILE] > Out_file.txt
Created by Dereje Jima on May 21, 2015
"""
from __future__ import division
Expand Down Expand Up @@ -109,7 +107,7 @@ def access_mixed_aa(file_name):
seqids = []
for seq_record in SeqIO.parse(file_name, 'fasta'):
seq_id = seq_record.id
seqline = str(seq_record.seq)
seqline = str(seq_record.seq).upper()
seqline = seqline.replace("-", "N")
n = 3
codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
Expand Down Expand Up @@ -173,7 +171,7 @@ def create_args():
the sequence',
epilog = '%(prog)s -i tests/Den4_MAAPS_TestData16.fasta -o out_file.txt'
)
g = parser.add_mutually_exclusive_group(required=True)
g = parser.add_mutually_exclusive_group(required=False)
parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
parser.add_argument("-o", type=str, help="output file name", required=True)
g.add_argument("--gb-file", type=str, help="genbank file name")
Expand All @@ -182,7 +180,7 @@ def create_args():
parser.add_argument('--cds', type=str, help="CDS start stop[start,stop]")
return parser.parse_args()

def mod_entry(entry, cds):
def mod_entry(entry, cds=None):
'''
Find Gap positions and non-coding region positions
:param entry: iterable of (seqid,nucindex,aaindex,nuclcodon,aacodon,genename)
Expand All @@ -192,10 +190,14 @@ def mod_entry(entry, cds):
new_entry = list(entry)
nuc_pos = entry[1]
nt = entry[3]
if cds.start >= nuc_pos or cds.end <= nuc_pos:
new_entry[4] = 'NON-CODING'
if cds:
if cds.start >= nuc_pos or cds.end <= nuc_pos:
new_entry[4] = 'NON-CODING'
elif 'N' in nt:
new_entry[4] = 'GAPFOUND'
elif 'N' in nt:
new_entry[4] = 'GAPFOUND'

return tuple(new_entry)

def main():
Expand All @@ -206,33 +208,39 @@ def main():
with open(outfile, 'w+') as outf:
aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(file_name)

# Get Gene info
reference_genes, cds = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
overlapped_genes = degen.get_degen_list_overlap(reference_genes, nuc_idx)

# Remove all non-mixed positions
amb_aa_codon = filter(lambda x: '/' in x, aa)
# get amino acid index list
amb_aa_indx = map(lambda x: x//3 + 1, nuc_idx)

mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
if args.cds:
cds_start, cds_end = map(int, args.cds.split(','))
cds = degen.Gene('CDS', cds_start, cds_end)
if args.gb_file or args.tab_file or args.gb_id:
# cds might be None
reference_genes, cds = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
overlapped_genes = degen.get_degen_list_overlap(reference_genes, nuc_idx)
if args.cds:
cds_start, cds_end = map(int, args.cds.split(','))
cds = degen.Gene('CDS', cds_start, cds_end)
mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
assert cds, "No CDS info provided in file or commandline."
mixed_positions= map(lambda x: mod_entry(x, cds), mixed_positions)
headers=[
'seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition', 'gene name'
]

if cds is None:
print("No CDS information supplied via input file or on command line")
sys.exit(1)
else:
mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon)
mixed_positions = map(mod_entry, mixed_positions)
headers=[
'seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition'
]

# mark gaps and non-coding positions
mixed_positions= map(lambda x: mod_entry(x, cds), mixed_positions)
outf.write(
tabulate(
mixed_positions,
headers=[
'seq id', 'nt Position', 'aa position',
'nt composition', 'aa composition', 'gene name'
]
headers=headers
) + "\n"
)

Expand Down
2 changes: 1 addition & 1 deletion bio_bits/degen.py
Expand Up @@ -127,7 +127,7 @@ def get_genes(ref_id=None, genbank_file=None, user_file=None):
:param str genbank_file: filepath/filehandle for genbank file holding gene info
:return: (iterable Gene objects with `start`, `end`, `name`, cds Gene)
'''
assert sum(map(bool, [ref_id, genbank_file, user_file])) == 1, "Must supply exactly one of accession id (%s) or gene_file (%s), or csv/tab-delimited file %s." % (ref_id, gene_file, tab_file)
assert sum(map(bool, [ref_id, genbank_file, user_file])) == 1, "Must supply exactly one of accession id (%s) or genbank_file (%s), or csv/tab-delimited file %s." % (ref_id, genbank_file, user_file)
if ref_id:
genes = id_to_genes(ref_id)
elif genbank_file:
Expand Down
8 changes: 8 additions & 0 deletions docs/scripts/degen_regions.rst
Expand Up @@ -70,6 +70,14 @@ position 1 is coding
degen_regions -i tests/Den4_MAAPS_TestData16.fasta -o output.tsv --gb-file tests/testinput/sequence.gb --cds 1,1
Without Gene Information
------------------------
The gene information is optional. If it is not provided the output file will not be annotated with the gene information; otherwise, the output will look the same (you will also lose the "non-coding region" flag.)

.. code-block:: bash
degen_regions -i tests/Den4_MAAPS_TestData16.fasta -o output.tsv
Output
------

Expand Down

0 comments on commit 17936f8

Please sign in to comment.