Merge pull request #100 from VDBWRAIR/mapps-fix

So I think this looks good
VDBWRAIR · Mar 16, 2016 · 17936f8 · 17936f8
2 parents f7cc855 + 1c8f6f6
commit 17936f8
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 24 deletions.
diff --git a/bio_bits/ctleptop.py b/bio_bits/ctleptop.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 # encoding: utf-8
 """
-ctleptop.py -i [FASTA FILE] > Out_file.txt
-
 Created by Dereje Jima on May 21, 2015
 """
 from __future__ import division
@@ -109,7 +107,7 @@ def access_mixed_aa(file_name):
     seqids = []
     for seq_record in SeqIO.parse(file_name, 'fasta'):
         seq_id = seq_record.id
-        seqline = str(seq_record.seq)
+        seqline = str(seq_record.seq).upper()
         seqline = seqline.replace("-", "N")
         n = 3
         codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
@@ -173,7 +171,7 @@ def create_args():
              the sequence',
         epilog = '%(prog)s -i tests/Den4_MAAPS_TestData16.fasta -o out_file.txt'
     )
-    g = parser.add_mutually_exclusive_group(required=True)
+    g = parser.add_mutually_exclusive_group(required=False)
     parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
     parser.add_argument("-o", type=str,  help="output file name", required=True)
     g.add_argument("--gb-file", type=str,  help="genbank file name")
@@ -182,7 +180,7 @@ def create_args():
     parser.add_argument('--cds', type=str, help="CDS start stop[start,stop]")
     return parser.parse_args()
 
-def mod_entry(entry, cds):
+def mod_entry(entry, cds=None):
     '''
     Find Gap positions and non-coding region positions
     :param entry: iterable of (seqid,nucindex,aaindex,nuclcodon,aacodon,genename)
@@ -192,10 +190,14 @@ def mod_entry(entry, cds):
     new_entry = list(entry)
     nuc_pos = entry[1]
     nt = entry[3]
-    if cds.start >= nuc_pos or cds.end <= nuc_pos:
-        new_entry[4] = 'NON-CODING'
+    if cds:
+        if cds.start >= nuc_pos or cds.end <= nuc_pos:
+            new_entry[4] = 'NON-CODING'
+        elif 'N' in nt:
+            new_entry[4] = 'GAPFOUND'
     elif 'N' in nt:
         new_entry[4] = 'GAPFOUND'
+
     return tuple(new_entry)
 
 def main():
@@ -206,33 +208,39 @@ def main():
     with open(outfile, 'w+') as outf:
         aa, nuc_idx, nucl_codon, seqids = access_mixed_aa(file_name)
 
-        # Get Gene info
-        reference_genes, cds = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
-        overlapped_genes = degen.get_degen_list_overlap(reference_genes, nuc_idx)
-
         # Remove all non-mixed positions
         amb_aa_codon = filter(lambda x: '/' in x, aa)
         # get amino acid index list
         amb_aa_indx = map(lambda x: x//3 + 1, nuc_idx)
 
-        mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
-        if args.cds:
-            cds_start, cds_end = map(int, args.cds.split(','))
-            cds = degen.Gene('CDS', cds_start, cds_end)
+        if args.gb_file or args.tab_file or args.gb_id:
+            # cds might be None
+            reference_genes, cds = degen.get_genes(args.gb_id, args.gb_file, args.tab_file)
+            overlapped_genes = degen.get_degen_list_overlap(reference_genes, nuc_idx)
+            if args.cds:
+                cds_start, cds_end = map(int, args.cds.split(','))
+                cds = degen.Gene('CDS', cds_start, cds_end)
+            mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon, overlapped_genes)
+            assert cds, "No CDS info provided in file or commandline."
+            mixed_positions= map(lambda x: mod_entry(x, cds), mixed_positions)
+            headers=[
+                    'seq id', 'nt Position', 'aa position',
+                    'nt composition', 'aa composition', 'gene name'
+                ]
 
-        if cds is None:
-            print("No CDS information supplied via input file or on command line")
-            sys.exit(1)
+        else:
+            mixed_positions = zip(seqids, nuc_idx, amb_aa_indx, nucl_codon, amb_aa_codon)
+            mixed_positions = map(mod_entry, mixed_positions)
+            headers=[
+                    'seq id', 'nt Position', 'aa position',
+                    'nt composition', 'aa composition'
+                ]
 
         # mark gaps and non-coding positions
-        mixed_positions= map(lambda x: mod_entry(x, cds), mixed_positions)
         outf.write(
             tabulate(
                 mixed_positions,
-                headers=[
-                    'seq id', 'nt Position', 'aa position',
-                    'nt composition', 'aa composition', 'gene name'
-                ]
+                headers=headers
             ) + "\n"
         )
 

diff --git a/bio_bits/degen.py b/bio_bits/degen.py
@@ -127,7 +127,7 @@ def get_genes(ref_id=None, genbank_file=None, user_file=None):
     :param str genbank_file: filepath/filehandle for genbank file holding gene info
     :return: (iterable Gene objects with `start`, `end`, `name`, cds Gene)
     '''
-    assert  sum(map(bool, [ref_id, genbank_file, user_file])) == 1, "Must supply exactly one of accession id (%s) or gene_file (%s), or csv/tab-delimited file %s." % (ref_id, gene_file, tab_file)
+    assert  sum(map(bool, [ref_id, genbank_file, user_file])) == 1, "Must supply exactly one of accession id (%s) or genbank_file (%s), or csv/tab-delimited file %s." % (ref_id, genbank_file, user_file)
     if ref_id:
         genes = id_to_genes(ref_id)
     elif genbank_file:

diff --git a/docs/scripts/degen_regions.rst b/docs/scripts/degen_regions.rst
@@ -70,6 +70,14 @@ position 1 is coding
 
     degen_regions -i tests/Den4_MAAPS_TestData16.fasta -o output.tsv --gb-file tests/testinput/sequence.gb --cds 1,1
 
+Without Gene Information
+------------------------
+The gene information is optional. If it is not provided the output file will not be annotated with the gene information; otherwise, the output will look the same (you will also lose the "non-coding region" flag.)
+
+.. code-block:: bash
+
+    degen_regions -i tests/Den4_MAAPS_TestData16.fasta -o output.tsv 
+
 Output
 ------