get seq objects based on codon positions

VoSeq · Feb 4, 2015 · 845efb1 · 845efb1
1 parent 90956f6
commit 845efb1
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 1 deletion.
diff --git a/voseq/core/utils.py b/voseq/core/utils.py
@@ -1,8 +1,11 @@
+import itertools
 import json
 import re
 
 from django.conf import settings
 
+from Bio.Seq import Seq
+
 from stats.models import Stats
 
 
@@ -179,3 +182,20 @@ def flatten_taxon_names_dict(dictionary):
     out_striped = re.sub('_+', '_', out)
     out_clean = re.sub('_$', '', out_striped)
     return out_clean
+
+
+def chain_and_flatten(seq1, seq2):
+    """Takes seq objects which only contain certain codon positions.
+
+    Combines the two seq objects and returns another seq object.
+
+    """
+    out = []
+    append = out.append
+
+    my_chain = itertools.zip_longest(seq1, seq2)
+    for i in itertools.chain.from_iterable(my_chain):
+        if i is not None:
+            append(i)
+
+    return Seq(''.join(out))
diff --git a/voseq/create_dataset/tests/tests_utils.py b/voseq/create_dataset/tests/tests_utils.py
@@ -1,3 +1,5 @@
+from Bio.Seq import Seq
+
 from django.test import TestCase
 from django.test.client import Client
 from django.core.management import call_command
@@ -20,7 +22,8 @@ def setUp(self):
             'taxonset': None,
             'voucher_codes': 'CP100-10\r\nCP100-11',
             'geneset': None,
-            'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES']
+            'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES'],
+            'positions': ['ALL'],
         }
 
         self.c = Client()
@@ -50,3 +53,12 @@ def test_get_taxon_names_for_taxa_additional_fields(self):
         result = dataset_creator.get_taxon_names_for_taxa()
 
         self.assertEqual(expected, result)
+
+    def test_get_sequence_based_on_codon_positions(self):
+        self.cleaned_data['positions'] = ['1st']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("CGGTGATAAAGCTATATGGAGACAAGATGAG")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
diff --git a/voseq/create_dataset/utils.py b/voseq/create_dataset/utils.py
@@ -4,6 +4,8 @@
 from core.utils import get_voucher_codes
 from core.utils import get_gene_codes
 from core.utils import flatten_taxon_names_dict
+from core.utils import chain_and_flatten
+from public_interface.models import Genes
 from public_interface.models import Sequences
 from public_interface.models import Vouchers
 
@@ -27,6 +29,8 @@ def __init__(self, cleaned_data):
         self.gene_codes = get_gene_codes(cleaned_data)
         self.taxon_names = cleaned_data['taxon_names']
         self.dataset_str = self.create_dataset()
+        self.codon_positions = cleaned_data['positions']
+        self.reading_frames = self.get_reading_frames()
 
     def create_dataset(self):
         self.voucher_codes = get_voucher_codes(self.cleaned_data)
@@ -113,3 +117,68 @@ def get_taxon_names_for_taxa(self):
                 vouchers_with_taxon_names[code] = obj
 
         return vouchers_with_taxon_names
+
+    def get_reading_frames(self):
+        """
+
+        :return: dict of gene_code: reading_frame. If not found, flag warning.
+        """
+        reading_frames = dict()
+        genes = Genes.objects.all().values('gene_code', 'reading_frame')
+        for gene in genes:
+            gene_code = gene['gene_code'].lower()
+            if gene_code in self.gene_codes:
+                reading_frames[gene_code] = gene['reading_frame']
+        return reading_frames
+
+    def get_sequence_based_on_codon_positions(self, gene_code, seq):
+        """Puts the sequence in frame, by deleting base pairs at the begining
+        of the sequence if the reading frame is not 1:
+
+        :param gene_code: as lower case
+        :param seq: as BioPython seq object.
+        :return: sequence as string with codon positions removed, if needed.
+
+        Example:
+            If reading frame is 2: ATGGGG becomes TGGGG. Then the sequence is
+            processed to extract the codon positions requested by the user.
+
+        """
+        if 'ALL' in self.codon_positions:
+            return seq
+
+        reading_frame = int(self.reading_frames[gene_code.lower()]) - 1
+        seq = seq[reading_frame:]
+
+        # This is the BioPython way to get codon positions
+        # http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc19
+        first_position = seq[0::3]
+        second_position = seq[1::3]
+        third_position = seq[2::3]
+
+        if '1st' in self.codon_positions \
+                and '2nd' not in self.codon_positions \
+                and '3rd' not in self.codon_positions:
+            return first_position
+
+        if '2nd' in self.codon_positions \
+                and '1st' not in self.codon_positions \
+                and '3rd' not in self.codon_positions:
+            return second_position
+
+        if '3rd' in self.codon_positions \
+                and '1st' not in self.codon_positions \
+                and '2nd' not in self.codon_positions:
+            return third_position
+
+        if '1st' in self.codon_positions and '2nd' in self.codon_positions \
+                and '3rd' not in self.codon_positions:
+            return chain_and_flatten(first_position, second_position)
+
+        if '1st' in self.codon_positions and '3rd' in self.codon_positions \
+                and '2nd' not in self.codon_positions:
+            return chain_and_flatten(first_position, third_position)
+
+        if '2nd' in self.codon_positions and '3rd' in self.codon_positions \
+                and '1st' not in self.codon_positions:
+            return chain_and_flatten(second_position, third_position)