Merge pull request #74 from carlosp420/dataset

Dataset
VoSeq · Feb 4, 2015 · 55bdb8f · 55bdb8f
2 parents 62f01c4 + fa55c0a
commit 55bdb8f
Show file tree

Hide file tree

Showing 6 changed files with 182 additions and 15 deletions.
diff --git a/voseq/core/utils.py b/voseq/core/utils.py
@@ -1,8 +1,11 @@
+import itertools
 import json
 import re
 
 from django.conf import settings
 
+from Bio.Seq import Seq
+
 from stats.models import Stats
 
 
@@ -38,7 +41,7 @@ def get_voucher_codes(cleaned_data):
     vouchers_to_drop = []
     for i in voucher_codes:
         if re.search('^--', i):
-            vouchers_to_drop.append(re.sub('^--', '', i))
+            vouchers_to_drop.append(re.sub('^--', '', i).lower())
 
     voucher_codes_filtered = []
     for i in voucher_codes_set:
@@ -179,3 +182,20 @@ def flatten_taxon_names_dict(dictionary):
     out_striped = re.sub('_+', '_', out)
     out_clean = re.sub('_$', '', out_striped)
     return out_clean
+
+
+def chain_and_flatten(seq1, seq2):
+    """Takes seq objects which only contain certain codon positions.
+
+    Combines the two seq objects and returns another seq object.
+
+    """
+    out = []
+    append = out.append
+
+    my_chain = itertools.zip_longest(seq1, seq2)
+    for i in itertools.chain.from_iterable(my_chain):
+        if i is not None:
+            append(i)
+
+    return Seq(''.join(out))
diff --git a/voseq/create_dataset/forms.py b/voseq/create_dataset/forms.py
@@ -24,17 +24,17 @@ class CreateDatasetForm(BaseDatasetForm):
         required=False,
     )
 
-    positions = forms.ChoiceField(
-        label='Positions',
-        help_text='codon positions',
+    positions = forms.MultipleChoiceField(
+        label='Codon Positions',
+        help_text='Codon positions to keep and write into datasets',
         choices=[
             ('ALL', 'all'),
             ('1st', '1st'),
             ('2nd', '2nd'),
             ('3rd', '3rd'),
         ],
-        widget=forms.RadioSelect(),
-        initial='ALL',
+        widget=forms.CheckboxSelectMultiple(),
+        initial=['ALL'],
         required=True,
     )
 

diff --git a/voseq/create_dataset/templates/create_dataset/index.html b/voseq/create_dataset/templates/create_dataset/index.html
@@ -65,6 +65,9 @@ <h3 class="panel-title"><b>Enter the required info to make yourself a ready-to-r
                 Amino Acids->Special->Degen->All->1st—2nd,3rd):
               </td>
               <td>
+                {% for error in form.positions.errors %}
+                <p class="text-danger">{{ error }}</p>
+                {% endfor %}
                 <b>{{ form.positions.label }}</b>
                 {{ form.positions }}
 

diff --git a/voseq/create_dataset/tests/tests_utils.py b/voseq/create_dataset/tests/tests_utils.py
@@ -1,9 +1,12 @@
+from Bio.Seq import Seq
+
 from django.test import TestCase
 from django.test.client import Client
 from django.core.management import call_command
 
 from create_dataset.utils import CreateDataset
 from public_interface.models import Genes
+from public_interface.models import TaxonSets
 
 
 class CreateDatasetUtilsTest(TestCase):
@@ -20,26 +23,47 @@ def setUp(self):
             'taxonset': None,
             'voucher_codes': 'CP100-10\r\nCP100-11',
             'geneset': None,
-            'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES']
+            'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES'],
+            'positions': ['ALL'],
         }
 
         self.c = Client()
         self.dataset_creator = CreateDataset(self.cleaned_data)
+        self.maxDiff = None
 
     def test_create_dataset(self):
-        expected = '>CP100-10_Papilionoidea_Melitaea_diamina'
+        expected = '>coi\n--------------------\n>CP100-10_Papilionoidea_Melitaea_diamina'
         result = self.dataset_creator.dataset_str
         self.assertTrue(expected in result)
 
+    def test_create_dataset_with_gene_code(self):
+        self.cleaned_data['taxon_names'] = ['CODE', 'GENECODE']
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = ">CP100-10_coi\n"
+        result = dataset_creator.dataset_str
+        self.assertTrue(expected in result)
+
     def test_get_taxon_names_for_taxa(self):
         expected = {
             'cp100-10': {'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
             'cp100-11': {'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
         }
         result = self.dataset_creator.get_taxon_names_for_taxa()
-
         self.assertEqual(expected, result)
 
+    def test_create_dataset_drop_voucher(self):
+        cleaned_data = self.cleaned_data
+        cleaned_data['voucher_codes'] = 'CP100-10\r\n--CP100-11'
+        cleaned_data['taxonset'] = TaxonSets.objects.get(taxonset_name='Erebia')
+        dataset_creator = CreateDataset(cleaned_data)
+        result = dataset_creator.dataset_str
+        self.assertTrue('CP100-11' not in result)
+
+    def test_from_seq_objs_to_fasta(self):
+        expected = 2706
+        result = self.dataset_creator.from_seq_objs_to_fasta()
+        self.assertEqual(expected, len(result))
+
     def test_get_taxon_names_for_taxa_additional_fields(self):
         self.cleaned_data['taxon_names'] = ['SUPERFAMILY']
         dataset_creator = CreateDataset(self.cleaned_data)
@@ -50,3 +74,57 @@ def test_get_taxon_names_for_taxa_additional_fields(self):
         result = dataset_creator.get_taxon_names_for_taxa()
 
         self.assertEqual(expected, result)
+
+    def test_get_sequence_first_codon_position(self):
+        self.cleaned_data['positions'] = ['1st']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("CGGTGATAAAGCTATATGGAGACAAGATGAG")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
+
+    def test_get_sequence_second_codon_position(self):
+        self.cleaned_data['positions'] = ['2nd']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("ATACGACCCCGATTAAGGGTAaGCTAATAAA")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
+
+    def test_get_sequence_third_codon_position(self):
+        self.cleaned_data['positions'] = ['3rd']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("CCCCCGCCCCTCGTCATTTCCATCCGGCGG")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
+
+    def test_get_sequence_first_and_second_codon_position(self):
+        self.cleaned_data['positions'] = ['1st', '2nd']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("CAGTGATCGGAATCACACACGGCATTATTAAATGGGGGATGAAaCGACATGAAATTGAAAGA")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
+
+    def test_get_sequence_first_and_third_codon_position(self):
+        self.cleaned_data['positions'] = ['1st', '3rd']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("CCGCGCTCGCAGTCACACACGTCCTGATTCAATTGTGTACGCAACTACACGGAGTCGGAGG")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
+
+    def test_get_sequence_second_and_third_codon_position(self):
+        self.cleaned_data['positions'] = ['2nd', '3rd']
+        self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
+        dataset_creator = CreateDataset(self.cleaned_data)
+        expected = Seq("ACTCACCCGCAGCCCCCCCCGTACTGTTACAAGTGTGTTCACaAGTCCTCAGAGTCAGAGA")
+        sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
+        result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
+        self.assertEqual(expected, result)
diff --git a/voseq/create_dataset/utils.py b/voseq/create_dataset/utils.py
@@ -4,6 +4,8 @@
 from core.utils import get_voucher_codes
 from core.utils import get_gene_codes
 from core.utils import flatten_taxon_names_dict
+from core.utils import chain_and_flatten
+from public_interface.models import Genes
 from public_interface.models import Sequences
 from public_interface.models import Vouchers
 
@@ -22,9 +24,11 @@ def __init__(self, cleaned_data):
         print(">>>>>>_init", cleaned_data)
         self.errors = []
         self.seq_objs = dict()
+        self.codon_positions = cleaned_data['positions']
         self.cleaned_data = cleaned_data
         self.voucher_codes = get_voucher_codes(cleaned_data)
         self.gene_codes = get_gene_codes(cleaned_data)
+        self.reading_frames = self.get_reading_frames()
         self.taxon_names = cleaned_data['taxon_names']
         self.dataset_str = self.create_dataset()
 
@@ -78,11 +82,8 @@ def from_seq_objs_to_fasta(self):
                     this_gene = seq_record.name
                     seq_str = '>' + this_gene + '\n' + '--------------------'
                     append(seq_str)
-                if this_gene != seq_record.name:
-                    this_gene = seq_record.name
-                    seq_str = '>' + this_gene + '\n' + '--------------------'
-                    append(seq_str)
-                seq_str = '>' + seq_record.id + '\n' + str(seq_record.seq)
+                seq_record_seq_str = str(self.get_sequence_based_on_codon_positions(this_gene, seq_record.seq))
+                seq_str = '>' + seq_record.id + '\n' + seq_record_seq_str
                 append(seq_str)
 
         return '\n'.join(fasta_str)
@@ -113,3 +114,68 @@ def get_taxon_names_for_taxa(self):
                 vouchers_with_taxon_names[code] = obj
 
         return vouchers_with_taxon_names
+
+    def get_reading_frames(self):
+        """
+
+        :return: dict of gene_code: reading_frame. If not found, flag warning.
+        """
+        reading_frames = dict()
+        genes = Genes.objects.all().values('gene_code', 'reading_frame')
+        for gene in genes:
+            gene_code = gene['gene_code'].lower()
+            if gene_code in self.gene_codes:
+                reading_frames[gene_code] = gene['reading_frame']
+        return reading_frames
+
+    def get_sequence_based_on_codon_positions(self, gene_code, seq):
+        """Puts the sequence in frame, by deleting base pairs at the begining
+        of the sequence if the reading frame is not 1:
+
+        :param gene_code: as lower case
+        :param seq: as BioPython seq object.
+        :return: sequence as Seq object with codon positions requested by user.
+
+        Example:
+            If reading frame is 2: ATGGGG becomes TGGGG. Then the sequence is
+            processed to extract the codon positions requested by the user.
+
+        """
+        if 'ALL' in self.codon_positions:
+            return seq
+
+        reading_frame = int(self.reading_frames[gene_code.lower()]) - 1
+        seq = seq[reading_frame:]
+
+        # This is the BioPython way to get codon positions
+        # http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc19
+        first_position = seq[0::3]
+        second_position = seq[1::3]
+        third_position = seq[2::3]
+
+        if '1st' in self.codon_positions \
+                and '2nd' not in self.codon_positions \
+                and '3rd' not in self.codon_positions:
+            return first_position
+
+        if '2nd' in self.codon_positions \
+                and '1st' not in self.codon_positions \
+                and '3rd' not in self.codon_positions:
+            return second_position
+
+        if '3rd' in self.codon_positions \
+                and '1st' not in self.codon_positions \
+                and '2nd' not in self.codon_positions:
+            return third_position
+
+        if '1st' in self.codon_positions and '2nd' in self.codon_positions \
+                and '3rd' not in self.codon_positions:
+            return chain_and_flatten(first_position, second_position)
+
+        if '1st' in self.codon_positions and '3rd' in self.codon_positions \
+                and '2nd' not in self.codon_positions:
+            return chain_and_flatten(first_position, third_position)
+
+        if '2nd' in self.codon_positions and '3rd' in self.codon_positions \
+                and '1st' not in self.codon_positions:
+            return chain_and_flatten(second_position, third_position)
diff --git a/voseq/genbank_fasta/tests/test_utils.py b/voseq/genbank_fasta/tests/test_utils.py
@@ -44,6 +44,6 @@ def test_get_voucher_codes(self):
 
     def test_get_voucher_codes_dropped(self):
         self.cleaned_data['voucher_codes'] = 'CP100-10\r\n--CP100-11\r\nCP100-12'
-        expected = 3
+        expected = 2
         result = get_voucher_codes(self.cleaned_data)
         self.assertEqual(expected, len(result))