VoSeq · carlosp420 · Feb 3, 2015 · Feb 3, 2015 · Feb 3, 2015
diff --git a/voseq/core/utils.py b/voseq/core/utils.py
@@ -103,3 +103,79 @@ def strip_question_marks(seq):
     seq = re.sub('\?+$', '', seq)
     seq = re.sub('N+$', '', seq)
     return seq, removed
+
+
+def flatten_taxon_names_dict(dictionary):
+    """Convert a dict of taxon names to a string suitable for a FASTA id.
+
+    Values are joined with underscores in a fixed order (see ``field_order``
+    below), regardless of the order of the keys in ``dictionary``. Keys that
+    are not listed there are ignored.
+
+    Missing keys are skipped, and so are values that are empty or ``None``,
+    so they never add a leading, trailing or doubled underscore.
+
+    Args:
+        ``dictionary``: {'code': 'CP100-10', 'orden': 'Lepidoptera', 'genus': 'Danaus'}
+
+    Returns:
+        Flattened as string: 'CP100-10_Lepidoptera_Danaus'
+
+    Examples:
+        >>> flatten_taxon_names_dict({'genus': 'Danaus', 'code': 'CP100-10'})
+        'CP100-10_Danaus'
+        >>> flatten_taxon_names_dict({'code': 'CP100-11', 'superfamily': ''})
+        'CP100-11'
+
+    """
+    # Order in which the names appear in the flattened string.
+    field_order = (
+        'code',
+        'orden',
+        'superfamily',
+        'family',
+        'subfamily',
+        'tribe',
+        'subtribe',
+        'genus',
+        'species',
+        'subspecies',
+        'auctor',
+        'hostorg',
+    )
+
+    parts = []
+    for field in field_order:
+        value = dictionary.get(field)
+        # Missing keys, ``None`` and empty strings contribute nothing.
+        if value:
+            parts.append(value)
+
+    out = '_'.join(parts)
+
+    # Values may themselves contain runs of underscores or end with one:
+    # collapse each run to a single underscore and drop a trailing one.
+    out_striped = re.sub('_+', '_', out)
+    out_clean = re.sub('_$', '', out_striped)
+    return out_clean
diff --git a/voseq/create_dataset/forms.py b/voseq/create_dataset/forms.py
@@ -85,7 +85,7 @@ class CreateDatasetForm(BaseDatasetForm):
         label='What info do you want in the taxon names?',
         choices=[
             ('CODE', 'Code'),
-            ('ORDER', 'Order'),
+            ('ORDEN', 'Order'),
             ('SUPERFAMILY', 'Superfamily'),
             ('FAMILY', 'Family'),
             ('SUBFAMILY', 'Subfamily'),

diff --git a/voseq/create_dataset/tests/tests_utils.py b/voseq/create_dataset/tests/tests_utils.py
@@ -15,7 +15,7 @@ def setUp(self):
 
         g1 = Genes.objects.get(gene_code='COI')
         g2 = Genes.objects.get(gene_code='EF1a')
-        cleaned_data = {
+        self.cleaned_data = {
             'gene_codes': [g1, g2],
             'taxonset': None,
             'voucher_codes': 'CP100-10\r\nCP100-11',
@@ -24,18 +24,29 @@ def setUp(self):
         }
 
         self.c = Client()
-        self.dataset_creator = CreateDataset(cleaned_data)
+        self.dataset_creator = CreateDataset(self.cleaned_data)
 
     def test_create_dataset(self):
-        expected = '>CP100-11\n??TGAGCCGGTATAATTGGTACATCCCTAAGTCTTATTATTC'
+        # The FASTA header is built from the flattened taxon names, not just the code.
+        expected = '>CP100-10_Papilionoidea_Melitaea_diamina'
         result = self.dataset_creator.dataset_str
         self.assertTrue(expected in result)
 
     def test_get_taxon_names_for_taxa(self):
-        expected = [
-            {'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
-            {'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
-        ]
+        # Keyed by lower-case voucher code; each value maps taxon field to name.
+        expected = {
+            'cp100-10': {'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
+            'cp100-11': {'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
+        }
         result = self.dataset_creator.get_taxon_names_for_taxa()
 
         self.assertEqual(expected, result)
+
+    def test_get_taxon_names_for_taxa_additional_fields(self):
+        # Request a single taxon field and build a fresh creator from the form data.
+        self.cleaned_data['taxon_names'] = ['SUPERFAMILY']
+        taxon_names_by_voucher = CreateDataset(self.cleaned_data).get_taxon_names_for_taxa()
+
+        # Each voucher's dict should hold only the requested field.
+        self.assertEqual(
+            {
+                'cp100-10': {'superfamily': 'Papilionoidea'},
+                'cp100-11': {'superfamily': ''},
+            },
+            taxon_names_by_voucher,
+        )
diff --git a/voseq/create_dataset/utils.py b/voseq/create_dataset/utils.py
@@ -3,6 +3,7 @@
 
 from core.utils import get_voucher_codes
 from core.utils import get_gene_codes
+from core.utils import flatten_taxon_names_dict
 from public_interface.models import Sequences
 from public_interface.models import Vouchers
 
@@ -22,10 +23,10 @@ def __init__(self, cleaned_data):
         self.errors = []
         self.seq_objs = dict()
         self.cleaned_data = cleaned_data
-        self.dataset_str = self.create_dataset()
         self.voucher_codes = get_voucher_codes(cleaned_data)
         self.gene_codes = get_gene_codes(cleaned_data)
         self.taxon_names = cleaned_data['taxon_names']
+        self.dataset_str = self.create_dataset()
 
     def create_dataset(self):
         self.voucher_codes = get_voucher_codes(self.cleaned_data)
@@ -37,22 +38,22 @@ def create_seq_objs(self):
         """Generate a list of sequence objects. Also takes into account the
         genes passed as geneset.
 
-        Args:
-            * ``voucher_codes``: list of vouchers codes, cleaned by our Form.
-            * ``gene_codes``: list of gene codes, cleaned by our Form.
-
         Returns:
             list of sequence objects as produced by BioPython.
 
         """
+        our_taxon_names = self.get_taxon_names_for_taxa()
+
         all_seqs = Sequences.objects.all().values('code_id', 'gene_code', 'sequences').order_by('code_id')
         for s in all_seqs:
             code = s['code_id'].lower()
             gene_code = s['gene_code'].lower()
             if code in self.voucher_codes and gene_code in self.gene_codes:
                 seq = Seq(s['sequences'])
                 seq_obj = SeqRecord(seq)
-                seq_obj.id = code
+                seq_obj.id = flatten_taxon_names_dict(our_taxon_names[code])
+                if 'GENECODE' in self.taxon_names:
+                    seq_obj.id += '_' + gene_code
                 seq_obj.name = gene_code
 
                 if gene_code not in self.seq_objs:
@@ -81,26 +82,21 @@ def from_seq_objs_to_fasta(self):
                     this_gene = seq_record.name
                     seq_str = '>' + this_gene + '\n' + '--------------------'
                     append(seq_str)
-                seq_str = '>' + seq_record.id.upper() + '\n' + str(seq_record.seq)
+                seq_str = '>' + seq_record.id + '\n' + str(seq_record.seq)
                 append(seq_str)
 
         return '\n'.join(fasta_str)
 
     def get_taxon_names_for_taxa(self):
-        """Returns list of dicts: {'taxon': 'name'}
+        """Returns dict: {'cp100-10': {'taxon': 'name'}}
 
         Takes list of voucher_codes and list of taxon_names from cleaned form.
 
-        Args:
-            * ``voucher_codes``
-            * ``taxon_names``
-
         Returns:
-            List of dictionaries with data.
+            Dictionary keyed by voucher code; each value is a dict of taxon names.
 
         """
-        vouchers_with_taxon_names = []
-        append = vouchers_with_taxon_names.append
+        vouchers_with_taxon_names = {}
 
         all_vouchers = Vouchers.objects.all().order_by('code').values('code', 'orden', 'superfamily',
                                                                       'family', 'subfamily', 'tribe',
@@ -111,8 +107,9 @@ def get_taxon_names_for_taxa(self):
             if code in self.voucher_codes:
                 obj = dict()
                 for taxon_name in self.taxon_names:
-                    taxon_name = taxon_name.lower()
-                    obj[taxon_name] = voucher[taxon_name]
-                append(obj)
+                    if taxon_name != 'GENECODE':
+                        taxon_name = taxon_name.lower()
+                        obj[taxon_name] = voucher[taxon_name]
+                vouchers_with_taxon_names[code] = obj
 
         return vouchers_with_taxon_names