Skip to content

Commit

Permalink
Merge branch 'dataset'
Browse files Browse the repository at this point in the history
  • Loading branch information
carlosp420 committed Feb 3, 2015
2 parents ef338ae + 155bb98 commit 99371f0
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 26 deletions.
76 changes: 76 additions & 0 deletions voseq/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,79 @@ def strip_question_marks(seq):
seq = re.sub('\?+$', '', seq)
seq = re.sub('N+$', '', seq)
return seq, removed


def flatten_taxon_names_dict(dictionary):
    """Converts a dict to string suitable for FASTA object id

    Values are emitted in a fixed taxonomic order (code, orden, superfamily,
    family, subfamily, tribe, subtribe, genus, species, subspecies, auctor,
    hostorg) regardless of the order of the keys in ``dictionary``. Keys that
    are missing, or whose value is empty or ``None``, are skipped so that they
    contribute neither text nor a stray separator. Keys other than the ones
    listed above are ignored.

    Args:
        ``dictionary``: {'code': 'CP100-10', 'orden': 'Lepidoptera', 'genus': 'Danaus'}

    Returns:
        Flattened as string: 'CP100-10_Lepidoptera_Danaus'
    """
    field_order = (
        'code', 'orden', 'superfamily', 'family', 'subfamily', 'tribe',
        'subtribe', 'genus', 'species', 'subspecies', 'auctor', 'hostorg',
    )

    parts = []
    for field in field_order:
        value = dictionary.get(field)
        # Skipping empty values here (instead of appending '' + '_') is what
        # prevents a leading underscore when the first fields are blank.
        if value:
            parts.append(value)

    # Runs of underscores coming from the values themselves are still
    # collapsed, and a trailing underscore is still dropped, as before.
    out_striped = re.sub('_+', '_', '_'.join(parts))
    out_clean = re.sub('_$', '', out_striped)
    return out_clean
2 changes: 1 addition & 1 deletion voseq/create_dataset/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class CreateDatasetForm(BaseDatasetForm):
label='What info do you want in the taxon names?',
choices=[
('CODE', 'Code'),
('ORDER', 'Order'),
('ORDEN', 'Order'),
('SUPERFAMILY', 'Superfamily'),
('FAMILY', 'Family'),
('SUBFAMILY', 'Subfamily'),
Expand Down
25 changes: 18 additions & 7 deletions voseq/create_dataset/tests/tests_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def setUp(self):

g1 = Genes.objects.get(gene_code='COI')
g2 = Genes.objects.get(gene_code='EF1a')
cleaned_data = {
self.cleaned_data = {
'gene_codes': [g1, g2],
'taxonset': None,
'voucher_codes': 'CP100-10\r\nCP100-11',
Expand All @@ -24,18 +24,29 @@ def setUp(self):
}

self.c = Client()
self.dataset_creator = CreateDataset(cleaned_data)
self.dataset_creator = CreateDataset(self.cleaned_data)

def test_create_dataset(self):
expected = '>CP100-11\n??TGAGCCGGTATAATTGGTACATCCCTAAGTCTTATTATTC'
expected = '>CP100-10_Papilionoidea_Melitaea_diamina'
result = self.dataset_creator.dataset_str
self.assertTrue(expected in result)

def test_get_taxon_names_for_taxa(self):
expected = [
{'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
{'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
]
expected = {
'cp100-10': {'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
'cp100-11': {'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
}
result = self.dataset_creator.get_taxon_names_for_taxa()

self.assertEqual(expected, result)

def test_get_taxon_names_for_taxa_additional_fields(self):
    """Only the requested taxon field should appear in each voucher's dict."""
    # Narrow the form data to a single taxon-name field before building
    # a fresh dataset creator from it.
    self.cleaned_data['taxon_names'] = ['SUPERFAMILY']
    creator = CreateDataset(self.cleaned_data)

    self.assertEqual(
        {
            'cp100-10': {'superfamily': 'Papilionoidea'},
            'cp100-11': {'superfamily': ''},
        },
        creator.get_taxon_names_for_taxa(),
    )
33 changes: 15 additions & 18 deletions voseq/create_dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from core.utils import get_voucher_codes
from core.utils import get_gene_codes
from core.utils import flatten_taxon_names_dict
from public_interface.models import Sequences
from public_interface.models import Vouchers

Expand All @@ -22,10 +23,10 @@ def __init__(self, cleaned_data):
self.errors = []
self.seq_objs = dict()
self.cleaned_data = cleaned_data
self.dataset_str = self.create_dataset()
self.voucher_codes = get_voucher_codes(cleaned_data)
self.gene_codes = get_gene_codes(cleaned_data)
self.taxon_names = cleaned_data['taxon_names']
self.dataset_str = self.create_dataset()

def create_dataset(self):
self.voucher_codes = get_voucher_codes(self.cleaned_data)
Expand All @@ -37,22 +38,22 @@ def create_seq_objs(self):
"""Generate a list of sequence objects. Also takes into account the
genes passed as geneset.
Args:
* ``voucher_codes``: list of vouchers codes, cleaned by our Form.
* ``gene_codes``: list of gene codes, cleaned by our Form.
Returns:
list of sequence objects as produced by BioPython.
"""
our_taxon_names = self.get_taxon_names_for_taxa()

all_seqs = Sequences.objects.all().values('code_id', 'gene_code', 'sequences').order_by('code_id')
for s in all_seqs:
code = s['code_id'].lower()
gene_code = s['gene_code'].lower()
if code in self.voucher_codes and gene_code in self.gene_codes:
seq = Seq(s['sequences'])
seq_obj = SeqRecord(seq)
seq_obj.id = code
seq_obj.id = flatten_taxon_names_dict(our_taxon_names[code])
if 'GENECODE' in self.taxon_names:
seq_obj.id += '_' + gene_code
seq_obj.name = gene_code

if gene_code not in self.seq_objs:
Expand Down Expand Up @@ -81,26 +82,21 @@ def from_seq_objs_to_fasta(self):
this_gene = seq_record.name
seq_str = '>' + this_gene + '\n' + '--------------------'
append(seq_str)
seq_str = '>' + seq_record.id.upper() + '\n' + str(seq_record.seq)
seq_str = '>' + seq_record.id + '\n' + str(seq_record.seq)
append(seq_str)

return '\n'.join(fasta_str)

def get_taxon_names_for_taxa(self):
"""Returns list of dicts: {'taxon': 'name'}
"""Returns dict: {'CP100-10': {'taxon': 'name'}}
Takes list of voucher_codes and list of taxon_names from cleaned form.
Args:
* ``voucher_codes``
* ``taxon_names``
Returns:
List of dictionaries with data.
Dictionar with data, also as dicts.
"""
vouchers_with_taxon_names = []
append = vouchers_with_taxon_names.append
vouchers_with_taxon_names = {}

all_vouchers = Vouchers.objects.all().order_by('code').values('code', 'orden', 'superfamily',
'family', 'subfamily', 'tribe',
Expand All @@ -111,8 +107,9 @@ def get_taxon_names_for_taxa(self):
if code in self.voucher_codes:
obj = dict()
for taxon_name in self.taxon_names:
taxon_name = taxon_name.lower()
obj[taxon_name] = voucher[taxon_name]
append(obj)
if taxon_name != 'GENECODE':
taxon_name = taxon_name.lower()
obj[taxon_name] = voucher[taxon_name]
vouchers_with_taxon_names[code] = obj

return vouchers_with_taxon_names

0 comments on commit 99371f0

Please sign in to comment.