Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

append higher taxon names to seq objects id #73

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions voseq/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,79 @@ def strip_question_marks(seq):
seq = re.sub('\?+$', '', seq)
seq = re.sub('N+$', '', seq)
return seq, removed


def flatten_taxon_names_dict(dictionary):
    """Converts a dict of taxon names to a string suitable for a FASTA object id.

    Fields are emitted in a fixed order (voucher code first, then taxonomic
    ranks from highest to lowest, then author and host organism), regardless
    of the order of the keys in ``dictionary``. Keys that are missing, or
    whose value is empty or ``None``, are skipped so that they leave no stray
    underscores in the result.

    Args:
        ``dictionary``: {'code': 'CP100-10', 'orden': 'Lepidoptera', 'genus': 'Danaus'}

    Returns:
        Flattened as string: 'CP100-10_Lepidoptera_Danaus'

    """
    field_order = (
        'code',
        'orden',
        'superfamily',
        'family',
        'subfamily',
        'tribe',
        'subtribe',
        'genus',
        'species',
        'subspecies',
        'auctor',
        'hostorg',
    )

    parts = []
    for field in field_order:
        value = dictionary.get(field)
        # Skip absent, empty and None values: they carry no information and
        # would otherwise produce leading or doubled separators.
        if value:
            parts.append(value)

    out = '_'.join(parts)
    # Values may themselves contain runs of underscores or end with one;
    # normalise those so the id has single separators and no trailing one.
    out_stripped = re.sub(r'_+', '_', out)
    out_clean = re.sub(r'_$', '', out_stripped)
    return out_clean
2 changes: 1 addition & 1 deletion voseq/create_dataset/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class CreateDatasetForm(BaseDatasetForm):
label='What info do you want in the taxon names?',
choices=[
('CODE', 'Code'),
('ORDER', 'Order'),
('ORDEN', 'Order'),
('SUPERFAMILY', 'Superfamily'),
('FAMILY', 'Family'),
('SUBFAMILY', 'Subfamily'),
Expand Down
25 changes: 18 additions & 7 deletions voseq/create_dataset/tests/tests_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def setUp(self):

g1 = Genes.objects.get(gene_code='COI')
g2 = Genes.objects.get(gene_code='EF1a')
cleaned_data = {
self.cleaned_data = {
'gene_codes': [g1, g2],
'taxonset': None,
'voucher_codes': 'CP100-10\r\nCP100-11',
Expand All @@ -24,18 +24,29 @@ def setUp(self):
}

self.c = Client()
self.dataset_creator = CreateDataset(cleaned_data)
self.dataset_creator = CreateDataset(self.cleaned_data)

def test_create_dataset(self):
expected = '>CP100-11\n??TGAGCCGGTATAATTGGTACATCCCTAAGTCTTATTATTC'
expected = '>CP100-10_Papilionoidea_Melitaea_diamina'
result = self.dataset_creator.dataset_str
self.assertTrue(expected in result)

def test_get_taxon_names_for_taxa(self):
expected = [
{'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
{'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
]
expected = {
'cp100-10': {'code': 'CP100-10', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': 'Papilionoidea'},
'cp100-11': {'code': 'CP100-11', 'genus': 'Melitaea', 'species': 'diamina', 'superfamily': ''},
}
result = self.dataset_creator.get_taxon_names_for_taxa()

self.assertEqual(expected, result)

def test_get_taxon_names_for_taxa_additional_fields(self):
    # When only SUPERFAMILY is requested, each voucher's entry must contain
    # just that field, keyed by the lower-cased voucher code.
    self.cleaned_data['taxon_names'] = ['SUPERFAMILY']
    creator = CreateDataset(self.cleaned_data)

    superfamily_by_code = {
        'cp100-10': {'superfamily': 'Papilionoidea'},
        'cp100-11': {'superfamily': ''},
    }
    taxon_names = creator.get_taxon_names_for_taxa()

    self.assertEqual(superfamily_by_code, taxon_names)
33 changes: 15 additions & 18 deletions voseq/create_dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from core.utils import get_voucher_codes
from core.utils import get_gene_codes
from core.utils import flatten_taxon_names_dict
from public_interface.models import Sequences
from public_interface.models import Vouchers

Expand All @@ -22,10 +23,10 @@ def __init__(self, cleaned_data):
self.errors = []
self.seq_objs = dict()
self.cleaned_data = cleaned_data
self.dataset_str = self.create_dataset()
self.voucher_codes = get_voucher_codes(cleaned_data)
self.gene_codes = get_gene_codes(cleaned_data)
self.taxon_names = cleaned_data['taxon_names']
self.dataset_str = self.create_dataset()

def create_dataset(self):
self.voucher_codes = get_voucher_codes(self.cleaned_data)
Expand All @@ -37,22 +38,22 @@ def create_seq_objs(self):
"""Generate a list of sequence objects. Also takes into account the
genes passed as geneset.

Args:
* ``voucher_codes``: list of vouchers codes, cleaned by our Form.
* ``gene_codes``: list of gene codes, cleaned by our Form.

Returns:
list of sequence objects as produced by BioPython.

"""
our_taxon_names = self.get_taxon_names_for_taxa()

all_seqs = Sequences.objects.all().values('code_id', 'gene_code', 'sequences').order_by('code_id')
for s in all_seqs:
code = s['code_id'].lower()
gene_code = s['gene_code'].lower()
if code in self.voucher_codes and gene_code in self.gene_codes:
seq = Seq(s['sequences'])
seq_obj = SeqRecord(seq)
seq_obj.id = code
seq_obj.id = flatten_taxon_names_dict(our_taxon_names[code])
if 'GENECODE' in self.taxon_names:
seq_obj.id += '_' + gene_code
seq_obj.name = gene_code

if gene_code not in self.seq_objs:
Expand Down Expand Up @@ -81,26 +82,21 @@ def from_seq_objs_to_fasta(self):
this_gene = seq_record.name
seq_str = '>' + this_gene + '\n' + '--------------------'
append(seq_str)
seq_str = '>' + seq_record.id.upper() + '\n' + str(seq_record.seq)
seq_str = '>' + seq_record.id + '\n' + str(seq_record.seq)
append(seq_str)

return '\n'.join(fasta_str)

def get_taxon_names_for_taxa(self):
"""Returns list of dicts: {'taxon': 'name'}
"""Returns dict: {'CP100-10': {'taxon': 'name'}}

Takes list of voucher_codes and list of taxon_names from cleaned form.

Args:
* ``voucher_codes``
* ``taxon_names``

Returns:
List of dictionaries with data.
Dictionary with data, also as dicts.

"""
vouchers_with_taxon_names = []
append = vouchers_with_taxon_names.append
vouchers_with_taxon_names = {}

all_vouchers = Vouchers.objects.all().order_by('code').values('code', 'orden', 'superfamily',
'family', 'subfamily', 'tribe',
Expand All @@ -111,8 +107,9 @@ def get_taxon_names_for_taxa(self):
if code in self.voucher_codes:
obj = dict()
for taxon_name in self.taxon_names:
taxon_name = taxon_name.lower()
obj[taxon_name] = voucher[taxon_name]
append(obj)
if taxon_name != 'GENECODE':
taxon_name = taxon_name.lower()
obj[taxon_name] = voucher[taxon_name]
vouchers_with_taxon_names[code] = obj

return vouchers_with_taxon_names