Skip to content

Commit

Permalink
Merge pull request #74 from carlosp420/dataset
Browse files Browse the repository at this point in the history
Dataset
  • Loading branch information
carlosp420 committed Feb 4, 2015
2 parents 62f01c4 + fa55c0a commit 55bdb8f
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 15 deletions.
22 changes: 21 additions & 1 deletion voseq/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import itertools
import json
import re

from django.conf import settings

from Bio.Seq import Seq

from stats.models import Stats


Expand Down Expand Up @@ -38,7 +41,7 @@ def get_voucher_codes(cleaned_data):
vouchers_to_drop = []
for i in voucher_codes:
if re.search('^--', i):
vouchers_to_drop.append(re.sub('^--', '', i))
vouchers_to_drop.append(re.sub('^--', '', i).lower())

voucher_codes_filtered = []
for i in voucher_codes_set:
Expand Down Expand Up @@ -179,3 +182,20 @@ def flatten_taxon_names_dict(dictionary):
out_striped = re.sub('_+', '_', out)
out_clean = re.sub('_$', '', out_striped)
return out_clean


def chain_and_flatten(seq1, seq2):
    """Interleave the items of two seq objects into a single Seq.

    Items are taken alternately: first item of ``seq1``, first item of
    ``seq2``, second item of ``seq1``, and so on.  When one input runs out
    before the other, the remaining items of the longer one are kept in
    order.

    :param seq1: seq object holding only certain codon positions.
    :param seq2: seq object holding only certain codon positions.
    :return: Seq object built from the interleaved items.
    """
    # zip_longest pads the shorter input with None; those fillers are
    # dropped while flattening the pairs.
    paired = itertools.zip_longest(seq1, seq2)
    interleaved = [
        base
        for pair in paired
        for base in pair
        if base is not None
    ]
    return Seq(''.join(interleaved))
10 changes: 5 additions & 5 deletions voseq/create_dataset/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@ class CreateDatasetForm(BaseDatasetForm):
required=False,
)

positions = forms.ChoiceField(
label='Positions',
help_text='codon positions',
positions = forms.MultipleChoiceField(
label='Codon Positions',
help_text='Codon positions to keep and write into datasets',
choices=[
('ALL', 'all'),
('1st', '1st'),
('2nd', '2nd'),
('3rd', '3rd'),
],
widget=forms.RadioSelect(),
initial='ALL',
widget=forms.CheckboxSelectMultiple(),
initial=['ALL'],
required=True,
)

Expand Down
3 changes: 3 additions & 0 deletions voseq/create_dataset/templates/create_dataset/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ <h3 class="panel-title"><b>Enter the required info to make yourself a ready-to-r
Amino Acids->Special->Degen->All->1st—2nd,3rd):
</td>
<td>
{% for error in form.positions.errors %}
<p class="text-danger">{{ error }}</p>
{% endfor %}
<b>{{ form.positions.label }}</b>
{{ form.positions }}

Expand Down
84 changes: 81 additions & 3 deletions voseq/create_dataset/tests/tests_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from Bio.Seq import Seq

from django.test import TestCase
from django.test.client import Client
from django.core.management import call_command

from create_dataset.utils import CreateDataset
from public_interface.models import Genes
from public_interface.models import TaxonSets


class CreateDatasetUtilsTest(TestCase):
Expand All @@ -20,26 +23,47 @@ def setUp(self):
'taxonset': None,
'voucher_codes': 'CP100-10\r\nCP100-11',
'geneset': None,
'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES']
'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES'],
'positions': ['ALL'],
}

self.c = Client()
self.dataset_creator = CreateDataset(self.cleaned_data)
self.maxDiff = None

def test_create_dataset(self):
    """The dataset string contains the ``coi`` gene separator followed by
    the header of voucher CP100-10.
    """
    expected = '>coi\n--------------------\n>CP100-10_Papilionoidea_Melitaea_diamina'
    result = self.dataset_creator.dataset_str
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn(expected, result)

def test_create_dataset_with_gene_code(self):
    """With taxon names CODE and GENECODE the record header is
    ``>CP100-10_coi``.
    """
    self.cleaned_data['taxon_names'] = ['CODE', 'GENECODE']
    dataset_creator = CreateDataset(self.cleaned_data)
    expected = ">CP100-10_coi\n"
    result = dataset_creator.dataset_str
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn(expected, result)

def test_get_taxon_names_for_taxa(self):
    """Taxon names are returned per voucher, keyed by lower-cased code."""
    taxon_names = self.dataset_creator.get_taxon_names_for_taxa()

    self.assertEqual(
        {
            'cp100-10': {
                'code': 'CP100-10',
                'genus': 'Melitaea',
                'species': 'diamina',
                'superfamily': 'Papilionoidea',
            },
            'cp100-11': {
                'code': 'CP100-11',
                'genus': 'Melitaea',
                'species': 'diamina',
                'superfamily': '',
            },
        },
        taxon_names,
    )

def test_create_dataset_drop_voucher(self):
    """A voucher code prefixed with ``--`` does not appear in the dataset,
    even when a taxonset is supplied as well.
    """
    cleaned_data = self.cleaned_data
    cleaned_data['voucher_codes'] = 'CP100-10\r\n--CP100-11'
    cleaned_data['taxonset'] = TaxonSets.objects.get(taxonset_name='Erebia')
    dataset_creator = CreateDataset(cleaned_data)
    result = dataset_creator.dataset_str
    # assertNotIn reports both operands on failure, unlike
    # assertTrue(x not in y).
    self.assertNotIn('CP100-11', result)

def test_from_seq_objs_to_fasta(self):
    """The FASTA string built from the fixture data is 2706 characters long."""
    fasta = self.dataset_creator.from_seq_objs_to_fasta()
    self.assertEqual(2706, len(fasta))

def test_get_taxon_names_for_taxa_additional_fields(self):
self.cleaned_data['taxon_names'] = ['SUPERFAMILY']
dataset_creator = CreateDataset(self.cleaned_data)
Expand All @@ -50,3 +74,57 @@ def test_get_taxon_names_for_taxa_additional_fields(self):
result = dataset_creator.get_taxon_names_for_taxa()

self.assertEqual(expected, result)

def test_get_sequence_first_codon_position(self):
    """Requesting only '1st' yields the first codon positions of wingless."""
    self.cleaned_data['positions'] = ['1st']
    self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
    full_sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
    creator = CreateDataset(self.cleaned_data)

    extracted = creator.get_sequence_based_on_codon_positions('wingless', full_sequence)

    self.assertEqual(Seq("CGGTGATAAAGCTATATGGAGACAAGATGAG"), extracted)

def test_get_sequence_second_codon_position(self):
    """Requesting only '2nd' yields the second codon positions of wingless."""
    self.cleaned_data['positions'] = ['2nd']
    self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
    full_sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
    creator = CreateDataset(self.cleaned_data)

    extracted = creator.get_sequence_based_on_codon_positions('wingless', full_sequence)

    self.assertEqual(Seq("ATACGACCCCGATTAAGGGTAaGCTAATAAA"), extracted)

def test_get_sequence_third_codon_position(self):
    """Requesting only '3rd' yields the third codon positions of wingless."""
    self.cleaned_data['positions'] = ['3rd']
    self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
    full_sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
    creator = CreateDataset(self.cleaned_data)

    extracted = creator.get_sequence_based_on_codon_positions('wingless', full_sequence)

    self.assertEqual(Seq("CCCCCGCCCCTCGTCATTTCCATCCGGCGG"), extracted)

def test_get_sequence_first_and_second_codon_position(self):
    """Requesting '1st' and '2nd' yields those positions interleaved."""
    self.cleaned_data['positions'] = ['1st', '2nd']
    self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
    full_sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
    creator = CreateDataset(self.cleaned_data)

    extracted = creator.get_sequence_based_on_codon_positions('wingless', full_sequence)

    self.assertEqual(
        Seq("CAGTGATCGGAATCACACACGGCATTATTAAATGGGGGATGAAaCGACATGAAATTGAAAGA"),
        extracted,
    )

def test_get_sequence_first_and_third_codon_position(self):
    """Requesting '1st' and '3rd' yields those positions interleaved."""
    self.cleaned_data['positions'] = ['1st', '3rd']
    self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
    full_sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
    creator = CreateDataset(self.cleaned_data)

    extracted = creator.get_sequence_based_on_codon_positions('wingless', full_sequence)

    self.assertEqual(
        Seq("CCGCGCTCGCAGTCACACACGTCCTGATTCAATTGTGTACGCAACTACACGGAGTCGGAGG"),
        extracted,
    )

def test_get_sequence_second_and_third_codon_position(self):
    """Requesting '2nd' and '3rd' yields those positions interleaved."""
    self.cleaned_data['positions'] = ['2nd', '3rd']
    self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
    full_sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
    creator = CreateDataset(self.cleaned_data)

    extracted = creator.get_sequence_based_on_codon_positions('wingless', full_sequence)

    self.assertEqual(
        Seq("ACTCACCCGCAGCCCCCCCCGTACTGTTACAAGTGTGTTCACaAGTCCTCAGAGTCAGAGA"),
        extracted,
    )
76 changes: 71 additions & 5 deletions voseq/create_dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from core.utils import get_voucher_codes
from core.utils import get_gene_codes
from core.utils import flatten_taxon_names_dict
from core.utils import chain_and_flatten
from public_interface.models import Genes
from public_interface.models import Sequences
from public_interface.models import Vouchers

Expand All @@ -22,9 +24,11 @@ def __init__(self, cleaned_data):
print(">>>>>>_init", cleaned_data)
self.errors = []
self.seq_objs = dict()
self.codon_positions = cleaned_data['positions']
self.cleaned_data = cleaned_data
self.voucher_codes = get_voucher_codes(cleaned_data)
self.gene_codes = get_gene_codes(cleaned_data)
self.reading_frames = self.get_reading_frames()
self.taxon_names = cleaned_data['taxon_names']
self.dataset_str = self.create_dataset()

Expand Down Expand Up @@ -78,11 +82,8 @@ def from_seq_objs_to_fasta(self):
this_gene = seq_record.name
seq_str = '>' + this_gene + '\n' + '--------------------'
append(seq_str)
if this_gene != seq_record.name:
this_gene = seq_record.name
seq_str = '>' + this_gene + '\n' + '--------------------'
append(seq_str)
seq_str = '>' + seq_record.id + '\n' + str(seq_record.seq)
seq_record_seq_str = str(self.get_sequence_based_on_codon_positions(this_gene, seq_record.seq))
seq_str = '>' + seq_record.id + '\n' + seq_record_seq_str
append(seq_str)

return '\n'.join(fasta_str)
Expand Down Expand Up @@ -113,3 +114,68 @@ def get_taxon_names_for_taxa(self):
vouchers_with_taxon_names[code] = obj

return vouchers_with_taxon_names

def get_reading_frames(self):
    """Collect the reading frame of every gene listed in ``self.gene_codes``.

    Gene codes read from ``Genes`` are lower-cased before being compared
    against ``self.gene_codes``; genes that do not match are left out of
    the result.

    :return: dict of lower-cased gene_code: reading_frame.
    """
    rows = Genes.objects.all().values('gene_code', 'reading_frame')
    return {
        row['gene_code'].lower(): row['reading_frame']
        for row in rows
        if row['gene_code'].lower() in self.gene_codes
    }

def get_sequence_based_on_codon_positions(self, gene_code, seq):
    """Return the codon positions of ``seq`` requested by the user.

    Unless 'ALL' was requested, the sequence is first put in frame by
    deleting base pairs at the beginning of the sequence if the reading
    frame is not 1, and the requested codon positions are then extracted.

    Example:
        If reading frame is 2: ATGGGG becomes TGGGG. Then the sequence is
        processed to extract the codon positions requested by the user.

    :param gene_code: gene code; it is lower-cased before the reading frame
                      lookup in ``self.reading_frames``.
    :param seq: as BioPython seq object.
    :return: ``seq`` unchanged when 'ALL' is requested; otherwise the
             in-frame sequence reduced to the requested codon positions.
             Requesting '1st', '2nd' and '3rd' together returns the whole
             in-frame sequence.
    :raises ValueError: if none of 'ALL', '1st', '2nd', '3rd' is requested.
    :raises KeyError: if ``gene_code`` has no entry in
                      ``self.reading_frames``.
    """
    if 'ALL' in self.codon_positions:
        return seq

    # Keep the requested positions in codon order so that two positions are
    # always interleaved lower position first.
    requested = [
        position for position in ('1st', '2nd', '3rd')
        if position in self.codon_positions
    ]
    if not requested:
        # Previously this fell through and returned None, which the caller
        # wrote into the dataset as the literal text "None".
        raise ValueError(
            'No codon position requested: expected ALL, 1st, 2nd or 3rd.'
        )

    reading_frame = int(self.reading_frames[gene_code.lower()]) - 1
    seq = seq[reading_frame:]

    if len(requested) == 3:
        # All three codon positions together are the in-frame sequence.
        return seq

    # This is the BioPython way to get codon positions
    # http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc19
    offsets = {'1st': 0, '2nd': 1, '3rd': 2}
    extracted = [seq[offsets[position]::3] for position in requested]

    if len(extracted) == 1:
        return extracted[0]
    return chain_and_flatten(extracted[0], extracted[1])
2 changes: 1 addition & 1 deletion voseq/genbank_fasta/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,6 @@ def test_get_voucher_codes(self):

def test_get_voucher_codes_dropped(self):
    """Of three voucher codes, one prefixed with ``--``, two are expected
    back from ``get_voucher_codes``.
    """
    self.cleaned_data['voucher_codes'] = 'CP100-10\r\n--CP100-11\r\nCP100-12'
    expected = 2
    result = get_voucher_codes(self.cleaned_data)
    self.assertEqual(expected, len(result))

0 comments on commit 55bdb8f

Please sign in to comment.