Skip to content

Commit

Permalink
get seq objects based on codon positions
Browse files Browse the repository at this point in the history
  • Loading branch information
carlosp420 committed Feb 4, 2015
1 parent 90956f6 commit 845efb1
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 1 deletion.
20 changes: 20 additions & 0 deletions voseq/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import itertools
import json
import re

from django.conf import settings

from Bio.Seq import Seq

from stats.models import Stats


Expand Down Expand Up @@ -179,3 +182,20 @@ def flatten_taxon_names_dict(dictionary):
out_striped = re.sub('_+', '_', out)
out_clean = re.sub('_$', '', out_striped)
return out_clean


def chain_and_flatten(seq1, seq2):
"""Takes seq objects which only contain certain codon positions.
Combines the two seq objects and returns another seq object.
"""
out = []
append = out.append

my_chain = itertools.zip_longest(seq1, seq2)
for i in itertools.chain.from_iterable(my_chain):
if i is not None:
append(i)

return Seq(''.join(out))
14 changes: 13 additions & 1 deletion voseq/create_dataset/tests/tests_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from Bio.Seq import Seq

from django.test import TestCase
from django.test.client import Client
from django.core.management import call_command
Expand All @@ -20,7 +22,8 @@ def setUp(self):
'taxonset': None,
'voucher_codes': 'CP100-10\r\nCP100-11',
'geneset': None,
'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES']
'taxon_names': ['CODE', 'SUPERFAMILY', 'GENUS', 'SPECIES'],
'positions': ['ALL'],
}

self.c = Client()
Expand Down Expand Up @@ -50,3 +53,12 @@ def test_get_taxon_names_for_taxa_additional_fields(self):
result = dataset_creator.get_taxon_names_for_taxa()

self.assertEqual(expected, result)

def test_get_sequence_based_on_codon_positions(self):
self.cleaned_data['positions'] = ['1st']
self.cleaned_data['gene_codes'] = [Genes.objects.get(gene_code='wingless')]
dataset_creator = CreateDataset(self.cleaned_data)
expected = Seq("CGGTGATAAAGCTATATGGAGACAAGATGAG")
sequence = Seq("ACACGTCGACTCCGGCAAGTCCACCACCACCGGTCACTTGATTTACAAATGTGGTGGTATCGACAaACGTACCATCGAGAAGTTCGAGAAGGA")
result = dataset_creator.get_sequence_based_on_codon_positions('wingless', sequence)
self.assertEqual(expected, result)
69 changes: 69 additions & 0 deletions voseq/create_dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from core.utils import get_voucher_codes
from core.utils import get_gene_codes
from core.utils import flatten_taxon_names_dict
from core.utils import chain_and_flatten
from public_interface.models import Genes
from public_interface.models import Sequences
from public_interface.models import Vouchers

Expand All @@ -27,6 +29,8 @@ def __init__(self, cleaned_data):
self.gene_codes = get_gene_codes(cleaned_data)
self.taxon_names = cleaned_data['taxon_names']
self.dataset_str = self.create_dataset()
self.codon_positions = cleaned_data['positions']
self.reading_frames = self.get_reading_frames()

def create_dataset(self):
self.voucher_codes = get_voucher_codes(self.cleaned_data)
Expand Down Expand Up @@ -113,3 +117,68 @@ def get_taxon_names_for_taxa(self):
vouchers_with_taxon_names[code] = obj

return vouchers_with_taxon_names

def get_reading_frames(self):
"""
:return: dict of gene_code: reading_frame. If not found, flag warning.
"""
reading_frames = dict()
genes = Genes.objects.all().values('gene_code', 'reading_frame')
for gene in genes:
gene_code = gene['gene_code'].lower()
if gene_code in self.gene_codes:
reading_frames[gene_code] = gene['reading_frame']
return reading_frames

def get_sequence_based_on_codon_positions(self, gene_code, seq):
"""Puts the sequence in frame, by deleting base pairs at the begining
of the sequence if the reading frame is not 1:
:param gene_code: as lower case
:param seq: as BioPython seq object.
:return: sequence as string with codon positions removed, if needed.
Example:
If reading frame is 2: ATGGGG becomes TGGGG. Then the sequence is
processed to extract the codon positions requested by the user.
"""
if 'ALL' in self.codon_positions:
return seq

reading_frame = int(self.reading_frames[gene_code.lower()]) - 1
seq = seq[reading_frame:]

# This is the BioPython way to get codon positions
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc19
first_position = seq[0::3]
second_position = seq[1::3]
third_position = seq[2::3]

if '1st' in self.codon_positions \
and '2nd' not in self.codon_positions \
and '3rd' not in self.codon_positions:
return first_position

if '2nd' in self.codon_positions \
and '1st' not in self.codon_positions \
and '3rd' not in self.codon_positions:
return second_position

if '3rd' in self.codon_positions \
and '1st' not in self.codon_positions \
and '2nd' not in self.codon_positions:
return third_position

if '1st' in self.codon_positions and '2nd' in self.codon_positions \
and '3rd' not in self.codon_positions:
return chain_and_flatten(first_position, second_position)

if '1st' in self.codon_positions and '3rd' in self.codon_positions \
and '2nd' not in self.codon_positions:
return chain_and_flatten(first_position, third_position)

if '2nd' in self.codon_positions and '3rd' in self.codon_positions \
and '1st' not in self.codon_positions:
return chain_and_flatten(second_position, third_position)

0 comments on commit 845efb1

Please sign in to comment.