Skip to content

Commit

Permalink
Merge pull request #77 from carlosp420/partitions
Browse files Browse the repository at this point in the history
get dataset divided by codon positions if needed
  • Loading branch information
carlosp420 committed Feb 12, 2015
2 parents ba896f7 + 1b53c03 commit 458125f
Show file tree
Hide file tree
Showing 10 changed files with 640 additions and 46 deletions.
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
biopython==1.65
Django==1.7.4
pyprind==2.9.1
elasticsearch==1.3.0
elasticsearch==1.4
Unipath==1.0
psycopg2==2.6
dataset==0.5.5
Expand Down
3 changes: 0 additions & 3 deletions voseq/blast_local/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/blast_local_full/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/blast_ncbi/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/blast_new/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/core/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/create_dataset/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
524 changes: 518 additions & 6 deletions voseq/create_dataset/tests/tests_utils.py

Large diffs are not rendered by default.

139 changes: 121 additions & 18 deletions voseq/create_dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(self, cleaned_data):
self.errors = []
self.seq_objs = dict()
self.codon_positions = cleaned_data['positions']
self.partition_by_positions = cleaned_data['partition_by_positions']
self.cleaned_data = cleaned_data
self.voucher_codes = get_voucher_codes(cleaned_data)
self.gene_codes = get_gene_codes(cleaned_data)
Expand Down Expand Up @@ -72,21 +73,96 @@ def from_seq_objs_to_fasta(self):
another FASTA gene sequence.
"""
fasta_str = []
append = fasta_str.append
# This codons might not correspond to first, second and third codon positions
partitions = {
'all_codons': [],
'codon1': [],
'codon2': [],
'codon3': [],
}

length_partitions = None

for gene_code in self.seq_objs:
this_gene = None
for seq_record in self.seq_objs[gene_code]:
if this_gene is None:
this_gene = seq_record.name
seq_str = '>' + this_gene + '\n' + '--------------------'
append(seq_str)
seq_record_seq_str = str(self.get_sequence_based_on_codon_positions(this_gene, seq_record.seq))
seq_str = '>' + seq_record.id + '\n' + seq_record_seq_str
append(seq_str)

return '\n'.join(fasta_str)
partitions['all_codons'].append(seq_str)

seq_str = '>' + this_gene + '_1st_codon\n' + '--------------------'
partitions['codon1'].append(seq_str)
seq_str = '>' + this_gene + '_2nd_codon\n' + '--------------------'
partitions['codon2'].append(seq_str)
seq_str = '>' + this_gene + '_3rd_codon\n' + '--------------------'
partitions['codon3'].append(seq_str)

seq_record_seqs = self.get_sequence_based_on_codon_positions(this_gene, seq_record.seq)

# We have codon positions that go to one partition
if len(seq_record_seqs) == 1:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['all_codons'].append(seq_str)
length_partitions = 1

# We have two codon positions because they should go to different partitions
if len(seq_record_seqs) == 2:
if self.codon_positions == ['1st', '2nd']:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon1'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon2'].append(seq_str)
length_partitions = 2

if self.codon_positions == ['1st', '3rd']:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon1'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon3'].append(seq_str)
length_partitions = 2

if self.codon_positions == ['2nd', '3rd']:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon2'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon3'].append(seq_str)
length_partitions = 2

# We have three codon positions because they should go to different partitions
if len(seq_record_seqs) == 3:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon1'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon2'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[2])
partitions['codon3'].append(seq_str)
length_partitions = 3

out = ''
if self.partition_by_positions == 'ONE':
out += '\n'.join(partitions['all_codons'])
return out

# We have codon positions that go to one partition
if length_partitions == 1:
out += '\n'.join(partitions['all_codons'])
return out

if len(partitions['codon1']) > len(self.gene_codes):
out += '\n'.join(partitions['codon1'])
if len(partitions['codon2']) > len(self.gene_codes):
out += '\n'
out += '\n'.join(partitions['codon2'])
if len(partitions['codon3']) > len(self.gene_codes):
out += '\n'
out += '\n'.join(partitions['codon3'])
return out

def get_taxon_names_for_taxa(self):
"""Returns dict: {'CP100-10': {'taxon': 'name'}}
Expand Down Expand Up @@ -130,19 +206,23 @@ def get_reading_frames(self):

def get_sequence_based_on_codon_positions(self, gene_code, seq):
"""Puts the sequence in frame, by deleting base pairs at the begining
of the sequence if the reading frame is not 1:
of the sequence if the reading frame is not 1.
Takes into account whether the codon positions should go in different
partitions.
:param gene_code: as lower case
:param seq: as BioPython seq object.
:return: sequence as Seq object with codon positions requested by user.
:return: tuples of Seq objects depending of number of paritions_by_condons.
and codon positions as requested by user.
Example:
If reading frame is 2: ATGGGG becomes TGGGG. Then the sequence is
processed to extract the codon positions requested by the user.
"""
if 'ALL' in self.codon_positions:
return seq
if 'ALL' in self.codon_positions and self.partition_by_positions == 'ONE':
return seq,

reading_frame = int(self.reading_frames[gene_code.lower()]) - 1
seq = seq[reading_frame:]
Expand All @@ -153,29 +233,52 @@ def get_sequence_based_on_codon_positions(self, gene_code, seq):
second_position = seq[1::3]
third_position = seq[2::3]

# ALL overrides 1st, 2nd, 3rd codon positions. We should return all codons
if 'ALL' in self.codon_positions:
if self.partition_by_positions == 'ONE':
return (chain_and_flatten(first_position, second_position, third_position))
else:
return (first_position, second_position, third_position)

if '1st' in self.codon_positions \
and '2nd' not in self.codon_positions \
and '3rd' not in self.codon_positions:
return first_position
return first_position,

if '2nd' in self.codon_positions \
and '1st' not in self.codon_positions \
and '3rd' not in self.codon_positions:
return second_position
return second_position,

if '3rd' in self.codon_positions \
and '1st' not in self.codon_positions \
and '2nd' not in self.codon_positions:
return third_position
return third_position,

if '1st' in self.codon_positions and '2nd' in self.codon_positions \
and '3rd' not in self.codon_positions:
return chain_and_flatten(first_position, second_position)
if self.partition_by_positions == 'ONE':
return chain_and_flatten(first_position, second_position),
else:
return (first_position, second_position)

if '1st' in self.codon_positions and '3rd' in self.codon_positions \
and '2nd' not in self.codon_positions:
return chain_and_flatten(first_position, third_position)
if self.partition_by_positions == 'ONE':
return chain_and_flatten(first_position, third_position),
else:
return first_position, third_position

if '2nd' in self.codon_positions and '3rd' in self.codon_positions \
and '1st' not in self.codon_positions:
return chain_and_flatten(second_position, third_position)
if self.partition_by_positions == 'ONE':
return chain_and_flatten(second_position, third_position),
else:
return (second_position, third_position)

if '1st' in self.codon_positions and '2nd' in self.codon_positions \
and '3rd' in self.codon_positions:
if self.partition_by_positions == 'ONE':
return (chain_and_flatten(first_position, second_position, third_position))
else:
return (first_position, second_position, third_position)
3 changes: 0 additions & 3 deletions voseq/view_genes/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.

0 comments on commit 458125f

Please sign in to comment.