Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

get dataset divided by codon positions if needed #77

Merged
merged 24 commits into from
Feb 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9b20c04
separate codons by partition
carlosp420 Feb 10, 2015
7b668a5
fix Seq bug
carlosp420 Feb 10, 2015
e201bff
test dataset all codons as one
carlosp420 Feb 10, 2015
2cbdcda
test dataset all codons, 1st position as one
carlosp420 Feb 10, 2015
2fccaba
test dataset all codons, 1st 2nd position as one
carlosp420 Feb 10, 2015
ed6af5f
test dataset all codons, 1st 2nd 3rd position as one
carlosp420 Feb 10, 2015
5da79b7
test dataset all codons, partitions each
carlosp420 Feb 11, 2015
0de12fb
test dataset all codons + 1st, partitions each
carlosp420 Feb 11, 2015
4ffc954
test dataset all codons + 1st + 2nd, partitions each
carlosp420 Feb 11, 2015
a47ec36
test dataset all codons + 1st + 2nd + 3rd, partitions each
carlosp420 Feb 11, 2015
a21034f
test dataset 1st codon, one partition
carlosp420 Feb 11, 2015
3043089
test dataset 2nd codon, one partition
carlosp420 Feb 11, 2015
b954788
test dataset 3rd codon, one partition
carlosp420 Feb 11, 2015
f19f918
test dataset 1st 2nd codons, one partition
carlosp420 Feb 11, 2015
f52f8ca
test dataset 2nd, 3rd codons, one partition
carlosp420 Feb 11, 2015
38aa033
test dataset 1st, 3rd codons, one partition
carlosp420 Feb 11, 2015
fa620d6
test dataset 1st, each partition
carlosp420 Feb 12, 2015
2fed8bf
test dataset 2nd, each partition
carlosp420 Feb 12, 2015
66cb256
test dataset 3rd, each partition
carlosp420 Feb 12, 2015
a936459
test dataset 1st, 3rd, each partition
carlosp420 Feb 12, 2015
058f8f0
test dataset 2nd, 3rd, each position
carlosp420 Feb 12, 2015
90d0f07
test dataset 1st, 2nd, 3rd, each position
carlosp420 Feb 12, 2015
1b64516
test dataset ALL, 1st, 2nd, 3rd, each position
carlosp420 Feb 12, 2015
1b53c03
rm unused model imports
carlosp420 Feb 12, 2015
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
biopython==1.65
Django==1.7.4
pyprind==2.9.1
elasticsearch==1.3.0
elasticsearch==1.4
Unipath==1.0
psycopg2==2.6
dataset==0.5.5
Expand Down
3 changes: 0 additions & 3 deletions voseq/blast_local/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/blast_local_full/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/blast_ncbi/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/blast_new/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/core/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
3 changes: 0 additions & 3 deletions voseq/create_dataset/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.
524 changes: 518 additions & 6 deletions voseq/create_dataset/tests/tests_utils.py

Large diffs are not rendered by default.

139 changes: 121 additions & 18 deletions voseq/create_dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(self, cleaned_data):
self.errors = []
self.seq_objs = dict()
self.codon_positions = cleaned_data['positions']
self.partition_by_positions = cleaned_data['partition_by_positions']
self.cleaned_data = cleaned_data
self.voucher_codes = get_voucher_codes(cleaned_data)
self.gene_codes = get_gene_codes(cleaned_data)
Expand Down Expand Up @@ -72,21 +73,96 @@ def from_seq_objs_to_fasta(self):
another FASTA gene sequence.

"""
fasta_str = []
append = fasta_str.append
# These codons might not correspond to first, second and third codon positions
partitions = {
'all_codons': [],
'codon1': [],
'codon2': [],
'codon3': [],
}

length_partitions = None

for gene_code in self.seq_objs:
this_gene = None
for seq_record in self.seq_objs[gene_code]:
if this_gene is None:
this_gene = seq_record.name
seq_str = '>' + this_gene + '\n' + '--------------------'
append(seq_str)
seq_record_seq_str = str(self.get_sequence_based_on_codon_positions(this_gene, seq_record.seq))
seq_str = '>' + seq_record.id + '\n' + seq_record_seq_str
append(seq_str)

return '\n'.join(fasta_str)
partitions['all_codons'].append(seq_str)

seq_str = '>' + this_gene + '_1st_codon\n' + '--------------------'
partitions['codon1'].append(seq_str)
seq_str = '>' + this_gene + '_2nd_codon\n' + '--------------------'
partitions['codon2'].append(seq_str)
seq_str = '>' + this_gene + '_3rd_codon\n' + '--------------------'
partitions['codon3'].append(seq_str)

seq_record_seqs = self.get_sequence_based_on_codon_positions(this_gene, seq_record.seq)

# We have codon positions that go to one partition
if len(seq_record_seqs) == 1:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['all_codons'].append(seq_str)
length_partitions = 1

# We have two codon positions because they should go to different partitions
if len(seq_record_seqs) == 2:
if self.codon_positions == ['1st', '2nd']:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon1'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon2'].append(seq_str)
length_partitions = 2

if self.codon_positions == ['1st', '3rd']:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon1'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon3'].append(seq_str)
length_partitions = 2

if self.codon_positions == ['2nd', '3rd']:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon2'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon3'].append(seq_str)
length_partitions = 2

# We have three codon positions because they should go to different partitions
if len(seq_record_seqs) == 3:
seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[0])
partitions['codon1'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[1])
partitions['codon2'].append(seq_str)

seq_str = '>' + seq_record.id + '\n' + str(seq_record_seqs[2])
partitions['codon3'].append(seq_str)
length_partitions = 3

out = ''
if self.partition_by_positions == 'ONE':
out += '\n'.join(partitions['all_codons'])
return out

# We have codon positions that go to one partition
if length_partitions == 1:
out += '\n'.join(partitions['all_codons'])
return out

if len(partitions['codon1']) > len(self.gene_codes):
out += '\n'.join(partitions['codon1'])
if len(partitions['codon2']) > len(self.gene_codes):
out += '\n'
out += '\n'.join(partitions['codon2'])
if len(partitions['codon3']) > len(self.gene_codes):
out += '\n'
out += '\n'.join(partitions['codon3'])
return out

def get_taxon_names_for_taxa(self):
"""Returns dict: {'CP100-10': {'taxon': 'name'}}
Expand Down Expand Up @@ -130,19 +206,23 @@ def get_reading_frames(self):

def get_sequence_based_on_codon_positions(self, gene_code, seq):
"""Puts the sequence in frame, by deleting base pairs at the begining
of the sequence if the reading frame is not 1:
of the sequence if the reading frame is not 1.

Takes into account whether the codon positions should go in different
partitions.

:param gene_code: as lower case
:param seq: as BioPython seq object.
:return: sequence as Seq object with codon positions requested by user.
:return: tuple of Seq objects, depending on the number of partitions by codons
and the codon positions requested by user.

Example:
If reading frame is 2: ATGGGG becomes TGGGG. Then the sequence is
processed to extract the codon positions requested by the user.

"""
if 'ALL' in self.codon_positions:
return seq
if 'ALL' in self.codon_positions and self.partition_by_positions == 'ONE':
return seq,

reading_frame = int(self.reading_frames[gene_code.lower()]) - 1
seq = seq[reading_frame:]
Expand All @@ -153,29 +233,52 @@ def get_sequence_based_on_codon_positions(self, gene_code, seq):
second_position = seq[1::3]
third_position = seq[2::3]

# ALL overrides 1st, 2nd, 3rd codon positions. We should return all codons
if 'ALL' in self.codon_positions:
if self.partition_by_positions == 'ONE':
return (chain_and_flatten(first_position, second_position, third_position))
else:
return (first_position, second_position, third_position)

if '1st' in self.codon_positions \
and '2nd' not in self.codon_positions \
and '3rd' not in self.codon_positions:
return first_position
return first_position,

if '2nd' in self.codon_positions \
and '1st' not in self.codon_positions \
and '3rd' not in self.codon_positions:
return second_position
return second_position,

if '3rd' in self.codon_positions \
and '1st' not in self.codon_positions \
and '2nd' not in self.codon_positions:
return third_position
return third_position,

if '1st' in self.codon_positions and '2nd' in self.codon_positions \
and '3rd' not in self.codon_positions:
return chain_and_flatten(first_position, second_position)
if self.partition_by_positions == 'ONE':
return chain_and_flatten(first_position, second_position),
else:
return (first_position, second_position)

if '1st' in self.codon_positions and '3rd' in self.codon_positions \
and '2nd' not in self.codon_positions:
return chain_and_flatten(first_position, third_position)
if self.partition_by_positions == 'ONE':
return chain_and_flatten(first_position, third_position),
else:
return first_position, third_position

if '2nd' in self.codon_positions and '3rd' in self.codon_positions \
and '1st' not in self.codon_positions:
return chain_and_flatten(second_position, third_position)
if self.partition_by_positions == 'ONE':
return chain_and_flatten(second_position, third_position),
else:
return (second_position, third_position)

if '1st' in self.codon_positions and '2nd' in self.codon_positions \
and '3rd' in self.codon_positions:
if self.partition_by_positions == 'ONE':
return (chain_and_flatten(first_position, second_position, third_position))
else:
return (first_position, second_position, third_position)
3 changes: 0 additions & 3 deletions voseq/view_genes/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from django.db import models

# Create your models here.