Skip to content

Commit

Permalink
Adding 5'UTR and 3'UTR to output for degen_regions
Browse files Browse the repository at this point in the history
  • Loading branch information
necrolyte2 committed Dec 30, 2015
1 parent dc4f38e commit 0d40032
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 9 deletions.
19 changes: 16 additions & 3 deletions bio_bits/degen.py
Expand Up @@ -32,15 +32,28 @@

# Lightweight record for one annotated region: its display name plus the
# start and end coordinates taken from the feature location.
Gene = namedtuple('Gene', ('name', 'start', 'end'))

def gene_name(rec):
    '''
    Determine the correct gene name for a single genbank feature.

    The ``product`` qualifier is preferred; when it is missing (or is an
    empty list) the feature type itself is used as the name.

    :param rec: feature object exposing ``qualifiers`` (dict-like) and ``type``
    :return str name: 3'UTR, 5'UTR, product name of a mat_peptide, ...
    '''
    name = rec.qualifiers.get('product')
    if isinstance(name, list):
        # Qualifier values arrive as lists; an empty list carries no usable
        # name, so treat it the same as a missing qualifier.
        name = name[0] if name else None
    if name is None:
        return rec.type
    return name

def seqrecord_to_genes(rec):
    '''
    Convert the features of a genbank record into Gene tuples.

    Every feature whose type is not ``source`` or ``CDS`` is kept. For a
    record with features such as
        source(0..10452), 5'UTR(0..83), CDS(83..10262),
        mat_peptide(83..425), mat_peptide(425..923), ..., 3'UTR(10262..10452)
    the result contains the 5'UTR, each mat_peptide and the 3'UTR.

    :param Bio.SeqRecord rec: genbank record from SeqIO.parse format='genbank'
    :return iterable genes: lazy iterator of Gene(name, start, end), one per
        retained feature, in the order they appear in ``rec.features``
    '''
    # Don't include `CDS`, that's whole-genome polypeptide; `source` is
    # likewise not an individual gene region.
    EXCLUDE_GENE_TYPES = ('source', 'CDS')
    genes = (f for f in rec.features if f.type not in EXCLUDE_GENE_TYPES)
    starts_ends_names = (
        (gene_name(f), int(f.location.start), int(f.location.end))
        for f in genes
    )
    return starmap(Gene, starts_ends_names)

def fetch_record_by_id(_id):
Expand Down
6 changes: 3 additions & 3 deletions tests/ctleptop.robot
Expand Up @@ -27,7 +27,7 @@ Expected Output Genbank File
File Should Not Be Empty ${actual}
${actual_contents} = Get File ${actual}
${expected_contents} = Get File ${expected}
Should Be Equal As Strings ${actual_contents} ${expected_contents}
Should Be Equal As Strings ${expected_contents} ${actual_contents}

Expected Output Genbank Accession
${process_result} = Run Process degen_regions -i ${in_fasta} -o ${ACTUAL} --gb-id ${in_genbank_id}
Expand All @@ -43,7 +43,7 @@ Expected Output Genbank Accession
File Should Not Be Empty ${actual}
${actual_contents} = Get File ${actual}
${expected_contents} = Get File ${expected}
Should Be Equal As Strings ${actual_contents} ${expected_contents}
Should Be Equal As Strings ${expected_contents} ${actual_contents}

Expected Output Tab File
${process_result} = Run Process degen_regions -i ${in_fasta} -o ${ACTUAL} --tab-file ${in_annotation_tab}
Expand All @@ -59,4 +59,4 @@ Expected Output Tab File
File Should Not Be Empty ${actual}
${actual_contents} = Get File ${actual}
${expected_contents} = Get File ${expected}
Should Be Equal As Strings ${actual_contents} ${expected_contents}
Should Be Equal As Strings ${expected_contents} ${actual_contents}
2 changes: 2 additions & 0 deletions tests/testinput/KJ189367.annotation.csv
@@ -1,4 +1,5 @@
name,start,stop
5'UTR,1,83
anchored capsid protein,84,425
membrane glycoprotein precursor,426,923
envelope protein,924,2408
Expand All @@ -10,3 +11,4 @@ nonstructural protein NS4A,6366,6746
2K peptide,6747,6815
nonstructural protein NS4B,6816,7562
nonstructural protein NS5,7563,10259
3'UTR,10263,10452
6 changes: 3 additions & 3 deletions tests/testinput/ctl_expected.tsv
Expand Up @@ -124,13 +124,13 @@ seq id nt Position aa position nt comp
1793_Den4/AY618992_1/Thailand/2001/Den4_1 10015 3339 RTT I/V nonstructural protein NS5
1793_Den4/AY618992_1/Thailand/2001/Den4_1 10087 3363 NGR GAPFOUND nonstructural protein NS5
1793_Den4/AY618992_1/Thailand/2001/Den4_1 10085 3362 NGR GAPFOUND nonstructural protein NS5
1901_Den4/AY618992_1/Thailand/2001/Den4_1 15 6 AAN GAPFOUND -
1901_Den4/AY618992_1/Thailand/2001/Den4_1 15 6 AAN GAPFOUND 5'UTR
1901_Den4/AY618992_1/Thailand/2001/Den4_1 111 38 TTN GAPFOUND anchored capsid protein
1901_Den4/AY618992_1/Thailand/2001/Den4_1 2279 760 GYT A/V envelope protein
1901_Den4/AY618992_1/Thailand/2001/Den4_1 8798 2933 ARA K/R nonstructural protein NS5
1901_Den4/AY618992_1/Thailand/2001/Den4_1 10195 3399 RAG E/K nonstructural protein NS5
1901_Den4/AY618992_1/Thailand/2001/Den4_1 10366 3456 RGG G/R -
1934_Den4/AY618992_1/Thailand/2001/Den4_1 15 6 AAN GAPFOUND -
1901_Den4/AY618992_1/Thailand/2001/Den4_1 10366 3456 RGG G/R 3'UTR
1934_Den4/AY618992_1/Thailand/2001/Den4_1 15 6 AAN GAPFOUND 5'UTR
1934_Den4/AY618992_1/Thailand/2001/Den4_1 111 38 TTN GAPFOUND anchored capsid protein
1934_Den4/AY618992_1/Thailand/2001/Den4_1 998 333 GMT A/D envelope protein
1934_Den4/AY618992_1/Thailand/2001/Den4_1 4515 1506 TTM F/L nonstructural protein NS3
Expand Down

0 comments on commit 0d40032

Please sign in to comment.