In [1]:
from Bio import SeqIO
from collections import Counter
from orffinder import orffinder

In [2]:
def repeat_sequence(sequence, repeat_length):
    """Return every overlapping window of ``repeat_length`` items in ``sequence``.

    Args:
        sequence: Any object supporting ``len()`` and slicing (e.g. ``str``
            or a Biopython ``Seq``).
        repeat_length: Size of each window; expected to be a positive integer.

    Returns:
        A list of slices ``sequence[i:i + repeat_length]`` in left-to-right
        order. The list is empty when ``sequence`` is shorter than
        ``repeat_length``.
    """
    # "+ 1" so the final window (the one ending on the last item) is included;
    # without it a sequence of exactly ``repeat_length`` items yields nothing.
    window_count = len(sequence) - repeat_length + 1
    return [sequence[start:start + repeat_length] for start in range(window_count)]

In [3]:
# create base list to store info about each sequence
seq_info = []
# every fixed-length window from every record, pooled for counting below
repeat_sum = []
repeat_length = 12

# parse the fasta file
count = 0
for seq_record in SeqIO.parse('dna.example.fasta', 'fasta'):
    count += 1

    # create dict for each sequence: identifier, sequence, its length and
    # whatever orffinder.getORFs reports for the record
    fasta_entry = {'id': seq_record.id, 'seq': seq_record.seq,
                   'seq_length': len(seq_record), 'orfs': orffinder.getORFs(seq_record)}
    seq_info.append(fasta_entry)

    # add this record's windows to the pooled list in one call
    repeat_sum.extend(repeat_sequence(seq_record.seq, repeat_length))

# five most frequent windows across all records, then the record count
print(Counter(repeat_sum).most_common(5))
print(count)

[(Seq('CGCGGTCGAGCG'), 3), (Seq('GCGCTGCGCGGC'), 3), (Seq('CGCTGCGCGGCG'), 3), (Seq('GGCCGCGATCCG'), 3), (Seq('CGATCGCGAGCG'), 3)]
25


In [4]:
# seq_info

In [5]:
# longest sequence using list of dict method
# NOTE: despite the name, max_seq_length is the whole record dict of the
# longest entry in seq_info (first one on ties), not just a number.
max_seq_length = max(seq_info, key=lambda record: record['seq_length'])
max_seq_length

{'id': 'gi|142022655|gb|EQ086233.1|323',
 'seq': Seq('ACGCCCGGCGCACCGCGAGTACCGCGCCGCCGGGCACTCCTTGACCCCGCATGA...CGC'),
 'seq_length': 4805,
 'orfs': [{'start': 3228,
   'end': 1389,
   'frame': 1,
   'sense': '-',
   'length': 1839,
   'trailing': False,
   'index': 1},
  {'start': 2824,
   'end': 4510,
   'frame': 1,
   'sense': '+',
   'length': 1686,
   'trailing': False,
   'index': 2},
  {'start': 1476,
   'end': 3072,
   'frame': 3,
   'sense': '+',
   'length': 1596,
   'trailing': False,
   'index': 3},
  {'start': 97,
   'end': 1570,
   'frame': 1,
   'sense': '+',
   'length': 1473,
   'trailing': False,
   'index': 4},
  {'start': 65,
   'end': 1436,
   'frame': 2,
   'sense': '+',
   'length': 1371,
   'trailing': False,
   'index': 5},
  {'start': 3068,
   'end': 3770,
   'frame': 2,
   'sense': '+',
   'length': 702,
   'trailing': False,
   'index': 6},
  {'start': 673,
   'end': 1,
   'frame': 3,
   'sense': '-',
   'length': 671,
   'trailing': True,
   'index': 7},
  {

In [6]:
# shortest sequence using list of dict method
# NOTE: despite the name, min_seq_length is the whole record dict of the
# shortest entry in seq_info (first one on ties), not just a number.
min_seq_length = min(seq_info, key=lambda record: record['seq_length'])
min_seq_length

{'id': 'gi|142022655|gb|EQ086233.1|521',
 'seq': Seq('CGTTGTTCGCCAGGTCGTCCGCATAGCCGGCCGAGCTGAACTGCGTGACATACG...TCG'),
 'seq_length': 512,
 'orfs': [{'start': 359,
   'end': 1,
   'frame': 2,
   'sense': '-',
   'length': 357,
   'trailing': True,
   'index': 1},
  {'start': 353,
   'end': 1,
   'frame': 2,
   'sense': '-',
   'length': 351,
   'trailing': True,
   'index': 2},
  {'start': 126,
   'end': 285,
   'frame': 3,
   'sense': '+',
   'length': 159,
   'trailing': False,
   'index': 3},
  {'start': 128,
   'end': 1,
   'frame': 2,
   'sense': '-',
   'length': 126,
   'trailing': True,
   'index': 4}]}

In [7]:
# find all max and min sequence lengths
# max_seq_length / min_seq_length are the record dicts selected with
# max()/min() over seq_info; only their 'seq_length' values are needed here.
longest_length = max_seq_length['seq_length']
shortest_length = min_seq_length['seq_length']

# The two filters are independent on purpose: when the longest and shortest
# lengths coincide (e.g. a single record), a record belongs in BOTH lists.
all_max = [sequence for sequence in seq_info
           if sequence['seq_length'] == longest_length]
all_min = [sequence for sequence in seq_info
           if sequence['seq_length'] == shortest_length]

In [8]:
# display every record whose length equals the maximum sequence length
all_max

[{'id': 'gi|142022655|gb|EQ086233.1|323',
  'seq': Seq('ACGCCCGGCGCACCGCGAGTACCGCGCCGCCGGGCACTCCTTGACCCCGCATGA...CGC'),
  'seq_length': 4805,
  'orfs': [{'start': 3228,
    'end': 1389,
    'frame': 1,
    'sense': '-',
    'length': 1839,
    'trailing': False,
    'index': 1},
   {'start': 2824,
    'end': 4510,
    'frame': 1,
    'sense': '+',
    'length': 1686,
    'trailing': False,
    'index': 2},
   {'start': 1476,
    'end': 3072,
    'frame': 3,
    'sense': '+',
    'length': 1596,
    'trailing': False,
    'index': 3},
   {'start': 97,
    'end': 1570,
    'frame': 1,
    'sense': '+',
    'length': 1473,
    'trailing': False,
    'index': 4},
   {'start': 65,
    'end': 1436,
    'frame': 2,
    'sense': '+',
    'length': 1371,
    'trailing': False,
    'index': 5},
   {'start': 3068,
    'end': 3770,
    'frame': 2,
    'sense': '+',
    'length': 702,
    'trailing': False,
    'index': 6},
   {'start': 673,
    'end': 1,
    'frame': 3,
    'sense': '-',
    'leng

In [9]:
# display every record whose length equals the minimum sequence length
all_min

[{'id': 'gi|142022655|gb|EQ086233.1|521',
  'seq': Seq('CGTTGTTCGCCAGGTCGTCCGCATAGCCGGCCGAGCTGAACTGCGTGACATACG...TCG'),
  'seq_length': 512,
  'orfs': [{'start': 359,
    'end': 1,
    'frame': 2,
    'sense': '-',
    'length': 357,
    'trailing': True,
    'index': 1},
   {'start': 353,
    'end': 1,
    'frame': 2,
    'sense': '-',
    'length': 351,
    'trailing': True,
    'index': 2},
   {'start': 126,
    'end': 285,
    'frame': 3,
    'sense': '+',
    'length': 159,
    'trailing': False,
    'index': 3},
   {'start': 128,
    'end': 1,
    'frame': 2,
    'sense': '-',
    'length': 126,
    'trailing': True,
    'index': 4}]}]

In [10]:
# grab the ORF list of the first parsed record for ad-hoc inspection
test = seq_info[0]['orfs']

In [11]:
# find longest ORF in specific frame
TARGET_FRAME = 2  # value compared against each ORF's 'frame' field

orf_max = 0
# start position of the longest matching ORF; stays None when no ORF in
# TARGET_FRAME exists, so the print below cannot hit an unbound name
orf_starts = None

# iterate the records directly instead of indexing with a manual counter
# (the old loop variable also shadowed the builtin `id`)
for record in seq_info:
    for orf in record['orfs']:
        # strict ">" keeps the first ORF encountered when lengths tie
        if orf['frame'] == TARGET_FRAME and orf['length'] > orf_max:
            orf_max = orf['length']
            orf_starts = orf['start']

print(orf_max)
print(orf_starts)

1540
1542


In [12]:
# find longest ORF in forward frame (ORFs whose 'sense' is '+')
sense_orf_max = 0

# iterate the records directly instead of indexing with a manual counter
# (the old loop variable also shadowed the builtin `id`)
for record in seq_info:
    for orf in record['orfs']:
        if orf['sense'] == '+' and orf['length'] > sense_orf_max:
            sense_orf_max = orf['length']

print(sense_orf_max)

1686
