# Bioinformatika Lab. 1

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from math import ceil
from Bio.Alphabet import IUPAC
from Orf import Orf
from Bio.Alphabet import generic_dna, generic_rna

In [2]:
rec = SeqIO.read("plazmide.fasta", "fasta")
sequence = rec.seq

In [3]:
print(rec)

ID: plazmide
Name: plazmide
Description: plazmide
Number of features: 0
Seq('TTGCGCCGGCGAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATC...CCT', SingleLetterAlphabet())


In [4]:
start = ["ATG"]
stop = ["TAA", "TAG", "TGA"]
convert = {"C" : "G", "G" : "C", "T" : "A", "A" : "T"}

In [5]:
def complement(data):
    compl = []
    for var in data:
        compl.append(convert[var])
    return compl

In [6]:
def reverse(data):
    return "".join(list(reversed(data)))

In [32]:
def findOrfs(dna, st, revert=False):
    frame = "-" if revert else ""
    frame += str(st)
    orfstart = -1
    orfend = -1
    orfs = []
    data = reverse(complement(dna)) if revert else dna
    for index in range(st, len(data), 3):
        codon = data[index:index+3]
        last_start = 0
        if codon in start:
            #print("Start: {} at {}".format(codon, index))
            if orfstart == -1:
                orfstart = index
        if codon in stop:
            #print("Stop: {} at {}".format(codon, index))
            orfend = index + 3
            if orfend > orfstart and orfstart != -1:
                value = data[orfstart:orfend]
                if orfend - orfstart > 75:
                    orfs.append(Orf(orfstart, orfend, frame, value, (orfend - orfstart)))
                orfstart = -1
    return orfs

In [37]:
def getAllOrfs(dna):
    orfs = []
    orfs.extend(findOrfs(dna, 0))
    orfs.extend(findOrfs(dna, 1))
    orfs.extend(findOrfs(dna, 2))
    orfs.extend(findOrfs(dna, 0, True))
    orfs.extend(findOrfs(dna, 1, True))
    orfs.extend(findOrfs(dna, 2, True))
    return orfs    

In [42]:
orfs = getAllOrfs(sequence) # mind that indexes are wrong for reversed sequences
orfs.sort(key=lambda x: x.length, reverse=False)
print("Number of orfs: {}".format(len(orfs)))
for o in orfs:
    print(o)

Number of orfs: 38
ORF <Start: 4012, Stop: 4093, Length: 81, Frame: -1>
ORF <Start: 5120, Stop: 5204, Length: 84, Frame: 2>
ORF <Start: 3249, Stop: 3333, Length: 84, Frame: -0>
ORF <Start: 4825, Stop: 4912, Length: 87, Frame: -1>
ORF <Start: 2003, Stop: 2090, Length: 87, Frame: -2>
ORF <Start: 3203, Stop: 3293, Length: 90, Frame: -2>
ORF <Start: 1112, Stop: 1205, Length: 93, Frame: 2>
ORF <Start: 3301, Stop: 3394, Length: 93, Frame: -1>
ORF <Start: 3806, Stop: 3902, Length: 96, Frame: -2>
ORF <Start: 291, Stop: 393, Length: 102, Frame: -0>
ORF <Start: 4133, Stop: 4238, Length: 105, Frame: -2>
ORF <Start: 3397, Stop: 3505, Length: 108, Frame: -1>
ORF <Start: 4459, Stop: 4567, Length: 108, Frame: -1>
ORF <Start: 2261, Stop: 2372, Length: 111, Frame: 2>
ORF <Start: 5169, Stop: 5280, Length: 111, Frame: -0>
ORF <Start: 1783, Stop: 1897, Length: 114, Frame: -1>
ORF <Start: 4102, Stop: 4219, Length: 117, Frame: -1>
ORF <Start: 1618, Stop: 1738, Length: 120, Frame: -1>
ORF <Start: 2519, Stop:

In [10]:
def toRNAs(orfs):
    rnas = []
    for orf in orfs:
        rnas.append(toRNA(orf.value))
    return rnas

In [11]:
def toRNA(dna):
    conv = {"T" : "A", "A" : "U", "C" : "G", "G" : "C"}
    rna = ""
    for i in dna:
        rna += str(conv[i])
    return rna

In [12]:
def toProteins(rnas):
    proteins = []
    for rna in rnas:
        r = Seq(rna, generic_rna)
        proteins.append(r.translate())
    return proteins

In [13]:
rnas = toRNAs(orfs)

In [14]:
proteins = toProteins(rnas)

In [15]:
for p in proteins:
    print(p)

YYRGPSLS*VPPLTLWSLQYATASHTATENSLAKGAPLGPVGAKTLLRPFSPSPLPPRLNVRVGAPCC*PPVCQQRLTATVEVRPGRARQRLTAPLI*SAAS*PTVAPPQLPSCFAAASDISPPRVRRARCAVTRLVIDRRPTGPTVTTPSTDVITRPQ*RTTETGLWVVVIIKEGTSAMR*PAPRRPA*PSGRLARQSPG*FKTEPRRRRPTDRIYRVSVSLSRLSPCPSADLTVQAKSCLVRLRLT
YDDRWDTLWM*T*LLRDRNWDSLKRDQGGVGMAVNKWECCKVI
YK**SLGIALVGESKVAIVMGVLVFRGNVPP*SLVCPFLAGIVPGEIVFGL*LRRPL*VARPAPTCPSVDT*RSAGATTRNGVDGARKATTATFGDCVRRGPLPVSNRHSPTALVCSGSPAQSPTTAHSPASVLGQCIAI
YTRFPVVFRSLAFFRRNDRKKVSEAGGLLVVFLAASSVSTALGCPDISMVRKGGPSREHARGQGWDGEWPMDRRKEGSPSHRERVSSATSIESSHIQQARFDPTHVLGGQVGLATRNRPLIAELRLGHSVLNSGDRRR*PLS*SSRSIHPPRCLKNFTTGLMPM*SSCHKP*TRDDFGQWKPFSQPSRTRPFVWWRPSPPKKQTFVV*CASFFPRVLLGN*KRCPRLRVTLLLSAI
YSNSFS*KWI*ENLIFTSKFS*ISYILI*TRLSMVTN*SLRG*SR*TDKASRYQRTEGQHI
YIG*ARGLTRSRRK*KWSQRPTRFCPSVLRRFFPYSRCAFTTYEYEKEKVIIT
YRKNSS*EFNTRLLLNSSHGKARAKGRPPRGHYAE*YPLNTRLLLRGRSLLKQI
YQ*LIGGHTLTIGETQSLN*HYLMA*FRIATIRQFVLFLGSYWGTPEI
YVDRAVQRADLSPVIKA*T*RPTTTDGTPCGCRHNCFATVTGT
YRAK*SSVCNCEDLFELLDLRLLVRL*TLSEVLVRLLEMASTERAKPLLPLLET
YAVVSSNMTL