# Pacotes usados

In [1]:
!pip install BioPython

Collecting BioPython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: BioPython
Successfully installed BioPython-1.83


In [2]:
import numpy as np

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Dados usados

In [3]:
# Absolute path of the input FASTA file (a Dengue virus type 1 sequence, per the file name).
# NOTE(review): the path lives under /content/drive, which presumably requires
# Google Drive to be mounted first — no mount step is visible here; confirm.
data_path = "/content/drive/MyDrive/AB/exercicio_5/data/Dengue virus type 1 strain Den1BR-90 sequence.fasta"

In [4]:
# Parse the FASTA file at data_path. Despite its name, dna_string is a record
# object: later cells read its .seq, .id and .name attributes.
dna_string = SeqIO.read(data_path, "fasta")

In [5]:
# Transcribe the record's sequence; the result is what the reading-frame cell
# below slices and translates.
dna_string_to_rna = dna_string.seq.transcribe()

# Separam-se os três frames

In [22]:
# Translate the transcribed sequence starting at offsets 0, 1 and 2 (the three
# reading frames of this strand) and keep each translation as a NumPy array.
protein_array_1, protein_array_2, protein_array_3 = (
    np.array(dna_string_to_rna[frame_offset:].translate())
    for frame_offset in range(3)
)

# Função que determina uma sequência potencial

In [8]:
def potencial_sequence(rna_array):
  """Extract candidate protein sequences from one translated reading frame.

  Every 'M' in ``rna_array`` is treated as a possible start and is paired with
  the first '*' found after it. The symbols from that 'M' up to, but not
  including, the '*' form one candidate. Several starts that share the same
  stop each produce their own (overlapping) candidate.

  Args:
    rna_array: one-dimensional array of single-character symbols in which
      'M' marks a start and '*' marks a stop.

  Returns:
    A list of NumPy arrays, one per 'M' that has a '*' somewhere after it,
    ordered by start position. A start with no later stop is skipped, so the
    result never contains empty candidates and an input without any '*'
    yields an empty list instead of raising IndexError.
  """
  rna_array = np.asarray(rna_array)
  initiator_array = np.flatnonzero(rna_array == 'M')
  terminator_array = np.flatnonzero(rna_array == '*')

  # For each start, the index (into terminator_array) of the first stop located
  # after it. Both position arrays are sorted, so a binary search replaces the
  # per-start linear rescan. The value equals len(terminator_array) when no
  # stop follows the start.
  next_stop_index = np.searchsorted(terminator_array, initiator_array)

  return [
    rna_array[start:terminator_array[stop_index]]
    for start, stop_index in zip(initiator_array, next_stop_index)
    if stop_index < len(terminator_array)
  ]

In [23]:
# Run the candidate-sequence search on each of the three translated frames.
(
  potencial_sequences_frame_1,
  potencial_sequences_frame_2,
  potencial_sequences_frame_3,
) = map(potencial_sequence, (protein_array_1, protein_array_2, protein_array_3))

# Função que cria os registros fasta

In [60]:
def potencial_sequences_to_fasta(potencial_sequences, id, name, description, frame):
  """Wrap each candidate sequence in a SeqRecord.

  Args:
    potencial_sequences: sequence of candidates; the symbols of each one are
      joined into a single string to build the record's Seq.
    id: identifier assigned to every record.
    name: name assigned to every record.
    description: text placed at the start of every record's description.
    frame: reading-frame label embedded in every record's description.

  Returns:
    A list with one SeqRecord per candidate, in input order. Each description
    also carries the candidate's 1-based position within the input.

  NOTE(review): all records receive the same ``id``; they differ only in
  their description.
  """
  return [
    SeqRecord(
      Seq("".join(residues)),
      id = id,
      name = name,
      description = f"{description}, frame: {frame}, proteína: {position}"
    )
    for position, residues in enumerate(potencial_sequences, start = 1)
  ]

In [61]:
# Metadata passed to potencial_sequences_to_fasta for all three frames; the
# identifier and name are taken from the parsed input record.
# NOTE(review): `id` shadows Python's built-in id() from this cell onwards.
id = dna_string.id
name = dna_string.name
# Free text that opens the description of each generated record.
description = "Código do Genoma da Dengue do GenBank"

In [62]:
# Build the record list of every frame (frames are numbered from 1) and keep
# them both grouped in one list and under their individual names.
all_3_potencial_sequences = [
  potencial_sequences_to_fasta(frame_sequences, id, name, description, frame = frame_number)
  for frame_number, frame_sequences in enumerate(
    (potencial_sequences_frame_1, potencial_sequences_frame_2, potencial_sequences_frame_3),
    start = 1,
  )
]

(
  potencial_sequences_fasta_1,
  potencial_sequences_fasta_2,
  potencial_sequences_fasta_3,
) = all_3_potencial_sequences

# Cria-se o arquivo com todas as potenciais sequências codificadoras de proteínas

In [68]:
# Write the records of all three frames, in frame order, to one FASTA file.
# The file is opened in write mode so that re-running this cell replaces the
# previous output instead of appending a duplicate copy of every record, and
# the `with` block closes the handle even if a write fails. UTF-8 is requested
# explicitly because the record descriptions contain non-ASCII text.
with open("/content/drive/MyDrive/AB/exercicio_5/data/sequencias_potenciais_codificadoras_de_proteina.fasta", "w", encoding = "utf-8") as fasta_file:
  for frame_records in all_3_potencial_sequences:
    SeqIO.write(frame_records, fasta_file, "fasta")