In [173]:
# RNA codon -> one-letter amino acid code. '*' marks a stop codon.
# Rows are grouped by the first two bases; the third base runs A, C, G, U.
codones_aminoacids_dict = {
  'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAU': 'N',
  'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACU': 'T',
  'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGU': 'S',
  'AUA': 'I', 'AUC': 'I', 'AUG': 'M', 'AUU': 'I',
  'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAU': 'H',
  'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCU': 'P',
  'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGU': 'R',
  'CUA': 'L', 'CUC': 'L', 'CUG': 'L', 'CUU': 'L',
  'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAU': 'D',
  'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCU': 'A',
  'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGU': 'G',
  'GUA': 'V', 'GUC': 'V', 'GUG': 'V', 'GUU': 'V',
  'UAA': '*', 'UAC': 'Y', 'UAG': '*', 'UAU': 'Y',
  'UCA': 'S', 'UCC': 'S', 'UCG': 'S', 'UCU': 'S',
  'UGA': '*', 'UGC': 'C', 'UGG': 'W', 'UGU': 'C',
  'UUA': 'L', 'UUC': 'F', 'UUG': 'L', 'UUU': 'F',
}

In [174]:
def DnaToRna(pattern):
  """Transcribe a DNA string into RNA.

  Every 'T' in ``pattern`` becomes 'U'; all other characters are passed
  through unchanged.
  """
  thymine_to_uracil = str.maketrans('T', 'U')
  return pattern.translate(thymine_to_uracil)

In [175]:
def ReverseComplement(pattern):
  """Return the reverse complement of the DNA string ``pattern``.

  Each base is replaced by ComplementaryBase(base), then the result is
  reversed so that the complementary strand reads 5' to 3' rather than
  3' to 5'.
  """
  complemented = ''.join(ComplementaryBase(base) for base in pattern)
  return complemented[::-1]

In [176]:
def ComplementaryBase(base):
  """Return the complementary DNA base of ``base``.

  Args:
    base: A single uppercase DNA base: 'A', 'C', 'G' or 'T'.

  Returns:
    'T' for 'A', 'G' for 'C', 'C' for 'G' and 'A' for 'T'.

  Raises:
    ValueError: If ``base`` is not one of the four bases above. Unknown
      characters are rejected rather than silently mapped to 'A', which
      would corrupt the complement without any visible error.
  """
  complements = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
  try:
    return complements[base]
  except KeyError:
    raise ValueError('Not a DNA base (expected A, C, G or T): %r' % (base,)) from None

In [177]:
def NonOverlappingAminoacidsSequencesEncodingDnaCodones(dna_string, peptide):
  non_overlapping_codones_sequences = []
  i = 0
  while i + (len(peptide)*3-1) <= len(dna_string):
    non_overlapping_codones_sequences.append(dna_string[i:i+len(peptide)*3])
    i = i + 3
  return non_overlapping_codones_sequences

There are three different ways to divide a DNA string into codons for translation, one starting at each of the first three starting positions of the string. These different ways of dividing a DNA string into codons are called reading frames. Since DNA is double-stranded, a genome has six reading frames (three on each strand).

Original strand reading frames:


1.   Whole strand
2.   Whole strand without the first nucleotide
3.   Whole strand without the first 2 nucleotides

Reverse complement strand reading frames:

1.   Whole strand
2.   Whole strand without the first nucleotide
3.   Whole strand without the first 2 nucleotides





In [178]:
def CodonesSequenceEncodingPeptide(dna_codones_sequence):
  """Translate a DNA string into its amino acid string.

  The string is read codon by codon from its first character. Each codon
  is transcribed with DnaToRna and looked up in codones_aminoacids_dict.
  One or two trailing nucleotides that do not form a full codon are ignored.

  Raises:
    KeyError: If a transcribed codon is not a key of codones_aminoacids_dict.
  """
  codon_starts = range(0, len(dna_codones_sequence) - 2, 3)
  return ''.join(
      codones_aminoacids_dict[DnaToRna(dna_codones_sequence[start:start + 3])]
      for start in codon_starts)

We say that a DNA string Pattern encodes an amino acid string Peptide if the RNA string transcribed from either Pattern or the reverse complement of Pattern translates into Peptide.

For example, the DNA string GAAACT is transcribed into GAAACU and translated into ET. The reverse complement of this DNA string, AGTTTC, is transcribed into
AGUUUC and translated into SF. Thus, GAAACT encodes both ET and SF.

In [254]:
def PeptideEncoding(dna_string, peptide):
  """Collect the substrings of ``dna_string`` that encode ``peptide``.

  A substring encodes ``peptide`` if either the substring itself or its
  reverse complement translates into ``peptide``. Candidates are the
  windows of ``len(peptide) * 3`` nucleotides taken from each of the three
  reading frames of ``dna_string``.

  Args:
    dna_string: DNA string to search.
    peptide: Amino acid string in one-letter codes.

  Returns:
    A set of the distinct encoding substrings, each spelled as it appears
    in ``dna_string`` even when only its reverse complement matches.
  """
  peptide_encoding_genes = set()
  for frame_offset in range(3):  # 3 reading frames per strand
    frame = dna_string[frame_offset:]
    for window in NonOverlappingAminoacidsSequencesEncodingDnaCodones(frame, peptide):
      forward_peptide = CodonesSequenceEncodingPeptide(window)
      reverse_peptide = CodonesSequenceEncodingPeptide(ReverseComplement(window))
      if peptide in (forward_peptide, reverse_peptide):
        # The result is a set, so use add(); sets have no append() method.
        peptide_encoding_genes.add(window)
  return peptide_encoding_genes

Find substrings of a genome encoding a given amino acid sequence.

Given: A DNA string Text and an amino acid string Peptide.

**Return: All substrings of Text encoding Peptide (if any such substrings exist).**

In [268]:
def FindAllKmers(dna_string, peptide):
  """List every substring of ``dna_string`` of length ``len(peptide) * 3``.

  Substrings are returned in order of their start position, one per start
  position, so repeated substrings appear more than once.
  """
  k = len(peptide) * 3
  start_count = len(dna_string) - k + 1  # zero or negative when k is too long
  return [dna_string[start:start + k] for start in range(start_count)]

In [269]:
def RunPeptideEncoding(dna_string, peptide):
  """Return all substrings of ``dna_string`` that encode ``peptide``.

  The encoding substrings found by PeptideEncoding on ``dna_string`` and on
  its reverse complement are pooled, then every k-mer of ``dna_string``
  (k = len(peptide) * 3) that belongs to the pool is reported.

  Returns:
    A list of matching substrings in order of their position in
    ``dna_string``; a substring occurring several times is listed once per
    occurrence.
  """
  forward_hits = PeptideEncoding(dna_string, peptide)
  reverse_hits = PeptideEncoding(ReverseComplement(dna_string), peptide)
  genes_encoding_peptide = set().union(forward_hits, reverse_hits)
  return [kmer
          for kmer in FindAllKmers(dna_string, peptide)
          if kmer in genes_encoding_peptide]

In [270]:
# Small example genome used to try out RunPeptideEncoding.
dna_string = 'ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA'

In [271]:
# Example peptide (one-letter amino acid codes) to search for.
peptide = 'MA'

In [272]:
# Bare expression: the notebook displays the returned list as the cell output.
RunPeptideEncoding(dna_string, peptide)

['ATGGCC', 'GGCCAT', 'ATGGCC']

In [274]:
# Read the dataset file into a list of lines with trailing whitespace removed.
with open('/content/rosalind_ba4b.txt') as task_file:
  dna_string = list(map(str.rstrip, task_file))

In [275]:
# Keep only the first line of the file as the DNA string; dna_string is
# rebound from a list of lines to a single string here.
dna_string = dna_string[0]

In [273]:
# Peptide (one-letter amino acid codes) to search for in the dataset run.
peptide = 'TTMTYDAC'

In [None]:
# Print each encoding substring on its own line; prints nothing if there are none.
for solution in RunPeptideEncoding(dna_string, peptide):
  print(solution)