#Transcription May Begin
Problem
Either strand of a DNA double helix can serve as the coding strand for RNA transcription. Hence, a given DNA string implies six total reading frames, or ways in which the same region of DNA can be translated into amino acids: three reading frames result from reading the string itself, whereas three more result from reading its reverse complement.

An open reading frame (ORF) is a reading frame that starts at a start codon and ends at a stop codon, with no other stop codons in between. Thus, a candidate protein string is derived by translating an open reading frame into amino acids until a stop codon is reached.

Given: A DNA string s of length at most 1 kbp in FASTA format.

Return: Every distinct candidate protein string that can be translated from ORFs of s. Strings can be returned in any order.

Sample Dataset
>Rosalind_99
AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
Sample Output
MLLGSFRLIPKETLIQVAGSSPCNLS
M
MGMTPRLGLESLLE
MTPRLGLESLLE

In [13]:
# Sample dataset from the problem statement: a single FASTA record made of a
# '>' header line followed by the DNA sequence on the next line.
fasta_input = """>Rosalind_99
AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"""

def read_fasta(fasta_input):
    """Extract the sequence from a single-record FASTA string.

    The text is stripped and split on newlines. The first line is always
    treated as the header and discarded; every remaining line is stripped of
    surrounding whitespace and the pieces are concatenated.

    Returns the concatenated sequence, or an empty string when nothing
    follows the first line.
    """
    # str.split always yields at least one item, so the unpacking cannot fail.
    _header, *sequence_lines = fasta_input.strip().split('\n')
    return ''.join(map(str.strip, sequence_lines))

def translate_dna_to_protein(dna):
    """Translate a DNA sequence into a protein string.

    The sequence is read three bases at a time starting at index 0.
    Translation ends at the first stop codon (TAA, TAG or TGA), which
    contributes nothing to the result. Any chunk that is not one of the 64
    upper-case DNA codons (for example a trailing partial codon, or a codon
    containing a character other than A/C/G/T) is skipped and translation
    continues with the next chunk.

    Returns the amino acids as a string of one-letter codes.
    """
    # Standard genetic code written in TCAG order: the first base varies
    # slowest and the third base fastest. '*' marks the three stop codons.
    bases = 'TCAG'
    residues = (
        'FFLLSSSSYY**CC*W'
        'LLLLPPPPHHQQRRRR'
        'IIIMTTTTNNKKSSRR'
        'VVVVAAAADDEEGGGG'
    )
    codons = [b1 + b2 + b3 for b1 in bases for b2 in bases for b3 in bases]
    # Stop codons map to the empty string, which the loop below uses as the
    # signal to end translation.
    codon_table = {
        codon: ('' if residue == '*' else residue)
        for codon, residue in zip(codons, residues)
    }

    chain = []
    for offset in range(0, len(dna), 3):
        residue = codon_table.get(dna[offset:offset + 3])
        if residue is None:
            # Unrecognised or incomplete codon: ignore it and keep going.
            continue
        if residue == '':
            # Stop codon reached.
            break
        chain.append(residue)

    return ''.join(chain)

def _candidate_proteins(strand):
    """Yield the translation of every ORF found on one strand.

    Each occurrence of the start codon ATG (at any offset, so all three
    reading frames are covered) is followed codon by codon in its own frame.
    The first in-frame stop codon closes the ORF, and the stretch from ATG
    through that stop is handed to ``translate_dna_to_protein``. A start
    codon with no in-frame stop before the end of the strand yields nothing.
    """
    last_codon_start = len(strand) - 2  # exclusive bound: only full codons
    start = strand.find('ATG')
    while start != -1:
        # ATG itself is not a stop codon, so scanning begins one codon later.
        for pos in range(start + 3, last_codon_start, 3):
            if strand[pos:pos + 3] in {'TAA', 'TAG', 'TGA'}:
                protein = translate_dna_to_protein(strand[start:pos + 3])
                if protein:  # Only report non-empty proteins
                    yield protein
                break  # An ORF ends at the first in-frame stop codon
        # Overlapping start codons are allowed, so advance by a single base.
        start = strand.find('ATG', start + 1)


def find_orfs(dna):
    """Find all distinct candidate protein strings from ORFs in the DNA string.

    Both the given strand and its reverse complement are searched, which
    covers all six reading frames. The input is expected to use upper-case
    A/C/G/T; other characters are left unchanged when building the reverse
    complement and never match a start or stop codon.

    Returns a set of protein strings (empty when no complete ORF exists).
    """
    reverse_complement = dna[::-1].translate(str.maketrans('ATGC', 'TACG'))

    proteins = set()
    for strand in (dna, reverse_complement):
        proteins.update(_candidate_proteins(strand))
    return proteins

# Read the DNA string from the FASTA input
dna_string = read_fasta(fasta_input)

# Find all distinct candidate protein strings
proteins = find_orfs(dna_string)

# Print the results, one protein per line. The problem accepts any order, but
# iterating a set of strings directly gives an order that can change from run
# to run, so sort to keep the output reproducible.
for protein in sorted(proteins):
    print(protein)

MTPRLGLESLLE
MGMTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS
M
