In [38]:
# Genetic code dictionary
# This dictionary maps RNA codons to their corresponding amino acids
# This is essential for the translation process.

# Codon table: each of the 64 RNA codons mapped to the full name of the amino
# acid it encodes; the three termination codons map to the marker 'Stop'.
# Entries are laid out one run of same-residue codons per line.
genetic_code = {
    'AUG': 'Methionine',
    'UUU': 'Phenylalanine', 'UUC': 'Phenylalanine',
    'UUA': 'Leucine', 'UUG': 'Leucine',
    'UCU': 'Serine', 'UCC': 'Serine', 'UCA': 'Serine', 'UCG': 'Serine',
    'UAU': 'Tyrosine', 'UAC': 'Tyrosine',
    'UGU': 'Cysteine', 'UGC': 'Cysteine',
    'UGG': 'Tryptophan',
    'CUU': 'Leucine', 'CUC': 'Leucine', 'CUA': 'Leucine', 'CUG': 'Leucine',
    'CCU': 'Proline', 'CCC': 'Proline', 'CCA': 'Proline', 'CCG': 'Proline',
    'CAU': 'Histidine', 'CAC': 'Histidine',
    'CAA': 'Glutamine', 'CAG': 'Glutamine',
    'CGU': 'Arginine', 'CGC': 'Arginine', 'CGA': 'Arginine', 'CGG': 'Arginine',
    'AUU': 'Isoleucine', 'AUC': 'Isoleucine', 'AUA': 'Isoleucine',
    'ACU': 'Threonine', 'ACC': 'Threonine', 'ACA': 'Threonine', 'ACG': 'Threonine',
    'AAU': 'Asparagine', 'AAC': 'Asparagine',
    'AAA': 'Lysine', 'AAG': 'Lysine',
    'AGU': 'Serine', 'AGC': 'Serine',
    'AGA': 'Arginine', 'AGG': 'Arginine',
    'GUU': 'Valine', 'GUC': 'Valine', 'GUA': 'Valine', 'GUG': 'Valine',
    'GCU': 'Alanine', 'GCC': 'Alanine', 'GCA': 'Alanine', 'GCG': 'Alanine',
    'GAU': 'Aspartic Acid', 'GAC': 'Aspartic Acid',
    'GAA': 'Glutamic Acid', 'GAG': 'Glutamic Acid',
    'GGU': 'Glycine', 'GGC': 'Glycine', 'GGA': 'Glycine', 'GGG': 'Glycine',
    'UAA': 'Stop', 'UAG': 'Stop', 'UGA': 'Stop',
}

# The codons that terminate translation, i.e. exactly those marked 'Stop' above.
STOP_CODONS = {codon for codon, residue in genetic_code.items() if residue == 'Stop'}

In [39]:
# DNA sequence validation function
def validate_dna_sequence(dna_sequence):
    """
    Check that a DNA sequence is non-empty and made up solely of A, T, C and G.

    The comparison is case-insensitive. Returns an ``(is_valid, error_message)``
    pair: ``(True, None)`` when the sequence passes, otherwise ``False`` together
    with a message. For an illegal character, the message names the first
    offender in upper case.
    """
    # Guard clause: nothing to validate.
    if not dna_sequence:
        return False, "Input cannot be empty."

    allowed_bases = frozenset("ATCG")
    # Find the first base outside the DNA alphabet, if any.
    first_bad = next(
        (base for base in dna_sequence.upper() if base not in allowed_bases),
        None,
    )
    if first_bad is not None:
        return False, f"Invalid character found: {first_bad}"
    return True, None

In [40]:
# Transcription function (DNA to mRNA)
def transcribe(dna_sequence):
    """
    Produce the mRNA string for a DNA sequence.

    The input is upper-cased and every thymine (T) is swapped for uracil (U);
    all other characters pass through unchanged.
    """
    upper_dna = dna_sequence.upper()
    # Rebuild the string base by base, substituting U wherever a T occurs.
    return "".join("U" if base == "T" else base for base in upper_dna)

In [41]:
# Translation function (mRNA to protein)
def translate(mrna_sequence):
    """
    Translate an mRNA sequence into a list of amino-acid names.

    The sequence is read as consecutive, non-overlapping codons starting at
    index 0. Translation ends at the first stop codon, or when fewer than
    three bases remain. Codon lookup is case-insensitive, in line with
    ``validate_dna_sequence`` and ``transcribe``, which also normalise case.

    Args:
        mrna_sequence: String of RNA bases (A, U, C, G) in any letter case.

    Returns:
        list: Amino-acid names in reading order. The stop codon itself is not
        included. Codons that are not present in ``genetic_code`` are skipped.
    """
    # The codon table is keyed by upper-case codons; without this step a
    # lower-case sequence would match nothing and silently yield [].
    mrna_sequence = mrna_sequence.upper()

    protein = []
    # Walk only over complete codons; a trailing partial codon is ignored.
    complete_length = len(mrna_sequence) - len(mrna_sequence) % 3
    for start in range(0, complete_length, 3):
        codon = mrna_sequence[start:start + 3]
        amino_acid = genetic_code.get(codon)
        if amino_acid == 'Stop':
            break  # Termination codon: the protein is complete
        if amino_acid:
            protein.append(amino_acid)
    return protein

In [42]:
# Gene Mapping function (DNA to Protein)
def map_genes(dna_sequence):
    """
    Map a DNA sequence straight to its protein sequence.

    This is the composition of the two pipeline stages: the DNA is first
    transcribed to mRNA, and that mRNA is then translated.
    """
    return translate(transcribe(dna_sequence))

In [43]:
# Gene Identification function
def identify_gene(protein_sequence, protein_to_gene):
    """
    Look up the gene whose protein exactly matches ``protein_sequence``.

    The sequence is converted to a tuple and used as a key into
    ``protein_to_gene``; the string "Unknown gene" is returned when the key
    is absent.
    """
    lookup_key = tuple(protein_sequence)  # tuples are hashable, lists are not
    if lookup_key in protein_to_gene:
        return protein_to_gene[lookup_key]
    return "Unknown gene"

In [47]:
# Main function for gene mapping and identification
def main():
    """
    Interactive workflow: read a DNA sequence from the user, translate it to a
    protein and report whether that protein matches one of the reference genes.

    Prints an error and returns early when the input fails validation or when
    translation yields no amino acids.
    """
    # Prompt for the sequence; it is upper-cased as soon as it is read.
    user_dna = input("Enter DNA sequence (A, T, C, G only): ").upper()

    # Reject empty input or anything outside the DNA alphabet.
    sequence_ok, problem = validate_dna_sequence(user_dna)
    if not sequence_ok:
        print(problem)
        return

    # DNA -> mRNA -> protein.
    protein_sequence = map_genes(user_dna)
    if not protein_sequence:
        print("No protein sequence found. Check if the DNA sequence is valid and long enough.")
        return

    # Reference DNA strings keyed by gene name.
    # NOTE(review): presumably short excerpts of the named genes - confirm provenance.
    breast_cancer_genes = {
        'BRCA1': 'ATGGATTTTGGTCAGTCAACAAAGAAAGCTGAGAACTTGGACACTAGGGTCTGACTGGAAGAAATCTGGA',
        'BRCA2': 'ATGGATTTGGAGGTTTTTGTTTGCTGCTGCTGCTGCTGAGCTTGCTGAAACTGGAAGGAAACAGG',
        'TP53': 'ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCTGCTCGGAACATCTCGAAGTGTTTGTGC'
    }

    # Invert the table: translated protein (as a tuple) -> gene name.
    protein_to_gene = {}
    for gene, reference_dna in breast_cancer_genes.items():
        protein_to_gene[tuple(map_genes(reference_dna))] = gene

    gene_name = identify_gene(protein_sequence, protein_to_gene)

    # Report the translated protein, then the match result.
    joined_protein = ' - '.join(protein_sequence)
    print(f"Protein Sequence: {joined_protein}")
    if gene_name == "Unknown gene":
        print("No known breast cancer gene identified.")
    else:
        print(f"Identified Gene: {gene_name}")

# Start the interactive workflow only when this code is executed as the
# top-level module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Enter DNA sequence (A, T, C, G only): AAAAATTTTTTCCCCCCCGGGGGGGCCCCGGGGTTTTAAAUAAUAGUGAGAGAGAGAGAGAGAGAGAGAGAGATATATATATATATATATATATATATTTACACACACACACACACACACACACACACACACACACACACACACACAC
Invalid character found: U
