In [None]:

#################################################  Start #############################################################

# First, the DNA sequence and the amino acid sequence are downloaded from the NCBI website using accession NM_207618.2.
# To download: search Google for 'NCBI' >>> open the official NCBI website >>>
# >>> select 'Nucleotide' and enter 'NM_207618.2' in the search bar >>> search >>>
# >>> a page titled "Mus musculus vomeronasal 1 receptor, D18 (V1rd18), mRNA" opens
# >>> click on 'FASTA' >>> copy the sequence and save it
# >>> go back to the NM_207618.2 page and click on 'CDS' >>> copy the amino acid sequence and save it
# The DNA and amino acid sequences are saved in two separate text files, e.g. 'dna.txt' and 'protein.txt'.
# Save only the sequence lines themselves: read_seq below strips nothing but line breaks, and the slice
# indices used later (dna[20:938]) assume the file holds just the sequence.

# Helper function that reads a downloaded file and removes the line-break characters (\n and \r) from it

def read_seq(inputfile):
    """Return the text of *inputfile* as a single string without line breaks.

    Every newline and carriage-return character is dropped; all other
    characters are returned exactly as they appear in the file.
    """
    # The triple-quoted text above is the docstring; view it with help(read_seq).
    # Deletion table: both line-break characters map to "nothing".
    line_breaks = str.maketrans('', '', '\n\r')
    with open(inputfile, 'r') as handle:
        return handle.read().translate(line_breaks)

# Now the translation function is written

# Standard genetic code: DNA codon -> one-letter amino acid code.
# Stop codons (TAA, TAG, TGA) are represented by '_'.
# Built once at import time instead of on every translate() call.
_CODON_TABLE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}


def translate(seq):
    """Translate a nucleotide sequence into the corresponding amino acid sequence.

    The sequence is read in consecutive, non-overlapping triplets (codons)
    starting at index 0. Each codon is looked up in the codon table and
    contributes one character to the result: a one-letter amino acid code,
    or '_' for a stop codon. Translation does not stop at a stop codon.

    Parameters
    ----------
    seq : str
        DNA sequence made of the letters A, C, G and T. Upper-case,
        lower-case and mixed-case input are all accepted.

    Returns
    -------
    str
        One character per codon. An empty string is returned when the
        length of ``seq`` is not divisible by 3 (including the case where
        ``seq`` is shorter than one codon), and for an empty ``seq``.

    Raises
    ------
    KeyError
        If a codon is not in the codon table, e.g. because it contains a
        character other than A, C, G or T.
    """
    # Only whole reading frames are translated: a length that is not a
    # multiple of 3 yields an empty result rather than an error.
    if len(seq) % 3 != 0:
        return ''
    # Upper-case each codon before the lookup so lower-case input works;
    # join the pieces once instead of growing a string with += in a loop.
    return ''.join(
        _CODON_TABLE[seq[start:start + 3].upper()]
        for start in range(0, len(seq), 3)
    )

# Sanity-check the translate function on a single codon: 'ATA' maps to 'I' in the codon table.
translate('ATA')
# Calling it again gives the same result.
translate('ATA') # returns 'I'


# Now load our text files into Python using the read_seq function defined above.
# NOTE: the bare expressions below only show a value in an interactive session (notebook/REPL);
# when running this as a plain script, wrap them in print(...) to see the results.

prt = read_seq('protein.txt')
dna = read_seq('dna.txt')

translate(dna) # returns an empty string when len(dna) is not divisible by 3, because translate() only handles whole reading frames
len(dna)%3 # remainder of the record length divided by 3; the original notes report 2 for this record, i.e. not divisible by 3
# Why? The downloaded record contains more than the coding sequence. On the NCBI page the coding
# region is listed as "CDS 21..938", i.e. the positions where the coding sequence starts and ends.

# NCBI positions are 1-based and inclusive, whereas Python slices are 0-based and exclude the end
# index, so positions 21..938 correspond to dna[20:938] (918 bases = 306 codons).

translate(dna[20:938])  # the translated amino acid sequence of the coding region

# Now look at NCBI's official amino acid sequence.
prt

# According to the original notes, translate(dna[20:938]) and prt are the same except that the
# translation ends with an extra underscore (_). The underscore stands for the stop codon, which
# NCBI's amino acid sequence omits. Dropping the last codon (3 bases: 938 - 3 = 935) leaves it out.

translate(dna[20:935])

# Check whether our translation matches the official sequence.

prt == translate(dna[20:935])

# Equivalent check: translate through the stop codon and strip the final character.
prt == translate(dna[20:938])[:-1]

# Both comparisons should evaluate to True, which means our function reproduces NCBI's amino acid sequence.


################################################# The end #############################################################