# How to transcribe and translate a DNA sequence

## First we need to import the required modules:

In [1]:
import copy

from Bio import SeqIO
from Bio.SeqIO.FastaIO import as_fasta

## Then, we can open the file using "with open(filename)":

In [2]:
#Open the file in read-only mode ("r") and let SeqIO.read parse its single fasta record
with open("ftsZ.fna", "r") as fasta_in:
    record = SeqIO.read(fasta_in, "fasta")

print(record.seq) #Show the DNA sequence stored in the record

ATGTTTGAACCAATGGAACTTACCAATGACGCGGTGATTAAAGTCATCGGCGTCGGCGGCGGCGGCGGTAATGCTGTTGAACACATGGTGCGCGAGCGCATTGAAGGTGTTGAATTCTTCGCGGTAAATACCGATGCACAAGCGCTGCGTAAAACAGCGGTTGGACAGACGATTCAAATCGGTAGCGGTATCACCAAAGGACTGGGCGCTGGCGCTAATCCAGAAGTTGGCCGCAATGCGGCTGATGAGGATCGCGATGCATTGCGTGCGGCGCTGGAAGGTGCAGACATGGTCTTTATTGCTGCGGGTATGGGTGGTGGTACCGGTACAGGTGCAGCACCAGTCGTCGCTGAAGTGGCAAAAGATTTGGGTATCCTGACCGTTGCTGTCGTCACTAAGCCTTTCAACTTTGAAGGCAAGAAGCGTATGGCATTCGCGGAGCAGGGGATCACTGAACTGTCCAAGCATGTGGACTCTCTGATCACTATCCCGAACGACAAACTGCTGAAAGTTCTGGGCCGCGGTATCTCCCTGCTGGATGCGTTTGGCGCAGCGAACGATGTACTGAAAGGCGCTGTGCAAGGTATCGCTGAACTGATTACTCGTCCGGGTTTGATGAACGTGGACTTTGCAGACGTACGCACCGTAATGTCTGAGATGGGCTACGCAATGATGGGTTCTGGCGTGGCGAGCGGTGAAGACCGTGCGGAAGAAGCTGCTGAAATGGCTATCTCTTCTCCGCTGCTGGAAGATATCGACCTGTCTGGCGCGCGCGGCGTGCTGGTTAACATCACGGCGGGCTTCGACCTGCGTCTGGATGAGTTCGAAACGGTAGGTAACACCATCCGTGCATTTGCTTCCGACAACGCGACTGTGGTTATCGGTACTTCTCTTGACCCGGATATGAATGACGAGCTGCGCGTAACCGTTGTTGCGACAGGTATCGGCATGGACAAACGTCCTGAAATCACTCTGGTGACCAATAAGCAGGTTCAGCAGC

SeqIO.read only works on files that contain exactly one record. If we want to read a file with several sequences, we use the function SeqIO.parse instead, which lets us loop over the records one by one.

### When we read a fasta file in Biopython, we get a SeqRecord object with several attributes:

In [3]:
print(record) #Printing the whole record shows a summary: ID, name, description, number of features and an abbreviated sequence

ID: NC_000913.3:105305-106456
Name: NC_000913.3:105305-106456
Description: NC_000913.3:105305-106456 ftsZ [organism=Escherichia coli str. K-12 substr. MG1655] [GeneID=944786] [chromosome=]
Number of features: 0
Seq('ATGTTTGAACCAATGGAACTTACCAATGACGCGGTGATTAAAGTCATCGGCGTC...TAA')


### In a fasta file, we get the description attribute as a header and then the sequence, this way:

In [4]:
#Reproduce the fasta layout with one print call: a ">" header line with the description, then the sequence
print(f">{record.description}", record.seq, sep="\n")

>NC_000913.3:105305-106456 ftsZ [organism=Escherichia coli str. K-12 substr. MG1655] [GeneID=944786] [chromosome=]
ATGTTTGAACCAATGGAACTTACCAATGACGCGGTGATTAAAGTCATCGGCGTCGGCGGCGGCGGCGGTAATGCTGTTGAACACATGGTGCGCGAGCGCATTGAAGGTGTTGAATTCTTCGCGGTAAATACCGATGCACAAGCGCTGCGTAAAACAGCGGTTGGACAGACGATTCAAATCGGTAGCGGTATCACCAAAGGACTGGGCGCTGGCGCTAATCCAGAAGTTGGCCGCAATGCGGCTGATGAGGATCGCGATGCATTGCGTGCGGCGCTGGAAGGTGCAGACATGGTCTTTATTGCTGCGGGTATGGGTGGTGGTACCGGTACAGGTGCAGCACCAGTCGTCGCTGAAGTGGCAAAAGATTTGGGTATCCTGACCGTTGCTGTCGTCACTAAGCCTTTCAACTTTGAAGGCAAGAAGCGTATGGCATTCGCGGAGCAGGGGATCACTGAACTGTCCAAGCATGTGGACTCTCTGATCACTATCCCGAACGACAAACTGCTGAAAGTTCTGGGCCGCGGTATCTCCCTGCTGGATGCGTTTGGCGCAGCGAACGATGTACTGAAAGGCGCTGTGCAAGGTATCGCTGAACTGATTACTCGTCCGGGTTTGATGAACGTGGACTTTGCAGACGTACGCACCGTAATGTCTGAGATGGGCTACGCAATGATGGGTTCTGGCGTGGCGAGCGGTGAAGACCGTGCGGAAGAAGCTGCTGAAATGGCTATCTCTTCTCCGCTGCTGGAAGATATCGACCTGTCTGGCGCGCGCGGCGTGCTGGTTAACATCACGGCGGGCTTCGACCTGCGTCTGGATGAGTTCGAAACGGTAGGTAACACCATCCGTGCATTTGCTTCCGACAACGCGACTGTGGTTATCGGT

## The DNA sequence can be converted to RNA by using the transcribe() function:

In [5]:
RNA_seq = record.seq.transcribe() #Transcribe the record's DNA sequence (DNA > RNA) and keep the result in RNA_seq
print(RNA_seq) #Print the RNA sequence

AUGUUUGAACCAAUGGAACUUACCAAUGACGCGGUGAUUAAAGUCAUCGGCGUCGGCGGCGGCGGCGGUAAUGCUGUUGAACACAUGGUGCGCGAGCGCAUUGAAGGUGUUGAAUUCUUCGCGGUAAAUACCGAUGCACAAGCGCUGCGUAAAACAGCGGUUGGACAGACGAUUCAAAUCGGUAGCGGUAUCACCAAAGGACUGGGCGCUGGCGCUAAUCCAGAAGUUGGCCGCAAUGCGGCUGAUGAGGAUCGCGAUGCAUUGCGUGCGGCGCUGGAAGGUGCAGACAUGGUCUUUAUUGCUGCGGGUAUGGGUGGUGGUACCGGUACAGGUGCAGCACCAGUCGUCGCUGAAGUGGCAAAAGAUUUGGGUAUCCUGACCGUUGCUGUCGUCACUAAGCCUUUCAACUUUGAAGGCAAGAAGCGUAUGGCAUUCGCGGAGCAGGGGAUCACUGAACUGUCCAAGCAUGUGGACUCUCUGAUCACUAUCCCGAACGACAAACUGCUGAAAGUUCUGGGCCGCGGUAUCUCCCUGCUGGAUGCGUUUGGCGCAGCGAACGAUGUACUGAAAGGCGCUGUGCAAGGUAUCGCUGAACUGAUUACUCGUCCGGGUUUGAUGAACGUGGACUUUGCAGACGUACGCACCGUAAUGUCUGAGAUGGGCUACGCAAUGAUGGGUUCUGGCGUGGCGAGCGGUGAAGACCGUGCGGAAGAAGCUGCUGAAAUGGCUAUCUCUUCUCCGCUGCUGGAAGAUAUCGACCUGUCUGGCGCGCGCGGCGUGCUGGUUAACAUCACGGCGGGCUUCGACCUGCGUCUGGAUGAGUUCGAAACGGUAGGUAACACCAUCCGUGCAUUUGCUUCCGACAACGCGACUGUGGUUAUCGGUACUUCUCUUGACCCGGAUAUGAAUGACGAGCUGCGCGUAACCGUUGUUGCGACAGGUAUCGGCAUGGACAAACGUCCUGAAAUCACUCUGGUGACCAAUAAGCAGGUUCAGCAGC

## We can easily convert the DNA or RNA sequence into protein by using the translate() function:

In [6]:
protein_seq = RNA_seq.translate() #Translate the RNA into protein (RNA > protein) and store it in a new variable, protein_seq
print(protein_seq) #Print the protein sequence (the trailing * marks the stop codon)

MFEPMELTNDAVIKVIGVGGGGGNAVEHMVRERIEGVEFFAVNTDAQALRKTAVGQTIQIGSGITKGLGAGANPEVGRNAADEDRDALRAALEGADMVFIAAGMGGGTGTGAAPVVAEVAKDLGILTVAVVTKPFNFEGKKRMAFAEQGITELSKHVDSLITIPNDKLLKVLGRGISLLDAFGAANDVLKGAVQGIAELITRPGLMNVDFADVRTVMSEMGYAMMGSGVASGEDRAEEAAEMAISSPLLEDIDLSGARGVLVNITAGFDLRLDEFETVGNTIRAFASDNATVVIGTSLDPDMNDELRVTVVATGIGMDKRPEITLVTNKQVQQPVMDRYQQHGMAPLTQEQKPVAKVVNDNAPQTAKEPDYLDIPAFLRKQAD*


The asterisk at the end represents the stop codon. If we don't want the asterisk, we pass to_stop=True, so that the translation ends at the first stop codon and the stop symbol is left out:

In [7]:
protein_seq = RNA_seq.translate(to_stop=True) #to_stop=True ends the translation at the stop codon, so no trailing * is produced
print(protein_seq) #Print the protein sequence without asterisk

MFEPMELTNDAVIKVIGVGGGGGNAVEHMVRERIEGVEFFAVNTDAQALRKTAVGQTIQIGSGITKGLGAGANPEVGRNAADEDRDALRAALEGADMVFIAAGMGGGTGTGAAPVVAEVAKDLGILTVAVVTKPFNFEGKKRMAFAEQGITELSKHVDSLITIPNDKLLKVLGRGISLLDAFGAANDVLKGAVQGIAELITRPGLMNVDFADVRTVMSEMGYAMMGSGVASGEDRAEEAAEMAISSPLLEDIDLSGARGVLVNITAGFDLRLDEFETVGNTIRAFASDNATVVIGTSLDPDMNDELRVTVVATGIGMDKRPEITLVTNKQVQQPVMDRYQQHGMAPLTQEQKPVAKVVNDNAPQTAKEPDYLDIPAFLRKQAD


## Now, we can save the output to a fasta file by using the as_fasta function that we imported in the beginning:

In [10]:
out_file = "ftsZ.faa" #Name of output file
#A plain assignment (protein_record = record) would only give the same object a second name,
#so replacing the sequence would also overwrite the DNA stored in record and break the
#earlier cells on re-run. copy.copy creates a separate record object instead.
protein_record = copy.copy(record) #Shallow copy of the original record; only its seq is replaced below
protein_record.seq = protein_seq #Put the protein sequence into the copy; record keeps its DNA sequence
with open(out_file, "w") as faa: #Open new file in write mode
    faa.write(as_fasta(protein_record)) #Write amino acid sequence in fasta format with as_fasta

In [8]:
out_file = "ftsZ.faa" #Name of output file
#Copy the record first: protein_record = record would alias the original object, and
#assigning the protein sequence would then overwrite the DNA sequence held in record.
protein_record = copy.copy(record) #Shallow copy of the original record; only its seq is replaced below
protein_record.seq = protein_seq #Put the protein sequence into the copy; record keeps its DNA sequence
with open(out_file, "w") as faa: #Open new file in write mode
    SeqIO.write(protein_record, faa, "fasta") #Alternatively, we can use SeqIO to write to fasta format

## Finally, we can check the content of the newly created file:

In [9]:
with open(out_file, "r") as faa:
    #Read the whole file at once; end="" because the text read from the file already carries its own line breaks
    print(faa.read(), end="")

>NC_000913.3:105305-106456 ftsZ [organism=Escherichia coli str. K-12 substr. MG1655] [GeneID=944786] [chromosome=]
MFEPMELTNDAVIKVIGVGGGGGNAVEHMVRERIEGVEFFAVNTDAQALRKTAVGQTIQI
GSGITKGLGAGANPEVGRNAADEDRDALRAALEGADMVFIAAGMGGGTGTGAAPVVAEVA
KDLGILTVAVVTKPFNFEGKKRMAFAEQGITELSKHVDSLITIPNDKLLKVLGRGISLLD
AFGAANDVLKGAVQGIAELITRPGLMNVDFADVRTVMSEMGYAMMGSGVASGEDRAEEAA
EMAISSPLLEDIDLSGARGVLVNITAGFDLRLDEFETVGNTIRAFASDNATVVIGTSLDP
DMNDELRVTVVATGIGMDKRPEITLVTNKQVQQPVMDRYQQHGMAPLTQEQKPVAKVVND
NAPQTAKEPDYLDIPAFLRKQAD
