In [77]:
from Bio.Seq import Seq

In [78]:
from Bio import SeqIO

### Loading in DNA sequences from Project 

_Aims:_ 
- Load in the MupB protein sequence (FASTA record from NCBI)
- Back-translate the protein into codons (resolving codon redundancy by taking the most frequent *E. coli* codon for each amino acid), then derive the primers from the codon sequence: the first 6 codons at the 5' end for the forward primer, and the last 6 codons of the complementary strand for the reverse primer
- Define function to calculate the G+C content of overall codon sequence and primer sequences
- Calculate the Mw of the total sequence based on dictionary of amino acid residues

In [79]:
# Load the MupB record (NCBI accession AAM12910.1) from its FASTA file.
# The record holds the MupB amino-acid sequence investigated in this project.
mupB = SeqIO.read("AAM12910.1.fasta", "fasta")
print(mupB)  # summary view: ID, name, description, feature count and a truncated Seq

ID: AAM12910.1
Name: AAM12910.1
Description: AAM12910.1 MupB [Pseudomonas fluorescens]
Number of features: 0
Seq('MRTAYLASTAYVLGEQAHDYRDAASFEAVCKQHSMPDFASVFGWGTYWRTTRSV...SDA')


In [80]:
# Take the bare sequence out of the record through its .seq attribute
# (approach from the miligram course) and keep it as the input for later cells.
mupB_seq = mupB.seq

print(mupB_seq)  # print() shows the full sequence rather than the truncated repr

MRTAYLASTAYVLGEQAHDYRDAASFEAVCKQHSMPDFASVFGWGTYWRTTRSVGELLVESVASTLASSGLRGCDIDALVVCSSNFESGQVVDYLPLLRELQLARAFPLGVTWGDCTMLLAGLEVARAQVLAGLDNVLVVSANRIEDEAFRFQHYALFSDGAASCLVTSGRRRGFEMLGSLARSNAGLAHDPKEDDTRLFREVHEQFMHRQQINTADLEQVVCSNVFLPVLKIKEGRQGVSGTQLYVDNVTRVGHCFSADSLINLCDYQARAQQVHGGLVMLTANADGLRCQTLLQRVSDA


In [81]:
# Standard genetic code (the one E. coli uses), keyed by DNA codon.
# Keying by codon handles redundancy naturally: several codons may map to the
# same amino acid, but each codon maps to exactly one value.
# Rows are grouped by the first two bases so that every row is one "box" of
# the codon table, which makes mistakes easy to spot.
# Any loop that uses this table must step through the DNA three bases at a
# time (e.g. range(0, len(dna), 3)); iterating a string directly yields single
# letters, which are not valid keys here.
# NOTE(review): this table is not referenced by the other visible cells.
DNA_codons = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': 'stop', 'TAG': 'stop',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'stop', 'TGG': 'W',  # TGT/TGC are cysteine, not serine
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',  # CAG is glutamine, not histidine
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',  # AAA is lysine, not leucine
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}

In [82]:
mupB_seq  # bare expression: the notebook displays the (truncated) Seq repr

Seq('MRTAYLASTAYVLGEQAHDYRDAASFEAVCKQHSMPDFASVFGWGTYWRTTRSV...SDA')

In [99]:
# E. coli optimised codons: one codon per amino acid (single-letter code).
# For each amino acid the codon chosen is the one with the highest
# frequency/thousand in the E. coli table at
# https://www.genscript.com/tools/codon-frequency-table
# No stop codon is included, so a back-translated sequence ends on the last
# amino acid of the protein.
optimised_codons = dict(
    F='TTT', L='CTG', Y='TAT', H='CAT', Q='CAG',
    I='ATT', M='ATG', N='AAC', K='AAA', V='GTG',
    D='GAT', E='GAA', S='AGC', C='TGC', W='TGG',
    P='CCG', R='CGT', T='ACC', A='GCG', G='GGC',
)

In [84]:
# Start with an empty list; it will hold one optimised codon per MupB residue,
# looked up in the optimised codon dictionary.
mupB_codons = list()

In [85]:
            #taken from the sequence on NCBI
# Back-translate MupB: look up the optimised codon for every amino acid, in order.
# The list is rebuilt from scratch on each run, so re-running this cell cannot
# append a second copy of the codons (which appending to an existing list would do).
# A residue missing from optimised_codons raises KeyError naming that residue.
mupB_codons = [optimised_codons[amino_acid] for amino_acid in mupB_seq]

In [86]:
# Concatenate the per-residue codons, with no separator, into one DNA string.
codon_output = str.join("", mupB_codons)

In [87]:
print(codon_output)  # the full optimised DNA sequence for MupB expression

ATGCGTACCGCGTATCTGGCGAGCACCGCGTATGTGCTGGGCGAACAGGCGCATGATTATCGTGATGCGGCGAGCTTTGAAGCGGTGTGCAAACAGCATAGCATGCCGGATTTTGCGAGCGTGTTTGGCTGGGGCACCTATTGGCGTACCACCCGTAGCGTGGGCGAACTGCTGGTGGAAAGCGTGGCGAGCACCCTGGCGAGCAGCGGCCTGCGTGGCTGCGATATTGATGCGCTGGTGGTGTGCAGCAGCAACTTTGAAAGCGGCCAGGTGGTGGATTATCTGCCGCTGCTGCGTGAACTGCAGCTGGCGCGTGCGTTTCCGCTGGGCGTGACCTGGGGCGATTGCACCATGCTGCTGGCGGGCCTGGAAGTGGCGCGTGCGCAGGTGCTGGCGGGCCTGGATAACGTGCTGGTGGTGAGCGCGAACCGTATTGAAGATGAAGCGTTTCGTTTTCAGCATTATGCGCTGTTTAGCGATGGCGCGGCGAGCTGCCTGGTGACCAGCGGCCGTCGTCGTGGCTTTGAAATGCTGGGCAGCCTGGCGCGTAGCAACGCGGGCCTGGCGCATGATCCGAAAGAAGATGATACCCGTCTGTTTCGTGAAGTGCATGAACAGTTTATGCATCGTCAGCAGATTAACACCGCGGATCTGGAACAGGTGGTGTGCAGCAACGTGTTTCTGCCGGTGCTGAAAATTAAAGAAGGCCGTCAGGGCGTGAGCGGCACCCAGCTGTATGTGGATAACGTGACCCGTGTGGGCCATTGCTTTAGCGCGGATAGCCTGATTAACCTGTGCGATTATCAGGCGCGTGCGCAGCAGGTGCATGGCGGCCTGGTGATGCTGACCGCGAACGCGGATGGCCTGCGTTGCCAGACCCTGCTGCAGCGTGTGAGCGATGCG


In [88]:
# Wrap the plain DNA string in a Biopython Seq so the sequence methods
# (transcribe, translate, complement) can be used on it.
mupB_express = Seq(codon_output)

In [89]:
# Round trip: optimised DNA -> RNA -> protein, used next to check the back-translation.
mupB_rna = mupB_express.transcribe()
mupB_protein = mupB_rna.translate()
#print(mupB_protein)

In [90]:
# Sanity check on the optimised codon dictionary: transcribing and translating
# the optimised DNA should give back exactly the MupB amino-acid sequence
# taken from NCBI. The message printed reports whether that round trip worked.
if not (mupB_protein == mupB_seq):
    print("The optimised codons have not worked - the MupB sequences are not the same")
else:
    print("The optimised codons have worked to express MupB")

The optimised codons have worked to express MupB


In [91]:
#using code found online at https://python.plainenglish.io/bioinformatics-in-python-dna-toolkit-part-3-gc-content-calculation-618993e0278c

# Store the values first and print them afterwards: print() returns None, so
# assigning its result would leave every variable holding None.
length_codons = len(codon_output)      # length of the sequence in nucleotides (3 per codon)
n_cytosines = codon_output.count("C")  # number of C's in the codon sequence
n_guanines = codon_output.count("G")   # number of G's in the codon sequence

print(length_codons)
print(n_cytosines)
print(n_guanines)

903
219
326


In [92]:
#To calculate the GC content of a sequence - adapted from https://python.plainenglish.io/bioinformatics-in-python-dna-toolkit-part-3-gc-content-calculation-618993e0278c

def gc_content(codon_output):
    """Return the GC content of a nucleotide sequence as a whole-number percentage.

    Parameters
    ----------
    codon_output : str or Seq
        The nucleotide sequence. It is converted with str() and upper-cased,
        so lower-case input is counted as well.

    Returns
    -------
    int
        round(100 * (G + C) / length); 0 for an empty sequence, which would
        otherwise cause a division by zero.
    """
    sequence = str(codon_output).upper()
    if not sequence:
        return 0
    gc_count = sequence.count("C") + sequence.count("G")
    return round(gc_count / len(sequence) * 100)

gc_content(codon_output)  # GC content (percentage) of the whole optimised mupB codon sequence

60

In [93]:
#print(mupB_express) - can use to find the 5' --> 3'primer sequence (or codon_output from above)

In [94]:
# complement() swaps each base for its pairing partner without reversing the
# order, so this strand reads 3'->5' when printed left to right.
compliment_mupB = mupB_express.complement()

In [61]:
print(compliment_mupB)  # complementary strand, used to pick the reverse (3'->5') primer sequence

TACGCATGGCGCATAGACCGCTCGTGGCGCATACACGACCCGCTTGTCCGCGTACTAATAGCACTACGCCGCTCGAAACTTCGCCACACGTTTGTCGTATCGTACGGCCTAAAACGCTCGCACAAACCGACCCCGTGGATAACCGCATGGTGGGCATCGCACCCGCTTGACGACCACCTTTCGCACCGCTCGTGGGACCGCTCGTCGCCGGACGCACCGACGCTATAACTACGCGACCACCACACGTCGTCGTTGAAACTTTCGCCGGTCCACCACCTAATAGACGGCGACGACGCACTTGACGTCGACCGCGCACGCAAAGGCGACCCGCACTGGACCCCGCTAACGTGGTACGACGACCGCCCGGACCTTCACCGCGCACGCGTCCACGACCGCCCGGACCTATTGCACGACCACCACTCGCGCTTGGCATAACTTCTACTTCGCAAAGCAAAAGTCGTAATACGCGACAAATCGCTACCGCGCCGCTCGACGGACCACTGGTCGCCGGCAGCAGCACCGAAACTTTACGACCCGTCGGACCGCGCATCGTTGCGCCCGGACCGCGTACTAGGCTTTCTTCTACTATGGGCAGACAAAGCACTTCACGTACTTGTCAAATACGTAGCAGTCGTCTAATTGTGGCGCCTAGACCTTGTCCACCACACGTCGTTGCACAAAGACGGCCACGACTTTTAATTTCTTCCGGCAGTCCCGCACTCGCCGTGGGTCGACATACACCTATTGCACTGGGCACACCCGGTAACGAAATCGCGCCTATCGGACTAATTGGACACGCTAATAGTCCGCGCACGCGTCGTCCACGTACCGCCGGACCACTACGACTGGCGCTTGCGCCTACCGGACGCAACGGTCTGGGACGACGTCGCACACTCGCTACGC


__Primer design__

-- Considering information https://www.thermofisher.com/blog/behindthebench/pcr-primer-design-tips/
To design a primer for expressing mupB in E coli: 
- Take the first 6 codons from the forward codon sequence 
- Take the last 6 codons from the complementary codon sequence

In [103]:
#When the MupB optimised codon output (above DNA sequence) would be ordered, it would come in a plasmid from the vendor
#Below is the design of a basic set of primer sequences that I generated from the forward and reverse sequences of MupB

# Forward primer: the first 18 bases (6 codons) of the optimised sequence.
mupB_forward_primer = Seq("ATGCGTACCGCGTATCTG") 
# Reverse primer: the last 18 bases of the complement strand exactly as printed,
# so it is written 3'->5' here.
# NOTE(review): primers are normally written and ordered 5'->3' - reverse this
# string (i.e. use the reverse complement of the forward strand) before ordering.
mupB_reverse_primer = Seq("GTCGCACACTCGCTACGC")

#Creating an object for the forward and reverse primers that would anneal to the MupB DNA and then these strands 
#could be edited/appended to include any enzyme restriction sites or extra codons that may be desired for the user to ligate
#into a different plasmid

In [63]:
# Reuse the gc_content() helper defined earlier in the notebook instead of
# re-declaring an identical copy under a different parameter name (a second
# definition silently shadows the first).
gc_content(mupB_forward_primer)  # GC content (%) of the forward primer

56

In [107]:
# Reuse the gc_content() helper defined earlier in the notebook instead of
# re-declaring an identical copy under a different parameter name.
gc_content(mupB_reverse_primer)  # GC content (%) of the reverse primer

# The two primers differ in GC content, which could affect their Tm values.
# The Tm of a primer pair should not differ by more than 5 degrees, so these
# two could be incompatible; the primers/their lengths would need adjusting
# to find a better-matched pair.
# GC content should aim to be between 40-60%.
# Above 60% GC content, DMSO could be added to the PCR reaction mixture to
# minimise self-annealing of the primers.

67

In [104]:
# EcoRI-tagged primers. The EcoRI recognition site (GAATTC) is added to each
# primer so the PCR product can be cut and ligated into a different plasmid
# that has EcoRI cohesive ends - useful if the plasmid supplied by the vendor
# is not the one wanted.
# The site is added by hand: at the start of the forward primer and at the
# end of the reverse primer. A possible improvement is code that appends a
# site chosen from a list of restriction enzyme sites automatically.

mupB_forward_primer_ecoRI = Seq(
    "GAATTC"              # EcoRI site
    "ATGCGTACCGCGTATCTG"  # first six MupB codons
    "GCG"                 # one extra MupB codon, added to try and alter the GC content
)
mupB_reverse_primer_ecoRI = Seq(
    "GTCGCACACTCGCTACGC"  # last six codons, taken from the complement strand
    "CTTAAG"              # EcoRI site as it reads on the complement strand
)

In [105]:
# Reuse the gc_content() helper defined earlier in the notebook instead of
# re-declaring an identical copy under a different parameter name.
gc_content(mupB_forward_primer_ecoRI)  # GC content (%) of the EcoRI-tagged forward primer

56

In [106]:
# Reuse the gc_content() helper defined earlier in the notebook instead of
# re-declaring an identical copy under a different parameter name.
gc_content(mupB_reverse_primer_ecoRI)  # GC content (%) of the EcoRI-tagged reverse primer

58

__Determining the total molecular weight__

In [68]:
# Molecular weight lookup: single-letter amino acid code -> weight in Da.
# The weights are stored as strings, so they must be converted to integers
# before any arithmetic. They are the weights of the free amino acids
# (e.g. glycine = 75), i.e. before any water is lost to peptide bonds.

amino_acid_mw = {
    'A': '89',   # alanine
    'R': '174',  # arginine
    'N': '132',  # asparagine
    'D': '133',  # aspartate
    'C': '121',  # cysteine
    'Q': '146',  # glutamine
    'E': '147',  # glutamate
    'G': '75',   # glycine
    'H': '155',  # histidine
    'I': '131',  # isoleucine
    'L': '131',  # leucine
    'K': '146',  # lysine
    'M': '149',  # methionine
    'F': '165',  # phenylalanine
    'P': '115',  # proline
    'S': '105',  # serine
    'T': '119',  # threonine
    'W': '204',  # tryptophan
    'Y': '181',  # tyrosine
    'V': '117',  # valine
}

In [69]:
# Start with an empty list; it will hold one molecular weight per MupB residue.
mupB_mw = list()

In [70]:
# Look up the molecular weight of every residue in the MupB sequence, in order.
# The list is rebuilt from scratch on each run, so re-running this cell cannot
# append a second copy of the weights (which would double the final total).
# The entries are still the strings stored in amino_acid_mw.
mupB_mw = [amino_acid_mw[residue] for residue in mupB_seq]

In [71]:
# Join the per-residue weights (strings) into one "_"-separated string.
total_mw = str.join("_", mupB_mw)

In [72]:
#print(total_mw)

In [73]:
# total_mw is a "_"-separated string of weights. int() treats "_" as a digit
# separator, so int(total_mw) would silently fuse every weight into one
# enormous, meaningless number. Split the string back into the individual
# weights and add them up instead.
# This is the sum of the free amino-acid weights, before any correction for
# the water lost when peptide bonds form.
num = sum(int(weight) for weight in total_mw.split("_"))

In [74]:
#sum(total_mw)
        #This code was giving me the error 'unsupported operand type(s) for +: 'int' and 'str''
#because total_mw is a string: sum() starts from the integer 0 and then tries to add the string's characters to it.
#The weights have to be converted to integers before they can be summed.

In [75]:
# Approach adapted from https://www.geeksforgeeks.org/python-converting-all-strings-in-list-to-integers/
# Convert every weight in mupB_mw from a string to an integer. Slice
# assignment keeps the same list object and replaces its contents in place.
mupB_mw[:] = map(int, mupB_mw)

In [76]:
# Total molecular weight of the MupB protein.
# The table values are free amino-acid weights (e.g. glycine = 75), but each
# peptide bond releases one water molecule (~18 Da). A chain of n residues has
# n - 1 peptide bonds, so that much water is subtracted from the plain sum;
# leaving it out overestimates the protein by roughly 5.4 kDa here.
# int() is applied per entry so this works whether the list holds strings or integers.
WATER_MW = 18
peptide_bonds = max(len(mupB_mw) - 1, 0)
sum(int(weight) for weight in mupB_mw) - WATER_MW * peptide_bonds

32954

__Conclusions:__

- Biopython was used to import the MupB sequence from NCBI
- Using a dictionary of optimised *E. coli* codons, the MupB protein amino acid sequence was converted to optimised DNA codons for expression in *E. coli* (called codon_output)
- Using a function sourced online, the GC content of the overall MupB DNA sequence was found to be 60%
- Primer sequences were selected by using the first 6 codons of the forward strand and the last 6 codons of the complementary strand, which was generated by complementing the mupB sequence (using the `.complement()` method)
- The same GC function was applied to both primer strands and showed that their GC contents were not similar - this could affect the Tm of both strands and make them incompatible - **future work should look into editing these primer sequences to find two primers that match GC content**
- Finally, using a dictionary of amino acid molecular weights, the total Mw of the MupB sequence was determined by using the loop operation, then converting the list generated into integers and then summing all numbers

**Further future work could look into using the MupB DNA sequence, converting it into a list and then editing different amino acids in order to build a library of MupB mutants - the same round of code could be applied for expressing these MupB mutants**