# DNA and Protein Analysis Notebook

### This notebook takes a DNA sequence and translates it into the corresponding protein sequence. We use the Biopython module, since it provides many useful functions for this type of analysis.

### https://biopython.org/

In [94]:
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight
from Bio.Data import CodonTable
from Bio.SeqUtils.ProtParam import ProteinAnalysis

### Print Codon Table for Reference

In [95]:
# Look up the "Standard" genetic code in Biopython's name-indexed registry of
# unambiguous DNA codon tables and print it as a reference grid.
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

### The DNA sequence we want to analyze. It encodes eGFP (enhanced green fluorescent protein) with a C-terminal TEV cleavage site followed by a 6x His tag.

In [96]:
# Input construct as a plain nucleotide string. It starts with an ATG codon and
# is consumed by the following cells: once for the DNA molecular weight and once
# (wrapped in a Seq) for translation. Presumably written 5'->3' on the coding
# strand -- TODO confirm against the construct map.
dna = "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGAAAACCTGTATTTCCAGGGTCATCACCATCACCATCAC"

In [97]:
# Molecular weight of the raw DNA string, using molecular_weight() defaults.
# NOTE(review): per the Biopython docs the defaults treat the input as
# single-stranded DNA -- confirm that is the mass you want to report.
dna_mw = molecular_weight(dna)

# Wrap the string in a Seq object so it can be translated to protein.
coding_dna = Seq(dna)
aa_translate = coding_dna.translate()

# end="\n\n" emits the same blank separator line a bare print() would.
print("DNA MW is", round(dna_mw, 1), end="\n\n")
print("Protein Sequence is:", end="\n\n")
print(aa_translate)

DNA MW is 233051.3

Protein Sequence is:

MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYKENLYFQGHHHHHH


### Protein Analysis after translation of DNA Sequence

In [98]:
# ProteinAnalysis is given a plain string here, so unwrap the translated Seq first.
str_aa = str(aa_translate)
analysed_seq = ProteinAnalysis(str_aa)

# Properties of the full-length protein (TEV site and His tag still attached).
molecular_weight_protein = analysed_seq.molecular_weight()
iso_protein = analysed_seq.isoelectric_point()
AA_Count = analysed_seq.count_amino_acids()

# end="\n\n" emits the same blank separator line a bare print() would.
print(
    "Protein MW is", round(molecular_weight_protein, 1), "da",
    "and there is", len(str_aa), "amino acids",
    end="\n\n",
)
print("Protein IsoElectric Point is", round(iso_protein, 1), end="\n\n")
print("These amino acids are in the protein", end="\n\n")
print(AA_Count)

Protein MW is 28615.9 da and there is 252 amino acids

Protein IsoElectric Point is 5.9

These amino acids are in the protein

{'A': 8, 'C': 2, 'D': 18, 'E': 17, 'F': 13, 'G': 23, 'H': 15, 'I': 12, 'K': 20, 'L': 22, 'M': 6, 'N': 14, 'P': 10, 'Q': 9, 'R': 6, 'S': 10, 'T': 16, 'V': 18, 'W': 1, 'Y': 12}


### How does eGFP change once we remove the 6x His tag by performing a TEV cleavage step?

In [99]:
# TEV protease recognition sequence is ENLYFQ|G; TEV cleaves between Q and G.
# Cut at the recognition site itself instead of deleting the literal tag text:
# str.replace("GHHHHHH", "") would strip every "GHHHHHH" anywhere in the protein
# and would silently do nothing if the residues after the site ever changed
# (e.g. a longer His tag or an added linker).
TEV_SITE = "ENLYFQG"

# The tag is C-terminal, so cut at the last occurrence of the site.
tev_site_start = str_aa.rfind(TEV_SITE)
if tev_site_start == -1:
    # Fail loudly rather than report "after cleavage" numbers for an uncut protein.
    raise ValueError("TEV recognition site " + TEV_SITE + " not found in the protein sequence")

# Keep everything up to and including the Q; the trailing G leaves with the tag.
protein_sequence_after_TEV = str_aa[:tev_site_start + len(TEV_SITE) - 1]

# Recompute the same properties as for the full-length protein, on the cleaved product.
analysed_seq_after_TEV = ProteinAnalysis(protein_sequence_after_TEV)
molecular_weight_protein_after_TEV = analysed_seq_after_TEV.molecular_weight()
iso_protein_after_TEV = analysed_seq_after_TEV.isoelectric_point()
AA_Count_after_TEV = analysed_seq_after_TEV.count_amino_acids()

print("New protein sequence is")
print()
print(protein_sequence_after_TEV)
print()
print("Protein MW is", round(molecular_weight_protein_after_TEV, 1), "da", "and there is", len(protein_sequence_after_TEV), "amino acids")
print()
print("Protein IsoElectric Point is", round(iso_protein_after_TEV, 1))
print()
print("These amino acids are in the protein")
print()
print(AA_Count_after_TEV)

New protein sequence is

MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYKENLYFQ

Protein MW is 27736.0 da and there is 245 amino acids

Protein IsoElectric Point is 5.5

These amino acids are in the protein

{'A': 8, 'C': 2, 'D': 18, 'E': 17, 'F': 13, 'G': 22, 'H': 9, 'I': 12, 'K': 20, 'L': 22, 'M': 6, 'N': 14, 'P': 10, 'Q': 9, 'R': 6, 'S': 10, 'T': 16, 'V': 18, 'W': 1, 'Y': 12}


### After TEV cleavage, eGFP's MW and pI changed due to the removal of GHHHHHH: the MW dropped from 28615.9 Da to 27736.0 Da and the pI from 5.9 to 5.5.