In [1]:
from Bio.Seq import Seq
from Bio.SeqUtils import GC

In [2]:
dna_sequence = Seq("ATCGATCGATCAGTCAGCTGA")

# Get the frequency of GC and AT in a sequence

In [3]:
def AT(seq):
    """Return the AT content of ``seq`` as a percentage (0-100).

    The value is the complement of ``GC(seq)``, so every symbol that ``GC``
    does not count towards G/C is attributed to AT here.

    Parameters
    ----------
    seq : str or Bio.Seq.Seq
        Nucleotide sequence; must support ``len()`` and be accepted by ``GC``.

    Returns
    -------
    float
        ``100 - GC(seq)``, or ``0.0`` when ``seq`` is empty.
    """
    # An empty sequence contains no bases at all. GC() reports 0 for it
    # (per the Biopython docs), so without this guard the complement would
    # be reported as a misleading 100 % AT.
    if len(seq) == 0:
        return 0.0
    return 100 - GC(seq)

In [4]:
GC(dna_sequence)

47.61904761904762

In [5]:
AT(dna_sequence)

52.38095238095238

In [6]:
GC(dna_sequence) + AT(dna_sequence)

100.0

# Finding the melting temperature for annealing a DNA sequence
 **Higher GC content means higher melting point**
  * Tm_Wallace  -> "Rule of thumb"
  * Tm_GC       -> Calculation based on GC content. Salt and mismatch corrections can be included.
  * Tm_NN       -> Calculation based on nearest neighbor thermodynamics. Several tables for DNA/DNA, DNA/RNA and RNA/RNA hybridizations are included. Corrections for mismatches, dangling ends, salt concentration and other additives are available.

In [7]:
from Bio.SeqUtils import MeltingTemp as mt

In [8]:
mt.Tm_Wallace(dna_sequence)

62.0

In [9]:
mt.Tm_GC(dna_sequence)

50.60727321779612

# Exercise
 **Which of the sequences below has the higher GC content?**
  * ex1 = ATGCATGGTGCGCGA
  * ex2 = ATTTGTGCTCCTGGA

In [10]:
seq1 = Seq("ATGCATGGTGCGCGA")
seq2 = Seq("ATTTGTGCTCCTGGA")

In [11]:
def check_higher_GC(sequence1, sequence2):
    """Report which of two sequences has the higher GC content.

    Parameters
    ----------
    sequence1, sequence2 : str or Bio.Seq.Seq
        Nucleotide sequences to compare; each is passed to ``GC``.

    Returns
    -------
    str
        A message naming the sequence with the higher GC percentage, or a
        message stating that both sequences have the same GC content.
    """
    # Compute each percentage once instead of re-evaluating GC() in every
    # branch of the comparison.
    gc_first = GC(sequence1)
    gc_second = GC(sequence2)

    if gc_first > gc_second:
        return f"{sequence1} has higher GC content"
    if gc_first == gc_second:
        # Plain literal: the message has no placeholders, so no f-prefix.
        # Text is kept verbatim in case callers compare against it.
        return "Sequences has the same GC content"
    return f"{sequence2} has higher GC content"

In [12]:
check_higher_GC(seq1, seq2)

'ATGCATGGTGCGCGA has higher GC content'

In [13]:
def get_metrics(seq):
    """Summarise a sequence as a single human-readable line.

    The line contains the sequence itself, its GC percentage (``GC``), its
    AT percentage (``AT``) and the melting temperature from ``mt.Tm_GC``.

    Parameters
    ----------
    seq : str or Bio.Seq.Seq
        Nucleotide sequence to describe.

    Returns
    -------
    str
        Text of the form ``"seq:<seq>: GC:<gc>, AT:<at>, temp:<tm>"``.
    """
    # Adjacent f-string pieces are concatenated by Python into one string;
    # the metrics are evaluated left to right (GC, then AT, then Tm_GC).
    return (
        f"seq:{seq}: "
        f"GC:{GC(seq)}, "
        f"AT:{AT(seq)}, "
        f"temp:{mt.Tm_GC(seq)}"
    )

In [14]:
get_metrics(seq1)

'seq:ATGCATGGTGCGCGA: GC:60.0, AT:40.0, temp:44.254892265415165'

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [15]:
get_metrics(seq2)

'seq:ATTTGTGCTCCTGGA: GC:46.666666666666664, AT:53.333333333333336, temp:38.7882255987485'

## GC Skew

* Check whether nucleotides are over- or under-abundant in a particular region of a DNA or RNA sequence
* Helps to indicate whether a DNA strand is the leading or the lagging strand
* GC Skew pos(+) = leading
* GC Skew neg(-) = lagging
* GC Skew == 0   = no lagging or leading

In [16]:
from Bio.SeqUtils import GC123, GC_skew, xGC_skew

In [17]:
dna_sequence = Seq("ATGATCTCGTAA")

In [18]:
# GC123 reports the GC content of the whole sequence and of the first,
# second and third position of every codon in the sequence.
# The result is a 4-tuple: (overall GC %, GC % at codon position 1,
# GC % at codon position 2, GC % at codon position 3).
GC123(dna_sequence)

(33.333333333333336, 0.0, 25.0, 75.0)

In [19]:
# GC skew = (G - C) / (G + C). The result is a list of skew values; this
# short sequence yields a single value (presumably one per window - confirm
# against the Biopython docs). It has as many G's as C's, so the skew is 0.
GC_skew(dna_sequence)

[0.0]

In [20]:
GC_skew("ATGGGCCTAGGCCCGGAGGGGG")

[0.4117647058823529]

In [21]:
# GC_skew ignores ambiguous nucleotides such as "S" (G or C): they are
# counted neither as G nor as C, so replacing three G's with S's lowers the
# skew compared with the previous cell.
GC_skew("ATGGGCCTAGGCCCGGAGGSSS") # 3 S's in place of G's

[0.2857142857142857]

In [24]:
# Plot the GC skew of the sequence.
# NOTE(review): in the recorded run this call raised
# ModuleNotFoundError: No module named 'Tkinter', so no plot was produced.
# It presumably needs Tk support (and possibly a newer Biopython that
# imports the Python 3 `tkinter` name) - confirm the environment.
xGC_skew("ATGGGCCTAGGCCCGGAGGGGG")


ModuleNotFoundError: No module named 'Tkinter'