In [1]:
# GC content is the percentage of nitrogenous bases in a DNA or RNA molecule that are either G or C
# Usefulness:
# - in PCR experiments, the GC content of short oligonucleotides known as primers is used to predict their annealing temperature to the template DNA
# - DNA with low GC content is less stable than DNA with high GC content
# - High GC content DNA can make it difficult to perform PCR amplification due to difficulty in designing a primer long enough to provide great specificity

In [2]:
# AT content is the percentage of nitrogenous bases in a DNA/RNA molecule that are either A or T
# - an AT base pair forms only 2 hydrogen bonds (weaker than a GC pair, which forms 3)

In [3]:
from Bio.SeqUtils import GC

In [13]:
from Bio.Seq import Seq

In [14]:
dna_seq = Seq('ATGATCTCGTAA')

In [15]:
dna_seq

Seq('ATGATCTCGTAA')

In [16]:
GC(dna_seq)

33.333333333333336

In [18]:
# Method 2
# Custom Fxn to get GC count
dna_seq.count('G')

2

In [23]:
def gc_content(seq):
    """Return the GC content of ``seq`` as a percentage (0-100).

    Only upper-case ``'G'`` and ``'C'`` are counted; matching is
    case-sensitive, so lower-case bases contribute nothing.

    Args:
        seq: A string or sequence-like object supporting ``count()`` and
            ``len()``.

    Returns:
        float: Percentage of positions that are G or C, or ``0.0`` when
        ``seq`` is empty.
    """
    length = len(seq)
    if length == 0:
        # An empty sequence has no bases; avoid ZeroDivisionError.
        return 0.0
    # Same operation order as before (divide, then scale) so float results
    # are bit-for-bit unchanged for non-empty input.
    result = float(seq.count('G') + seq.count('C'))/length * 100
    return result

In [24]:
gc_content(dna_seq)

33.33333333333333

In [26]:
# Method 3
def gc_content2(seq):
    """Return the GC content of ``seq`` as a percentage (0-100).

    Iterates over the sequence and counts the elements found in ``'GC'``.
    Matching is case-sensitive: lower-case input yields ``0.0``.

    Args:
        seq: An iterable of single-character bases supporting ``len()``.

    Returns:
        float: Percentage of positions that are G or C, or ``0.0`` when
        ``seq`` is empty.
    """
    length = len(seq)
    if length == 0:
        # An empty sequence has no bases; avoid ZeroDivisionError.
        return 0.0
    # Count matches lazily instead of materialising a throwaway list.
    gc = sum(1 for B in seq if B in 'GC')
    result = float(gc)/length * 100
    return result

In [28]:
gc_content2(dna_seq)

33.33333333333333

In [29]:
dna_seq.lower()

Seq('atgatctcgtaa')

In [31]:
gc_content2(dna_seq.lower())

0.0

In [35]:
# Method 4
def gc_content3(seq):
    """Return the GC content of ``seq`` as a percentage (0-100), ignoring case.

    The sequence is upper-cased before counting, so ``'g'``/``'c'`` are
    treated the same as ``'G'``/``'C'``.

    Args:
        seq: A string or sequence-like object supporting ``upper()`` and
            ``len()``.

    Returns:
        float: Percentage of positions that are G or C, or ``0.0`` when
        ``seq`` is empty.
    """
    length = len(seq)
    if length == 0:
        # An empty sequence has no bases; avoid ZeroDivisionError.
        return 0.0
    # Normalise case once, then count matches without building a list.
    gc = sum(1 for B in seq.upper() if B in 'GC')
    result = float(gc)/length * 100
    return result

In [36]:
gc_content3('atgatctcgtaa')

33.33333333333333

In [38]:
### FXN for AT content
def at_content(seq):
    """Return the AT content of ``seq`` as a percentage (0-100).

    Only upper-case ``'A'`` and ``'T'`` are counted; matching is
    case-sensitive.

    Args:
        seq: A string or sequence-like object supporting ``count()`` and
            ``len()``.

    Returns:
        float: Percentage of positions that are A or T, or ``0.0`` when
        ``seq`` is empty.
    """
    length = len(seq)
    if length == 0:
        # An empty sequence has no bases; avoid ZeroDivisionError.
        return 0.0
    # Same operation order as before (divide, then scale) so float results
    # are bit-for-bit unchanged for non-empty input.
    result = float(seq.count('A') + seq.count('T'))/length * 100
    return result

In [39]:
at_content(dna_seq)

66.66666666666666

In [40]:
## Melting point of DNA
# higher GC content means a higher melting point
# Tm_Wallace: 'Rule of thumb'
# Tm_GC: Empirical formulas based on GC content. Salt and mismatch corrections can be included.
# Tm_NN: Calculation based on nearest neighbor thermodynamics. Several tables for DNA/DNA, DNA/RNA, RNA/RNA hybridizations are included. Corrections for mismatches, dangling ends, salt concentrations and other additives are available.

In [41]:
import Bio.SeqUtils

In [42]:
dir(Bio.SeqUtils)

['GC',
 'GC123',
 'GC_skew',
 'IUPACData',
 'Seq',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'cos',
 'molecular_weight',
 'nt_search',
 'pi',
 're',
 'seq1',
 'seq3',
 'sin',
 'six_frame_translations',
 'xGC_skew']

In [44]:
from Bio.SeqUtils import MeltingTemp as mt

In [45]:
dna_seq

Seq('ATGATCTCGTAA')

In [46]:
GC(dna_seq)

33.333333333333336

In [47]:
# check for the melting point using wallace
mt.Tm_Wallace(dna_seq)

32.0

In [49]:
# Checking for the melting point using GC content
mt.Tm_GC(dna_seq)
# Tm is the temperature at which half of the DNA duplexes have melted (separated into single strands)

23.32155893208184

In [None]:
## Exercise
# Which of the following sequences will have the highest GC content?
# x1 = 'ATGCATGGTGCGCGA'
# x2 = 'ATTTGTGCTCCTGGA'

In [55]:
def get_metrics(seq):
    """Summarise a sequence as a single formatted string.

    Calls ``GC(seq)``, ``at_content(seq)`` and ``mt.Tm_GC(seq)`` (in that
    order) and interpolates the three values into a
    ``"GC:... ,AT:..., Temp:..."`` string.

    Args:
        seq: The sequence passed unchanged to each of the three helpers.

    Returns:
        str: The formatted summary.
    """
    template = "GC:{} ,AT:{}, Temp:{}"
    # Arguments are evaluated left to right, matching the original call order.
    return template.format(GC(seq), at_content(seq), mt.Tm_GC(seq))

"{}".format(a)

f"{a}"

"%s" % a

In [51]:
dna1 = Seq('ATGCATGGTGCGCGA')

In [52]:
dna2 = Seq('ATTTGTGCTCCTGGA')

In [53]:
mt.Tm_GC(dna1)

44.254892265415165

In [54]:
mt.Tm_GC(dna2)

38.7882255987485

In [56]:
get_metrics(dna1)

'GC:60.0 ,AT:40.0, Temp:44.254892265415165'

In [57]:
get_metrics(dna2)

'GC:46.666666666666664 ,AT:53.333333333333336, Temp:38.7882255987485'

In [62]:
#### GC skew
# - checks whether the nucleotides G and C are over- or under-abundant in a particular region of DNA or RNA
# - helps to indicate the DNA lagging strand or leading strand
# - GC skew pos = leading
# - GC skew neg = lagging


In [63]:
from Bio.SeqUtils import GC123,GC_skew,xGC_skew

In [64]:
# GC count first, second, third position
dna_seq

Seq('ATGATCTCGTAA')

In [65]:
GC123(dna_seq)

(33.333333333333336, 0.0, 25.0, 75.0)

In [66]:
GC(dna_seq)

33.333333333333336

In [69]:
# GC_Skew
GC_skew(dna_seq,10)

[0.0, 0.0]

In [70]:
GC_skew('ATGGGGTCCCCGTC')

[0.0]

In [71]:
xGC_skew(dna_seq)

In [None]:
#Subsequences
#- search for a DNA subseq in a seq, return list of [subseq, positions]

In [74]:
from Bio.SeqUtils import nt_search

In [83]:
main_seq = Seq('ACTATT')
subseq = Seq('ATT')

In [84]:
nt_search(str(main_seq),str(subseq))

['ATT', 3]