3.1 Sequences act like strings

In [2]:
from Bio.Seq import Seq

my_seq = Seq("GATCG")

# A Seq can be iterated like a str. enumerate() yields (position, letter)
# tuples, and each tuple is handed straight to the "%i %s" template.
for pair in enumerate(my_seq):
    print("%i %s" % pair)

0 G
1 A
2 T
3 C
4 G


In [3]:
# len() works on a Seq as it does on a str: it reports the number of letters.
print(len(my_seq))

5


In [4]:
# Integer indexing is 0-based, so position 0 is the first letter.
print(my_seq[0])

G


In [5]:
# Position 2 is the third letter.
print(my_seq[2])

T


In [6]:
# A negative index counts from the end: -1 is the last letter.
print(my_seq[-1])

G


In [7]:
from Bio.Seq import Seq
# Plain str.count() counts non-overlapping matches, so "AA" occurs twice in
# "AAAA" (an overlapping count would give 3).
"AAAA".count("AA")

2

In [8]:
# Seq.count() gives the same non-overlapping count as str.count().
Seq("AAAA").count("AA")

2

In [10]:
from Bio.Seq import Seq
# Longer example sequence used by the following counting cells.
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
# Sequence length in letters.
len(my_seq)

32

In [11]:
# Number of "G" letters in the sequence.
my_seq.count("G")

9

In [12]:
# GC percentage computed by hand: add up the upper-case "G" and "C" counts,
# then scale by the sequence length.
gc_letters = sum(my_seq.count(base) for base in ("G", "C"))
100 * gc_letters / len(my_seq)

46.875

In [13]:
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
# gc_fraction() reports GC content as a fraction (0.46875 here), not as a
# percentage like the manual calculation.
gc_fraction(my_seq)

0.46875

3.2 Slicing a sequence

In [14]:
from Bio.Seq import Seq
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
# Slicing follows Python conventions: positions 4 up to (not including) 12.
# The result is a new Seq, not a str.
my_seq[4:12]

Seq('GATGGGCC')

In [15]:
# Every third letter, starting from the first position.
my_seq[::3]

Seq('GCTGTAGTAAG')

In [16]:
# Every third letter, starting from position 1.
my_seq[1::3]

Seq('AGGCATGCATC')

In [17]:
# Every third letter, starting from position 2.
my_seq[2::3]

Seq('TAGCTAAGAC')

In [18]:
# A step of -1 walks the sequence backwards, giving the reversed Seq.
my_seq[::-1]

Seq('CGCTAAAAGCTAGGATATATCCGGGTAGCTAG')

3.3 Turning Seq objects into strings

In [19]:
# str() yields the sequence letters as a plain Python string.
str(my_seq)

'GATCGATGGGCCTATATAGGATCGAAAATCGC'

In [20]:
# print() shows just the letters, without the Seq(...) wrapper of the repr.
print(my_seq)

GATCGATGGGCCTATATAGGATCGAAAATCGC


In [21]:
# The %s placeholder inserts the sequence letters, giving a minimal
# FASTA-style record: a ">Name" header line followed by the sequence.
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

>Name
GATCGATGGGCCTATATAGGATCGAAAATCGC



3.4 Concatenating or adding sequences

In [22]:
from Bio.Seq import Seq
seq1 = Seq("ACGT")
seq2 = Seq("AACCGG")
# The + operator concatenates two Seq objects into a new Seq.
seq1 + seq2

Seq('ACGTAACCGG')

In [24]:
from Bio.Seq import Seq
protein_seq = Seq("EVRANAK")
dna_seq = Seq("ACGT")
# NOTE: nothing checks the kind of sequence here. A protein and a DNA
# sequence concatenate without any error (see the output).
protein_seq + dna_seq

Seq('EVRANAKACGT')

In [25]:
from Bio.Seq import Seq
list_of_seq = [Seq("ACGT"), Seq("AACC"), Seq("GGTT")]
# Fold the list with +, starting from an empty Seq so the result is a Seq.
concatenated = sum(list_of_seq, Seq(""))

concatenated

Seq('ACGTAACCGGTT')

In [28]:
from Bio.Seq import Seq
contigs = [Seq("ATG"), Seq("ATCCCG"), Seq("TTGCA")]
# Ten "N" letters used as the separator between contigs.
spacer = Seq("N" * 10)
# join() works like str.join(): the spacer goes between consecutive contigs.
spacer.join(contigs)

Seq('ATGNNNNNNNNNNATCCCGNNNNNNNNNNTTGCA')

3.5 Changing case

In [29]:
from Bio.Seq import Seq
# Mixed-case sequence for the case-conversion examples.
dna_seq = Seq("acgtACGT")
dna_seq

Seq('acgtACGT')

In [30]:
# upper() returns a new upper-case Seq; dna_seq itself is not modified.
dna_seq.upper()

Seq('ACGTACGT')

In [32]:
# lower() returns a new lower-case Seq; dna_seq itself is not modified.
dna_seq.lower()

Seq('acgtacgt')

In [33]:
# Membership tests are case-sensitive: "GTAC" is not found in "acgtACGT".
"GTAC" in dna_seq

False

In [34]:
# Upper-cased, the sequence reads "ACGTACGT", which does contain "GTAC".
"GTAC" in dna_seq.upper()

True

3.6 Nucleotide sequences and (reverse) complements

In [35]:
from Bio.Seq import Seq
# Example sequence for the complement / reverse-complement cells.
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
my_seq

Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC')

In [36]:
# complement() swaps each base for its pairing partner (A<->T, C<->G) and
# keeps the letter order unchanged.
my_seq.complement()

Seq('CTAGCTACCCGGATATATCCTAGCTTTTAGCG')

In [37]:
# reverse_complement() complements the sequence and reverses it in one call.
my_seq.reverse_complement()

Seq('GCGATTTTCGATCCTATATAGGCCCATCGATC')

In [38]:
# Plain reversal without complementing, for comparison.
my_seq[::-1]

Seq('CGCTAAAAGCTAGGATATATCCGGGTAGCTAG')

In [39]:
from Bio.Seq import Seq
protein_seq = Seq("EVRNAK")
# NOTE(review): complement() does not reject protein letters. The call
# succeeds and returns Seq('EBYNTM'), which presumably has no biological
# meaning.
protein_seq.complement()

Seq('EBYNTM')

3.7 Transcription

In [40]:
from Bio.Seq import Seq
# Coding-strand DNA used throughout the transcription examples.
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

In [41]:
# The template strand is obtained as the reverse complement of the coding strand.
template_dna = coding_dna.reverse_complement()
template_dna

Seq('CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT')

In [42]:
# coding_dna is unchanged by reverse_complement(), which returned a new Seq.
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

In [43]:
# transcribe() turns coding-strand DNA into RNA by replacing T with U.
# NOTE(review): `messager_rna` looks like a misspelling of "messenger"; it is
# left as is because later cells use the same name.
messager_rna = coding_dna.transcribe()
messager_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

In [44]:
# Starting from the template strand: reverse-complement back to the coding
# strand, then transcribe. The result is the same RNA as before.
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

In [45]:
from Bio.Seq import Seq
# The same RNA sequence, this time written out directly.
messager_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
messager_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

In [46]:
# back_transcribe() converts RNA to coding-strand DNA by replacing U with T.
messager_rna.back_transcribe()

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

3.8 Translation

In [47]:
from Bio.Seq import Seq
# RNA sequence used as input for the translation examples.
messager_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
messager_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

In [48]:
# translate() turns the RNA into a protein Seq; "*" marks each stop codon,
# including the internal one.
messager_rna.translate()

Seq('MAIVMGR*KGAR*')

In [49]:
from Bio.Seq import Seq
# The coding-strand DNA version of the same sequence.
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

In [50]:
# DNA can be translated directly; the result equals the RNA translation.
coding_dna.translate()

Seq('MAIVMGR*KGAR*')

In [51]:
# Choose a genetic code by name. With this table the internal TGA codon is
# read as W instead of a stop.
coding_dna.translate(table="Vertebrate Mitochondrial")

Seq('MAIVMGRWKGAR*')

In [52]:
# The same table chosen by its numeric id (table 2 is Vertebrate Mitochondrial).
coding_dna.translate(table=2)

Seq('MAIVMGRWKGAR*')

In [53]:
# Default translation again, as a baseline for the to_stop example.
coding_dna.translate()

Seq('MAIVMGR*KGAR*')

In [54]:
# to_stop=True ends translation at the first in-frame stop codon; the stop
# symbol itself is not part of the result.
coding_dna.translate(to_stop=True)

Seq('MAIVMGR')

In [55]:
# Table 2 baseline: only the final codon is a stop under this table.
coding_dna.translate(table=2)

Seq('MAIVMGRWKGAR*')

In [56]:
# With table 2 the first stop is the last codon, so to_stop=True only drops
# the trailing "*".
coding_dna.translate(table=2, to_stop=True)

Seq('MAIVMGRWKGAR')

In [59]:
# stop_symbol replaces the default "*" used to mark stop codons.
coding_dna.translate(table=2, stop_symbol="@")

Seq('MAIVMGRWKGAR@')

In [60]:
from Bio.Seq import Seq
# Adjacent string literals are joined by Python into one long sequence.
gene = Seq(
    "GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA"
    "GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACAGATAGGCGATCGTGAT"
    "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT"
    "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT"
    "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA"
)
# The repr of a long Seq is abbreviated with "..." in the middle.
gene

Seq('GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCC...TAA')

In [62]:
# Translate with the table named "Bacterial"; print() shows the full protein.
# The leading GTG codon is reported as V and the final stop as "*".
print(gene.translate(table="Bacterial"))

VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHLHGPPPPPRHHKKAPHDHHGGHGPGKHHR*


In [64]:
# Same translation, stopping at the first stop codon, so the trailing "*" is dropped.
print(gene.translate(table="Bacterial", to_stop=True))

VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHLHGPPPPPRHHKKAPHDHHGGHGPGKHHR


In [66]:
# With cds=True the leading GTG is reported as M rather than V, and the
# result carries no trailing stop symbol (compare with the two previous outputs).
# NOTE(review): presumably cds=True also validates that the sequence is a
# complete coding sequence - confirm against the Biopython documentation.
gene.translate(table="Bacterial", cds=True)

Seq('MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR')

3.9 Translation Tables

In [67]:
from Bio.Data import CodonTable
# Look up codon tables by their names.
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]

In [74]:
from Bio.Data import CodonTable
# Alternative lookup by numeric id: 1 is the Standard table and 2 is
# Vertebrate Mitochondrial (see the table headers printed below).
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table = CodonTable.unambiguous_dna_by_id[2]

In [75]:
# print() renders the codon table as a text grid. Rows are labelled by the
# first base, columns by the second, and the right-hand column gives the third.
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

In [76]:
# Same grid for the mitochondrial table. The codons marked "(s)" are the ones
# listed by mito_table.start_codons.
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

In [77]:
# List of codons this table treats as stops.
mito_table.stop_codons

['TAA', 'TAG', 'AGA', 'AGG']

In [78]:
# List of codons this table accepts as start codons.
mito_table.start_codons

['ATT', 'ATC', 'ATA', 'ATG', 'GTG']

3.10 Comparing Seq objects

In [79]:
from Bio.Seq import Seq
seq1 = Seq("ACGT")
# A plain str on the left compares equal to a Seq holding the same letters.
"ACGT" == seq1

True

In [80]:
# The comparison also holds with the Seq on the left.
seq1 == "ACGT"

True

3.11 Sequences with unknown sequence contents

In [81]:
from Bio.Seq import Seq
# A sequence whose letters are unknown but whose length (10) is known.
unknown_seq = Seq(None, length=10)

In [82]:
# The repr shows None for the contents together with the known length.
unknown_seq

Seq(None, length=10)

In [83]:
# The length is available even though the letters are not.
len(unknown_seq)

10

In [84]:
from Bio.Seq import UndefinedSequenceError

# Printing needs the actual letters, which this Seq does not have, so
# print() raises UndefinedSequenceError. The error is the point of this cell:
# catch it and show its name and message so that Restart & Run All continues.
try:
    print(unknown_seq)
except UndefinedSequenceError as err:
    print("%s: %s" % (type(err).__name__, err))

UndefinedSequenceError: Sequence content is undefined

3.12 Sequences with partially defined sequence contents

In [85]:
from Bio.Seq import Seq
# Partially defined sequence: the dict maps a start position to the letters
# known from that position on; the rest, up to `length`, is undefined.
seq = Seq({117512683: "TTGAAAACCTGAATGTGAGAGTCAGTCAAGGATAGT"}, length=159345973)

In [86]:
# A slice lying entirely in the undefined region is itself undefined.
seq[1000:1020]

Seq(None, length=20)

In [87]:
# A slice entirely inside the known fragment is a fully defined Seq.
seq[117512690:117512700]

Seq('CCTGAATGTG')

In [88]:
# A slice straddling the start of the known fragment is partially defined:
# the known letters begin at offset 13 of the 20-letter slice.
seq[117512670:117512690]

Seq({13: 'TTGAAAA'}, length=20)

In [89]:
# Open-ended slice: the remainder of the known fragment at offset 0,
# followed by the undefined tail of the sequence.
seq[117512700:]

Seq({0: 'AGAGTCAGTCAAGGATAGT'}, length=41833273)

In [90]:
# Note: `seq` is rebound here to a short, fully defined sequence.
seq = Seq("ACGT")
undefined_seq = Seq(None, length=10)

# Concatenating defined and undefined pieces yields a partially defined Seq
# that records where each known stretch starts.
seq + undefined_seq + seq

Seq({0: 'ACGT', 14: 'ACGT'}, length=18)

3.13 MutableSeq objects

In [91]:
from Bio.Seq import Seq
# A regular Seq, used to show that it cannot be edited in place.
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")

In [92]:
# Seq objects are read-only, so item assignment raises TypeError. The error
# is the point of this cell: catch it and show its name and message so that
# Restart & Run All continues past this cell.
try:
    my_seq[5] = "G"
except TypeError as err:
    print("%s: %s" % (type(err).__name__, err))

TypeError: 'Seq' object does not support item assignment

In [93]:
from Bio.Seq import MutableSeq
# Build an editable MutableSeq from the existing Seq.
mutable_seq = MutableSeq(my_seq)
mutable_seq

MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [98]:
from Bio.Seq import MutableSeq
# A MutableSeq can also be created directly from a string.
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")

In [99]:
# Starting state, before the in-place edits in the next cells.
mutable_seq

MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [100]:
# Item assignment works on a MutableSeq: position 5 changes from T to C in place.
mutable_seq[5] = "C"
mutable_seq

MutableSeq('GCCATCGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [101]:
# remove() deletes the first "T" in place. Re-running this cell removes
# another one, so the cell is not idempotent.
mutable_seq.remove("T")
mutable_seq

MutableSeq('GCCACGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [102]:
# reverse() reverses the sequence in place. Re-running this cell flips it back.
mutable_seq.reverse()
mutable_seq

MutableSeq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG')

In [103]:
from Bio.Seq import Seq
# Convert the edited MutableSeq back into a regular Seq.
new_seq = Seq(mutable_seq)
new_seq

Seq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG')

3.14 Finding subsequences

In [104]:
from Bio.Seq import Seq, MutableSeq
seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")
# index() returns the 0-based start position of the subsequence.
seq.index("ATGGGCCGC")

9

In [106]:
# A bytes object is accepted as the subsequence too.
seq.index(b"ATGGGCCGC")

9

In [107]:
# So is a bytearray.
seq.index(bytearray(b"ATGGGCCGC"))

9

In [108]:
# And so is another Seq.
seq.index(Seq("ATGGGCCGC"))

9

In [109]:
# index() raises ValueError when the subsequence is absent. The error is the
# point of this cell: catch it and show its name and message so that
# Restart & Run All continues past this cell.
try:
    seq.index("ACTG")
except ValueError as err:
    print("%s: %s" % (type(err).__name__, err))

ValueError: subsection not found

In [110]:
# find() returns -1 instead of raising when there is no match.
seq.find("ACTG")

-1

In [112]:
# find() reports the leftmost match ("CC" first occurs at position 1).
seq.find("CC")

1

In [113]:
# rfind() reports the rightmost match ("CC" last occurs at position 29).
seq.rfind("CC")

29

In [114]:
# search() yields an (index, subsequence) pair for every occurrence of the
# given subsequences, in order of position; overlapping hits (28 and 29) are
# both reported. Listing "CC" twice does not produce duplicate hits (see output).
for index, sub in seq.search(["CC", "GGG", "CC"]):
    print(index, sub)

1 CC
11 GGG
14 CC
23 GGG
28 CC
29 CC


3.15 Working with strings directly

In [115]:
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate

# The module-level functions take a plain str and return a plain str.
my_string = "GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG"
reverse_complement(my_string)

'CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC'

In [116]:
# DNA string to RNA string (T replaced by U).
transcribe(my_string)

'GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG'

In [117]:
# my_string is DNA and has no U, so back_transcribe() returns it unchanged.
back_transcribe(my_string)

'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'

In [118]:
# Translate the DNA string to a protein string; "*" marks the stop codon.
translate(my_string)

'AVMGRWKGGRAAG*'