In [1]:
# Attach Google Drive to the Colab filesystem so the notebook can work under MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/

/content/drive/MyDrive


**2.1 Working with sequences**

Biopython is a set of libraries to provide the ability to deal with “things” of interest to biologists working on the computer. In general this means that you will need to have at least some programming experience (in Python, of course!) or at least an interest in learning to program. Biopython’s job is to make your job easier as a programmer by supplying reusable libraries so that you can focus on answering your specific question of interest, instead of focusing on the internals of parsing a particular file format.

In [5]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 32.1 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79


In [6]:
# Quick introduction to Biopython's mechanism for handling sequences: the Seq object.
from Bio.Seq import Seq

my_seq = Seq("ABCDEFGH")

In [7]:
# Printing a Seq shows just its letters.
print(my_seq)

ABCDEFGH


In [8]:
# List every attribute and method available on the Seq instance.
dir(my_seq)

['__abstractmethods__',
 '__add__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_data',
 'back_transcribe',
 'complement',
 'complement_rna',
 'count',
 'count_overlap',
 'encode',
 'endswith',
 'find',
 'index',
 'join',
 'lower',
 'lstrip',
 'replace',
 'reverse_complement',
 'reverse_complement_rna',
 'rfind',
 'rindex',
 'rsplit',
 'rstrip',
 'split',
 'startswith',
 'strip',
 'tomutable',
 'transcribe',
 'translate',
 'ungap',
 'upper']

In [9]:
# A Seq differs from a plain Python string in the methods it supports: complement()
# is not something a str can do.
# NOTE(review): 'ABCDEFGH' is not a real nucleotide sequence. Judging by the output,
# B/D/H are swapped with V/H/D (presumably as ambiguity codes) while E and F pass
# through unchanged; confirm this is the intended illustration.
my_seq.complement()

Seq('TVGHEFCD')

In [10]:
# Complement the sequence and reverse it in a single call.
my_seq.reverse_complement()

Seq('DCFEHGVT')

In [11]:
# Sequences act like strings: a Seq can be iterated and enumerated letter by letter.
from Bio.Seq import Seq

my_seq = Seq("GATCGFEG")
for index, letter in enumerate(my_seq):
    # One line per position: zero-based index followed by the letter.
    print("%i %s" % (index, letter))

0 G
1 A
2 T
3 C
4 G
5 F
6 E
7 G


In [12]:
# len() reports the number of letters in the sequence.
print(len(my_seq))

8


In [17]:
# Elements of the sequence are accessed the same way as for Python strings.
print(my_seq[0])  # first letter
print(my_seq[0:3])  # first three letters
print(my_seq[::-1])  # the whole sequence, reversed
print(my_seq[-1])  # last letter
print(my_seq[-2])  # second-to-last letter

G
GAT
GEFGCTAG
G
E


In [23]:
# Seq.count() works like str.count(), so it gives a non-overlapping count.
# Report the number of G, A and T letters in turn.
for base in ('G', 'A', 'T'):
    print(my_seq.count(base))

3
1
1


In [24]:
# GC percentage computed by hand: number of G and C letters over the sequence length.
gc_count = my_seq.count("G") + my_seq.count("C")
100 * float(gc_count) / len(my_seq)

50.0

In [25]:
# The manual snippet above works, but Bio.SeqUtils already provides GC functions,
# so the percentage can be obtained with a single call.
# NOTE(review): GC() appears to be deprecated in newer Biopython releases in favour of
# gc_fraction(); confirm before running on anything newer than the installed 1.79.
from Bio.Seq import Seq
from Bio.SeqUtils import GC
my_seq=Seq('GGGGGGGGGGGGGGGTCHHHHHTAGGGAAAAAA')
GC(my_seq)

57.57575757575758

In [27]:
# Turning a Seq object into a plain Python string.
str(my_seq)

'GGGGGGGGGGGGGGGTCHHHHHTAGGGAAAAAA'

In [29]:
# %-formatting a Seq inserts its letters, which is enough to hand-build a minimal
# FASTA record: a ">Name" header line followed by the sequence line.
fasta_template = ">Name\n%s\n"
fasta_format_string = fasta_template % my_seq
print(fasta_format_string)

>Name
GGGGGGGGGGGGGGGTCHHHHHTAGGGAAAAAA



In [30]:
# Concatenating (adding) sequences with "+".
# NOTE(review): this joins a protein-looking and a DNA-looking sequence; as the output
# shows, no check is made that both operands are the same kind of sequence.
from Bio.Seq import Seq
protein_seq = Seq("EVRNAK")
dna_seq = Seq("ACGT")
protein_seq + dna_seq

Seq('EVRNAKACGT')

In [36]:
# Build one sequence out of several by accumulating them in a loop,
# echoing each piece as it is appended.
from Bio.Seq import Seq

list_of_seqs = [Seq("ACGTF"), Seq("AACC"), Seq("GGTTPP")]
concatenated = Seq("")
for s in list_of_seqs:
    print(s)
    concatenated = concatenated + s

ACGTF
AACC
GGTTPP


In [37]:
# Display the sequence accumulated by the loop in the previous cell.
concatenated

Seq('ACGTFAACCGGTTPP')

In [39]:
# Seq.join() works like str.join(): the spacer is inserted between consecutive contigs.
# NOTE(review): the spacer is ten "P" letters, which is not a nucleotide code; a run of
# "N" may have been intended. Confirm.
from Bio.Seq import Seq
contigs = [Seq("ATG"), Seq("ATCCCG"), Seq("TTGCA")]
spacer = Seq("P"*10)
spacer.join(contigs)

Seq('ATGPPPPPPPPPPATCCCGPPPPPPPPPPTTGCA')

In [40]:
# Like Python strings, Seq objects offer upper() and lower() for changing the case.
# Show the original mixed-case sequence, then its upper- and lower-case forms.
from Bio.Seq import Seq

dna_seq = Seq("acgtACGT")
for variant in (dna_seq, dna_seq.upper(), dna_seq.lower()):
    print(variant)

acgtACGT
ACGTACGT
acgtacgt


In [41]:
# Check whether a sub-sequence occurs in the sequence using `in`.
# The match is case-sensitive, so the upper-case query is not found in the
# mixed-case "acgtACGT"; upper()/lower() are useful for case-insensitive matching.
"GTAC" in dna_seq


False

In [44]:
# After upper-casing the sequence, the same query is found.
"GTAC" in dna_seq.upper()

True

In [45]:
# "B" does not occur anywhere in the sequence, so this longer query is not found.
"GTACB" in dna_seq.upper()

False

**Nucleotide sequences and (reverse) complements**


In [48]:
# For nucleotide sequences, the complement and the reverse complement of a Seq
# object are available directly through its built-in methods.
# (A stray `Seq(...)` expression statement that had no effect was removed here.)
from Bio.Seq import Seq

my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
print('Sequence:', my_seq)
print('Complement:', my_seq.complement())
print('Reverse Complement:', my_seq.reverse_complement())

Sequence: GATCGATGGGCCTATATAGGATCGAAAATCGC
Complement: CTAGCTACCCGGATATATCCTAGCTTTTAGCG
Reverse Complement: GCGATTTTCGATCCTATATAGGCCCATCGATC


In [50]:
# A plain reversal (no complementing) is just a slice with step -1.
print('Reverse Sequence:',my_seq[::-1])

Reverse Sequence: CGCTAAAAGCTAGGATATATCCGGGTAGCTAG


**Transcription**

Before talking about transcription, I want to try to clarify the strand issue. Consider the following (made up) stretch of double stranded DNA which encodes a short peptide:

 
 	DNA coding strand (aka Crick strand, strand +1)	 
5’	ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG	3’
 	|||||||||||||||||||||||||||||||||||||||	 
3’	TACCGGTAACATTACCCGGCGACTTTCCCACGGGCTATC	5’
 	DNA template strand (aka Watson strand, strand −1)	 
 
 	|	 
 	Transcription	 
 	↓	 
 
5’	AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG	3’
 	Single stranded messenger RNA	 

In [52]:
# Getting down to a transcription in Biopython: first create Seq objects for the
# coding and template DNA strands.
from Bio.Seq import Seq

coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
print('DNA sequence:', coding_dna)
# The template strand is obtained as the reverse complement of the coding strand.
template_dna = coding_dna.reverse_complement()
# Label typo fixed ("Compelment" -> "Complement"); re-run the cell to refresh the output.
print('Reverse Complement:', template_dna)

DNA sequence: ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
Reverse Compelment: CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT


In [53]:
# Transcribe the coding strand into mRNA, then show both sequences for comparison.
messenger_rna = coding_dna.transcribe()
print('DNA:', coding_dna)
print('mRNA:', messenger_rna)

DNA: ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
mRNA: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [58]:
# Three routes to the same mRNA:
#   1. the mRNA already transcribed from the coding strand;
#   2. reverse-complement the template strand (recovering the coding strand), then transcribe;
#   3. reverse-complement the coding strand twice (a round trip), then transcribe.
print('mRNA1:',messenger_rna)
print('mRNA2:',template_dna.reverse_complement().transcribe())
print('mRNA3:',coding_dna.reverse_complement().reverse_complement().transcribe())

mRNA1: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
mRNA2: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
mRNA3: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [61]:
# The Seq object also has a back-transcription method that goes from the mRNA to the
# coding strand of the DNA. Again, this is a simple U -> T substitution.
from Bio.Seq import Seq

messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
recovered_dna = messenger_rna.back_transcribe()
print('mRNA:', messenger_rna)
print('back to DNA:', recovered_dna)
# Shown for comparison with the coding strand defined in the earlier cell.
print('Original DNA:', coding_dna)

mRNA: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
back to DNA: ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
Original DNA: ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG


**Translation**

Sticking with the same example discussed in the transcription section above, now let’s translate this mRNA into the corresponding protein sequence - again taking advantage of one of the Seq object’s biological methods:

In [64]:
dir(messenger_rna)  # list the attributes and methods available on the sequence object
# Especially useful are the public ones: 'back_transcribe',
# 'complement', 'complement_rna', 'count', 'count_overlap', 'encode', 'endswith', 'find', 'index', 'join',
# 'lower', 'lstrip', 'replace', 'reverse_complement', 'reverse_complement_rna', 'rfind', 'rindex', 'rsplit', 'rstrip', 'split',
# 'startswith', 'strip', 'tomutable', 'transcribe', 'translate', 'ungap', 'upper'

['__abstractmethods__',
 '__add__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_data',
 'back_transcribe',
 'complement',
 'complement_rna',
 'count',
 'count_overlap',
 'encode',
 'endswith',
 'find',
 'index',
 'join',
 'lower',
 'lstrip',
 'replace',
 'reverse_complement',
 'reverse_complement_rna',
 'rfind',
 'rindex',
 'rsplit',
 'rstrip',
 'split',
 'startswith',
 'strip',
 'tomutable',
 'transcribe',
 'translate',
 'ungap',
 'upper']

In [62]:
# Translate the mRNA into its protein sequence and show both.
# In the printed protein, '*' marks the positions where translation hit a stop codon.
from Bio.Seq import Seq

messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
protein = messenger_rna.translate()
print(messenger_rna)
print(protein)

AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
MAIVMGR*KGAR*


In [66]:
# Working directly with strings: Bio.Seq also exposes these operations as
# module-level functions that accept a plain Python string.
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate

my_string = "GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG"
# Apply each function to the same DNA string, in order. back_transcribe() returns
# the input unchanged here because the string contains no U.
for convert in (reverse_complement, transcribe, back_transcribe, translate):
    print(convert(my_string))


CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC
GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG
GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG
AVMGRWKGGRAAG*
