In [4]:
%matplotlib inline
import collections
from Bio.Blast import NCBIWWW
from Bio import SeqIO

from pprint import pprint
import matplotlib.pyplot as plt

## Module 1: DNA Sequencing, strings and matching

In [5]:
fp = "./files/lambda_virus.fa"
records = list(SeqIO.parse(fp, "fasta"))

In [6]:
def naive(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences


def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    t = ''
    for base in s:
        t = complement[base] + t
    return t

def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome

def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


In [2]:
# Preparation functions
"""First, implement a version of the naive exact matching algorithm that is strand-aware. 
That is, instead of looking only for occurrences of P in T, 
additionally look for occurrences of thereverse complement of P in T. 

If P is ACT, your function should find occurrences of both ACT and its reverse complement AGT in T.

If P and its reverse complement are identical (e.g. AACGTT), 
then a given match offset should be reported only once. 

So if your new function is called naive_with_rc, then the old naivefunction and 
your new naive_with_rc function should return the same results when P equals its reverse complement.
"""

pass

In [13]:
record = records[0]

In [14]:
record

SeqRecord(seq=Seq('GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCG...ACG', SingleLetterAlphabet()), id='gi|9626243|ref|NC_001416.1|', name='gi|9626243|ref|NC_001416.1|', description='gi|9626243|ref|NC_001416.1| Enterobacteria phage lambda, complete genome', dbxrefs=[])

In [27]:
seq = record.seq
seq

Seq('GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCG...ACG', SingleLetterAlphabet())

In [28]:
r_seq = record.reverse_complement().seq
r_seq

Seq('CGTAACCTGTCGGATCACCGGAAAGGACCCGTAAAGTGATAATGATTATCATCT...CCC', SingleLetterAlphabet())

In [39]:
# 1.
print(seq.count("AGGT"))
print(r_seq.count("AGGT"))
seq.count("AGGT") + r_seq.count("AGGT")

150
150


306

In [37]:
# 3.
print(seq.count("TTAA"))
print(r_seq.count("TTAA"))

195
195


In [44]:
# 3.
print(seq.find("ACTAAGT"))
# print(r_seq.find("ACTAAGT"))
print(seq.find("ACTTAGT"))

27733
26028


In [47]:
# 4.
print(seq.find("AGTCGA"))
print(seq.find("TCGACT"))

18005
450


In [63]:
# 5.
def naive_2mm(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        mismatches = 0
        
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                mismatches += 1
                
            if mismatches > 2:
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences

p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences) # [10, 24, 38]

occurrences = naive_2mm("TTCAAGCC", seq)
print(len(occurrences))

[10, 24, 38]
191


In [65]:
# 6.
occurrences = naive_2mm("AGGAGGTT", seq)
occurrences[0]

49

In [69]:
# 7.
fp = "./files/ERR037900_1.first1000.fastq"
records = list(SeqIO.parse(fp, "fasta"))
records

[SeqRecord(seq=Seq('@ERR037900.47509.7.42.5729.198593/1TAGGGTAGGGTTAGGGTTA...AT+', SingleLetterAlphabet()), id='==>=7DFFDHHHHHHGHHHEFDCA55445FECDA=<=<>8EEGDD7A4D44-5/><:7=:DAA####################################', name='==>=7DFFDHHHHHHGHHHEFDCA55445FECDA=<=<>8EEGDD7A4D44-5/><:7=:DAA####################################', description='==>=7DFFDHHHHHHGHHHEFDCA55445FECDA=<=<>8EEGDD7A4D44-5/><:7=:DAA####################################', dbxrefs=[]),
 SeqRecord(seq=Seq('@ERR037900.180509.8.1.1903.189882/1TTAGGGTTAGGGTTAGGGT...CG+', SingleLetterAlphabet()), id='C@C@DADFFFGGGG<DDDDC<CC@>>4<=5544535541@===58@A>8C??9;5445555135/5544@><94<:=@4A###################', name='C@C@DADFFFGGGG<DDDDC<CC@>>4<=5544535541@===58@A>8C??9;5445555135/5544@><94<:=@4A###################', description='C@C@DADFFFGGGG<DDDDC<CC@>>4<=5544535541@===58@A>8C??9;5445555135/5544@><94<:=@4A###################', dbxrefs=[]),
 SeqRecord(seq=Seq('@ERR037900.418509.2.27.12207.142740/1CTAACCCTAACCCTAAC...GT+', SingleLetterAlpha

In [73]:
print(len(records))


4
