### Material sources
#### http://betatim.github.io/posts/genome-hackers/


#### http://rosalind.info/problems/
#### https://www.pnas.org/content/98/17/9748

#### Transcribing DNA into RNA

In [None]:
import re

# Sample DNA strand and the RNA string that transcribing it is expected to give.
dna = 'GATGGAACTTGACTACGTAAATT'
rna = 'GAUGGAACUUGACUACGUAAAUU'

def transcribe(dna):
    """Return the RNA transcript of a DNA string.

    Every thymine, written as 'T' or 't', becomes an upper-case 'U'; all other
    characters are copied through unchanged.
    """
    thymine_to_uracil = str.maketrans({'T': 'U', 't': 'U'})
    return dna.translate(thymine_to_uracil)

# Prints True when the transcription of the sample matches the expected RNA.
print(transcribe(dna) == rna)

#### Complementing a Strand of DNA

In [None]:
# Sample strand and the reverse complement it is expected to produce.
dna = 'AAAACCCGGT'
compl = 'ACCGGGTTTT'
# Pairing partner of each upper-case DNA base (A<->T, G<->C).
sub_dict = dict(zip('AGTC', 'TCAG'))


def complementary_dna(dna):
    """Return the reverse complement of *dna*, in upper case.

    The input is upper-cased, read back to front, and each base is swapped
    for its partner from ``sub_dict``.  A character that is not A, C, G or T
    raises KeyError.
    """
    backwards = reversed(dna.upper())
    return ''.join(sub_dict[base] for base in backwards)
    
# Prints True when the computed reverse complement matches the expected one.
print(complementary_dna(dna) == compl)

#### Finding a Motif in DNA

In [None]:
import re
# Text to search (s) and the motif to look for (t).
s = 'GATATATATGCATATACTT'
t = 'ATAT'

def find_overlapping(s, t):
    """Print the span of every occurrence of *t* in *s*, overlaps included.

    *t* is placed into the regular expression unescaped, so any regex
    metacharacters in it keep their special meaning.  One line is printed per
    occurrence: the 0-based start index followed by the exclusive end index.
    Nothing is returned.
    """
    # A lookahead consumes no characters, so the scan moves on one position
    # at a time and occurrences that overlap each other are all found.
    lookahead = re.compile(r'(?=({}))'.format(t))
    for hit in lookahead.finditer(s):
        begin, end = hit.span(1)
        print(begin, end)

# Print the start/end indices of each occurrence of the motif in the sample.
find_overlapping(s, t)

#### Overlap graph

In [None]:
def read_fasta(fasta_file):
    """Parse FASTA records into two lookup tables.

    Args:
        fasta_file: either a path to a FASTA file or the FASTA text itself.
            The value is first tried as a path; if it cannot be opened it is
            parsed as text.

    Returns:
        A tuple ``(seq_sid, sid_seq)``.  ``seq_sid`` maps each sequence to its
        header line and ``sid_seq`` maps each header line to its sequence.
        When a sequence or a header occurs more than once, the first
        occurrence is kept.
    """
    try:
        with open(fasta_file, 'rt') as f:
            fasta_text = f.read()
    except (OSError, ValueError):
        # OSError: no such file, name too long, ... -- the normal outcome when
        # raw FASTA text is passed in.  ValueError: e.g. an embedded NUL byte.
        fasta_text = fasta_file

    seq_sid = {}
    sid_seq = {}
    # Everything before the first '>' is not a record and is ignored.
    for record in fasta_text.split('>')[1:]:
        lines = record.splitlines()
        if not lines:
            # A stray '>' with nothing after it.
            continue
        sid = lines[0].strip()
        # A sequence may be wrapped over several lines, and the last record
        # may lack a trailing newline; joining the remaining lines covers both.
        seq = ''.join(line.strip() for line in lines[1:])
        seq_sid.setdefault(seq, sid)
        sid_seq.setdefault(sid, seq)
    return seq_sid, sid_seq

def overlap_suffix(x, y, l):
    """Return the length of the longest suffix of *x* that is a prefix of *y*.

    Args:
        x: string whose end is examined.
        y: string whose start is examined.
        l: minimum overlap length to accept; values below 1 are treated as 1.

    Returns:
        The overlap length, or 0 when no overlap of at least *l* characters
        exists.  Identical strings overlap over their full length.
    """
    shortest_allowed = max(l, 1)
    longest_possible = min(len(x), len(y))
    # Try the longest candidate first so that the first hit is the answer.
    for size in range(longest_possible, shortest_allowed - 1, -1):
        if x.endswith(y[:size]):
            return size
    return 0

def find_overlap(records, overlap):
    """List the dominant overlap direction for every pair of sequences.

    *records* maps each sequence to its identifier.  Each unordered pair is
    measured in both directions with ``overlap_suffix``; the direction with
    the strictly larger value is reported as ``[from_id, to_id, length]``,
    and only when that length is strictly greater than *overlap*.  Pairs
    whose two directions tie are left out.
    """
    sequences = list(records)
    found = []
    for position, first in enumerate(sequences):
        for second in sequences[position + 1:]:
            forward = overlap_suffix(first, second, overlap)
            backward = overlap_suffix(second, first, overlap)
            if forward == backward:
                # No dominant direction for this pair.
                continue
            if forward > backward:
                source, target, length = first, second, forward
            else:
                source, target, length = second, first, backward
            if length > overlap:
                found.append([records[source], records[target], length])
    return found


# Five sample FASTA records given inline; read_fasta parses its argument as
# text when it cannot be opened as a file.
fasta_file = '>Rosalind_0498\nAAATAAA\n>Rosalind_2391\nAAATTTT\n>Rosalind_2323\nTTTTCCC\n>Rosalind_0442\nAAATCCC\n>Rosalind_5013\nGGGTGGG\n'
# Threshold handed to find_overlap, which keeps only overlaps strictly longer
# than this value.
overlap_len = 1
seq_sid, sid_seq = read_fasta(fasta_file)
overlap_pair = find_overlap(seq_sid, overlap_len)
print(overlap_pair)

#### Genome Assembly as Shortest Superstring

In [None]:
def shortest_superstring(overlap_pair, sid_seq):
    """Greedily chain overlapping reads into a single superstring.

    Args:
        overlap_pair: list of ``[left_id, right_id, overlap_length]`` entries.
            The code assumes the last ``overlap_length`` characters of the
            left read equal the first ``overlap_length`` characters of the
            right read.  The list is sorted in place, largest overlap first.
        sid_seq: dict mapping each read id to its sequence.

    Returns:
        The merged string, or '' when *overlap_pair* is empty.  Reads that
        cannot be attached to either end of the chain are left out.
    """
    # Sorting in place is kept on purpose: callers can observe the new order.
    overlap_pair.sort(key=lambda x: x[2], reverse=True)
    if not overlap_pair:
        return ''

    # Seed the chain with the strongest overlap.
    head, tail, size = overlap_pair[0]
    overlap_string = sid_seq[head] + sid_seq[tail][size:]
    used = {head, tail}

    extended = True
    while extended:
        extended = False
        # Rescan from the top after every change so the strongest usable
        # overlap is always taken next, wherever it sits in the list.
        for left, right, size in overlap_pair:
            if left == tail and right not in used:
                # Grow to the right: append what `right` adds past the overlap.
                overlap_string += sid_seq[right][size:]
                tail = right
                used.add(right)
            elif right == head and left not in used:
                # Grow to the left: prepend what `left` has before the overlap.
                # len() - size rather than [:-size] so size == 0 keeps it all.
                prefix_len = len(sid_seq[left]) - size
                overlap_string = sid_seq[left][:prefix_len] + overlap_string
                head = left
                used.add(left)
            else:
                continue
            extended = True
            break
    return overlap_string
   

# Four sample reads given inline as FASTA text.
fasta_file = '>Rosalind_56\nATTAGACCTG\n>Rosalind_57\nCCTGCCGGAA\n>Rosalind_58\nAGACCTGCCG\n>Rosalind_59\nGCCGGAATAC\n'
# NOTE(review): `records` is not used again in the visible code and the same
# call is repeated two lines below -- confirm nothing later relies on it.
records = read_fasta(fasta_file)
overlap_len = 1
seq_sid, sid_seq = read_fasta(fasta_file)
overlap_pair = find_overlap(seq_sid, overlap_len)
sss = shortest_superstring(overlap_pair, sid_seq)
# Prints True when the assembled string equals the expected superstring.
print(sss == 'ATTAGACCTGCCGGAATAC')
# shortest_superstring sorts overlap_pair in place, so this shows the sorted list.
print(overlap_pair)

#### K-mer composition

In [None]:
import itertools

def ngram2index(ngram):
    """Return every word of length *ngram* over the alphabet 'ACGT'.

    Words come out in the order ``itertools.product`` yields them, i.e. with
    the last letter varying fastest ('AA', 'AC', 'AG', 'AT', 'CA', ...).
    """
    letter_tuples = itertools.product('ACGT', repeat=ngram)
    return list(map(''.join, letter_tuples))

                                                        
def kmer_composition(dna, k):
    """Count how often each word of length *k* occurs in *dna*.

    Returns a list of counts, one per word produced by ``ngram2index(k)`` and
    in that order.  Windows of *dna* that are not among those words are
    tallied internally but do not appear in the result.
    """
    words = ngram2index(k)
    tally = {}
    window_count = len(dna) - k + 1
    # Slide a window of width k across the string, one position at a time.
    for window in (dna[start:start + k] for start in range(window_count)):
        tally[window] = tally.get(window, 0) + 1
    return [tally.get(word, 0) for word in words]

        
# Sample DNA string whose 4-mer composition is printed below.
my_dna = 'CTTCGAAAGTTTGGGCCGAGTCTTACAGTCGGTCTTGAAGCAAAGTAACGAACTCCACGGCCCTGACTACCGAACCAGTTGTGAGTACTCAACTGGGTGAGAGTGCAGTCCCTATTGAGTTTCCGAGACTCACCGGGATTTTCGATCCAGCCTCAGTCCAGTCTTGTGGCCAACTCACCAAATGACGTTGGAATATCCCTGTCTAGCTCACGCAGTACTTAGTAAGAGGTCGCTGCAGCGGGGCAAGGAGATCGGAAAATGTGCTCTATATGCGACTAAAGCTCCTAACTTACACGTAGACTTGCCCGTGTTAAAAACTCGGCTCACATGCTGTCTGCGGCTGGCTGTATACAGTATCTACCTAATACCCTTCAGTTCGCCGCACAAAAGCTGGGAGTTACCGCGGAAATCACAG'
# One count per 4-letter word over 'ACGT', in the order ngram2index(4) lists them.
kmer = kmer_composition(my_dna, 4)   
print(kmer)

#### Constructing De Bruijn Graph


In [None]:
def get_kmer(dna):
    """Return the two fragments of *dna* that are one character shorter.

    The result is ``[prefix, suffix]``: *dna* without its last character and
    *dna* without its first character.
    """
    prefix = dna[:-1]
    suffix = dna[1:]
    return [prefix, suffix]
        
def de_bruijn_graph(dna):
    """Build an adjacency map over the fragments of the given reads.

    *dna* is an iterable of reads.  The nodes are the distinct fragments that
    ``get_kmer`` returns for each read.  A node ``a`` points to a node ``b``
    when ``a`` without its first character equals ``b`` without its last
    character.  Edges are derived from the nodes alone, whether or not the
    joined string occurs among the reads.

    Returns a dict mapping each node that has successors to the set of those
    successors.
    """
    nodes = {fragment for read in dna for fragment in get_kmer(read)}
    graph = {}
    for source in nodes:
        for target in nodes:
            stem = len(target) - 1
            if source[1:] == target[:stem]:
                graph.setdefault(source, set()).add(target)
            elif target[1:] == source[:stem]:
                # Opposite direction, checked only when the first test fails.
                graph.setdefault(target, set()).add(source)
    return graph

# Newline-separated sample reads; 'CATC' appears twice.
dna = 'TGAT\nCATG\nTCAT\nATGC\nCATC\nCATC'
paths = de_bruijn_graph(dna.split('\n'))
print(paths)

#### Genome assembly with perfect coverage

In [None]:
def cyclic_superstring(edges):
    """Spell out a string by walking the graph produced by de_bruijn_graph.

    Args:
        edges: dict mapping each node (a string) to the set of its successor
            nodes.

    Returns:
        The string built by starting at the first key, then repeatedly
        appending the last character of each newly reached node, with the
        first ``len(first_key) - 1`` characters removed.  Returns '' for an
        empty graph.  Only the first successor of each node, in set iteration
        order, is followed.
    """
    all_edges = list(edges.keys())
    if not all_edges:
        # Nothing to walk; also avoids indexing all_edges[0] below.
        return ''
    cyclic_string = ''
    stack = []  # nodes reached so far, in the order they were reached
    while set(stack) != set(all_edges):
        reached_before = len(stack)
        for k in edges.keys():
            if stack == []:
                # Start the walk: the first key contributes all its characters.
                cyclic_string += k
                stack.append(k)
                nodes = list(edges[k])
                cyclic_string += nodes[0][-1]
                stack.append(nodes[0])
            elif k in stack:
                nodes = list(edges[k])
                if nodes[0] not in stack:
                    # Each further node adds only its final character.
                    cyclic_string += nodes[0][-1]
                    stack.append(nodes[0])
        if len(stack) == reached_before:
            # A full pass reached nothing new, so further passes cannot
            # either; stop instead of looping forever.
            break
    return cyclic_string[len(all_edges[0])-1:]


# Seven newline-separated 5-character reads used as the sample input.
dna = 'ATTAC\nTACAG\nGATTA\nACAGA\nCAGAT\nTTACA\nAGATT'
paths = de_bruijn_graph(dna.split('\n'))

cyclic_dna = cyclic_superstring(paths)
print(cyclic_dna)
# Reference string printed for visual comparison only; nothing here compares
# the two values programmatically.
# NOTE(review): presumably the result may be a rotation of this string, since
# the walk's starting node depends on dict/set iteration order -- confirm.
answer = 'GATTACA'
print(answer)

#### Practical strategies for applying de Bruijn graphs
##### (Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5531759/)
##### (Source: https://en.wikipedia.org/wiki/Scaffolding_(bioinformatics))

Assembling a genome with the de Bruijn graph method relies on key assumptions that are not covered in the lecture. Most tools implement remedies for these issues in practice.

1. Generate all k-mers present in the sequencing data by breaking the reads into shorter k-mers, e.g. each 100-nucleotide read into 46 overlapping 55-mers.
2. Apply error-correcting algorithms before assembling the genome to handle errors in the reads.
3. Increase the number of edges connecting a prefix to a suffix according to the multiplicity of the corresponding k-mer.
4. Deal with linear and multiple chromosomes. (No extra algorithm is needed, as the Eulerian path algorithm can handle these cases.)
5. Use scaffolding to determine the correct order and orientation of the contigs and to approximate the size of the gaps. Gap-filling algorithms can also be used to reduce the size of the gaps between contigs.