In [12]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

import itertools

def scs(ss):
    """ Returns shortest common superstrings of given
        strings, which must be the same length """
    shortest_sup = []
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if len(shortest_sup) == 0 or len(sup) < len(shortest_sup[0]):
            shortest_sup = [sup] # found shorter superstring
        elif len(sup) == len(shortest_sup[0]) and sup not in shortest_sup:
            shortest_sup.append(sup)
    return shortest_sup  # return shortest

In [3]:
scs(["CCT","CTT","CTT","TGC","TGC","TGG","TGG","GAT","GAT","ATT","ATT"])[0]

'CCTTGGATTGC'

In [16]:
len(scs(["CCT","CTT","CTT","TGC","TGC","TGG","TGG","GAT","GAT","ATT","ATT"]))

4

In [5]:
!wget --no-check https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq

--2021-05-26 17:16:21--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq
Resolving d28rh4a8wq0iu5.cloudfront.net... 13.226.211.176, 13.226.211.135, 13.226.211.37, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|13.226.211.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 395781 (387K) [video/m2ts]
Saving to: 'ads1_week4_reads.fq'


2021-05-26 17:16:22 (1.47 MB/s) - 'ads1_week4_reads.fq' saved [395781/395781]



In [6]:
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities

In [7]:
sequences, _ = readFastq('ads1_week4_reads.fq')

In [30]:
import itertools

def maximal_overlap(reads, olaps, k):

    readA, readB = None, None
    best_olen = 0
    for a,b in itertools.permutations(reads, 2):
        if b in olaps[a[-k:]]:
            olen = overlap(a,b,k)
            if olen > best_olen:
                best_olen = olen
                readA, readB = a,b
    return readA, readB, best_olen

def greedy_scs(reads, k):    
    olaps = {}
    for read in reads:
        for i in range(len(read)-k+1):
            if read[i:i+k] not in olaps:
                olaps[read[i:i+k]] = [read]
            else:
                olaps[read[i:i+k]].append(read)
                
    readA, readB, olen = maximal_overlap(reads, olaps, k)
    while olen > 0:
        reads.remove(readA)
        reads.remove(readB)
        reads.append(readA + readB[olen:])
        readA, readB, olen = maximal_overlap(reads, olaps, k)
    return ''.join(reads)

In [31]:
assembly = greedy_scs(sequences, 10)

In [32]:
print("Number of A's: {}, Number of T's: {}".format(assembly.count('A'), assembly.count('T')))

Number of A's: 14441, Number of T's: 11547
