In [7]:
# !wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq

In [1]:
import itertools

overlap_cache = {} 

def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match



def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

def scs_list(ss):
    """ Returns the alphebeticaly sorted list of shortest common superstrings of given strings, which must be the same length """
    shortest_sup = []
    shortest_length = 0
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_length == 0:
            shortest_sup.append(sup)
            shortest_length = len(sup)
        elif len(sup) < shortest_length:
            shortest_length = len(sup)
            shortest_sup = [sup]
        elif len(sup) == shortest_length:
            shortest_sup.append(sup)
        else:
            #simply ignore and move on
            None
    return sorted(shortest_sup)

def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            #All the reads are the same length (100 bases) 
            assert len(seq) == 100
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k.  Returns
        overlap length 0 if there are no such overlaps."""
    reada, readb = None, None
    best_olen = 0

    for a, b in itertools.permutations(reads, 2):
        
        if (a,b) in overlap_cache.keys() and overlap_cache[(a,b)] >= k:
            overlap_len = overlap_cache[(a,b)]
            return a, b, overlap_len
        else:
            olen = overlap(a, b, min_length=k)
            if olen > best_olen:
                reada, readb = a, b
                best_olen = olen
                overlap_cache[(a,b)]=best_olen

    return reada, readb, best_olen

def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain. """
    lenCacheBefore = len(overlap_cache)
    read_a, read_b, olen = pick_maximal_overlap(reads, k)

    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    
    lenCacheAfter = len(overlap_cache)

    return ''.join(reads)


# question 1
input_str = ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
scs1 = scs(input_str)
print(f'Question 1: {len(scs1)}')


# question 2
shortest = scs_list( ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'] )
print(f'Question 2: {len(shortest)}')

# question 3 & 4
def count_a_and_t():
    
    reads, qualities = readFastq('ads1_week4_reads.fq')
    
    print('Round 1')
    for i in range (30, 100):
        print(f'Working on {i}')
        result = greedy_scs(list(reads), i)  # make copy of reads
        print(f'Found result {len(result)} bases long where k={i}')

        # virus genome length is 15894
        if len(result) == 15894:
            print(f"Question 3: {result.count('A')}")
            print(f"Question 4: {result.count('T')}")
            return

    print('Round 2')
    for i in range (30, 1, -1):
        print(f'Working on {i}')
        result = greedy_scs(list(reads), i)  # make copy of reads
        print(f'Found result {len(result)} bases long where k={i}')

        # virus genome length is 15894
        if len(result) == 15894:
            print(f"Question 3: {result.count('A')}")
            print(f"Question 4: {result.count('T')}")
            return
        
def main():
    count_a_and_t()
    
if __name__ == '__main__':
    main()

Question 1: 11
Question 2: 4
Round 1
Working on 30
Found result 16867 bases long where k=30
Working on 31
Found result 17196 bases long where k=31
Working on 32
Found result 18132 bases long where k=32
Working on 33
Found result 17992 bases long where k=33
Working on 34
Found result 18259 bases long where k=34
Working on 35
Found result 16325 bases long where k=35
Working on 36
Found result 17484 bases long where k=36
Working on 37
Found result 17974 bases long where k=37
Working on 38
Found result 17643 bases long where k=38
Working on 39
Found result 16781 bases long where k=39
Working on 40
Found result 17761 bases long where k=40
Working on 41
Found result 18010 bases long where k=41
Working on 42
Found result 17262 bases long where k=42
Working on 43
Found result 18860 bases long where k=43
Working on 44
Found result 17852 bases long where k=44
Working on 45
Found result 17094 bases long where k=45
Working on 46
Found result 18120 bases long where k=46
Working on 47
Found result 1