<a href="https://colab.research.google.com/github/Wan-Shi-Tong-bi/5Ws/blob/main/colab/4.02_GreedySCS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Shortest Common Superstring Problem
## Greedy Shortest Common Super String

Greedy Algorithms decide at each stage which alternative reduces problem complexity most.

In [1]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
      start = a.find(b[:min_length],start)
      if start == -1:
        return 0
      if b.startswith(a[start:]):
        return len(a) - start
      start += 1

In [2]:
import itertools

def scs(ss):
    """ Returns shortest common superstring of given strings,
        assuming no string is a strict substring of another """
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
          olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
        sup += ssperm[i+1][olen:]
        
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
        
    return shortest_sup  # return shortest

In [3]:
def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k.  Returns
        overlap length 0 if there are no such overlaps."""
    reada, readb = None, None
    best_olen = 0
    for a, b in itertools.permutations(reads, 2):
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b
            best_olen = olen
    return reada, readb, best_olen

In [4]:
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain. """
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)

In [5]:
%%time
greedy_scs(['ABC', 'BCA', 'CAB'], 2)

CPU times: user 37 µs, sys: 0 ns, total: 37 µs
Wall time: 40.5 µs


'CABCA'

In [6]:
%%time
greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)

CPU times: user 30 µs, sys: 5 µs, total: 35 µs
Wall time: 39.3 µs


'CDBCABCDA'

In [7]:
%%time
scs(['ABCD', 'CDBC', 'BCDA'])

CPU times: user 44 µs, sys: 8 µs, total: 52 µs
Wall time: 56.5 µs


'CDBCA'