In [1]:
# This function helps us figure out how to stitch two DNA reads (a and b) together.
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of 'a' that matches a prefix of 'b'.

    Only overlaps of at least 'min_length' characters are reported. If no
    such overlap exists, 0 is returned.

    Parameters:
        a: The read whose suffix is examined.
        b: The read whose prefix is examined.
        min_length: Minimum overlap length to accept (also the length of the
            seed taken from the start of 'b'). With 0, the longest overlap
            of any length is returned. Expected to be non-negative; negative
            values are not validated.

    Returns:
        The overlap length as an int, or 0 when there is no qualifying
        overlap.
    """
    # If 'b' is shorter than min_length, the slice below would silently
    # shrink the seed to all of 'b', and an overlap shorter than the
    # requested minimum could be reported. No qualifying overlap can exist.
    if len(b) < min_length:
        return 0

    # The seed never changes, so slice it once instead of on every iteration.
    seed = b[:min_length]

    # Scan 'a' from left to right: an earlier hit means a longer suffix, so
    # the first verified hit is the longest possible overlap.
    start = 0
    while True:
        start = a.find(seed, start)
        if start == -1:
            # The seed does not occur in the rest of 'a': no overlap.
            return 0

        # A seed hit is only a candidate; the whole suffix of 'a' from this
        # position must also be a prefix of 'b'.
        if b.startswith(a[start:]):
            return len(a) - start

        # False alarm: resume the search just past this occurrence.
        start += 1

In [3]:
# Sanity check: the suffix 'CGT' of the first read equals the first three
# characters of the second read.
overlap('TTACGT', 'CGTGTGC')

3

In [14]:
import itertools
from itertools import permutations

# This function finds all possible overlaps between reads.
def naive_overlap_map(reads, k):
    """Map each ordered pair of reads to its overlap length.

    Every ordered pair drawn from 'reads' is passed to overlap() with
    min_length=k. Pairs for which overlap() returns a positive length are
    recorded; all other pairs are left out.

    Parameters:
        reads: Iterable of reads to compare against each other.
        k: Value forwarded to overlap() as min_length.

    Returns:
        A dict keyed by (a, b) tuples whose values are the overlap lengths.
    """
    overlaps = {}
    for pair in itertools.permutations(reads, 2):
        length = overlap(*pair, min_length=k)
        # Skip pairs that do not overlap at all.
        if length <= 0:
            continue
        overlaps[pair] = length
    return overlaps

In [15]:
# Three example reads used to exercise naive_overlap_map.
reads = ['ACGGATC', 'GATCAAGT', 'TTCACGGA']
# Print the ordered pairs of reads for which an overlap is found with k=3.
print(naive_overlap_map(reads, 3))

{('ACGGATC', 'GATCAAGT'): 4, ('TTCACGGA', 'ACGGATC'): 5}
