In [3]:
# Edit distance = minimum number of edits (insertions, deletions, substitutions) to change one string into the other.
def editDistRecursive(x, y):
    '''Return the edit distance between x and y using plain recursion.

    The distance is the smallest number of single-element insertions,
    deletions and substitutions that turns one sequence into the other.

    Parameters:
        x, y: the two sequences to compare (strings in this notebook).

    Returns:
        The edit distance as a non-negative integer.

    Note: nothing is cached, so the same prefix pairs are solved over and
    over. Only use this on short inputs.
    '''
    # An empty sequence is len(other) insertions away from the other one.
    if len(x) == 0:
        return len(y)
    if len(y) == 0:
        return len(x)

    xPrefix = x[:-1]
    yPrefix = y[:-1]

    # Aligning the two last elements is free when they are equal and costs
    # one substitution otherwise.
    mismatch = 0 if x[-1] == y[-1] else 1

    return min(
        editDistRecursive(xPrefix, y) + 1,               # drop last element of x
        editDistRecursive(x, yPrefix) + 1,               # drop last element of y
        editDistRecursive(xPrefix, yPrefix) + mismatch,  # align the last elements
    )

In [4]:
# This function calculates edit distance using dynamic programming. It's much faster.
def editDistance(x, y):
    '''Return the edit distance between x and y using dynamic programming.

    A table with (len(x)+1) rows and (len(y)+1) columns is filled so that
    cell [r][c] holds the edit distance between the first r elements of x
    and the first c elements of y.

    Parameters:
        x, y: the two sequences to compare (strings in this notebook).

    Returns:
        The edit distance as a non-negative integer.
    '''
    nRows = len(x) + 1
    nCols = len(y) + 1

    # Borders: turning an empty prefix into a prefix of length k takes k edits.
    # Row 0 is 0..len(y); every other row starts with its own index.
    table = [list(range(nCols))]
    for r in range(1, nRows):
        table.append([r] + [0] * (nCols - 1))

    # Interior cells, row by row; each needs only its left, upper and
    # upper-left neighbours, which are already filled in.
    for r in range(1, nRows):
        above = table[r - 1]
        current = table[r]
        for c in range(1, nCols):
            # Diagonal move is free on a match, otherwise one substitution.
            subCost = 0 if x[r - 1] == y[c - 1] else 1
            current[c] = min(
                current[c - 1] + 1,        # horizontal move
                above[c] + 1,              # vertical move
                above[c - 1] + subCost,    # diagonal move
            )

    # Bottom-right cell covers the full length of both inputs.
    return table[-1][-1]


In [5]:
%%time
# Time the plain recursive implementation on a 10-character and a
# 7-character string. The call is the last expression in the cell, so its
# return value is what the cell displays.
x = "AGTCGCGCTG"
y = "ATCAGCT"
editDistRecursive(x, y)

CPU times: user 288 ms, sys: 0 ns, total: 288 ms
Wall time: 503 ms


4

In [6]:
%%time
# Time the dynamic-programming implementation on the same two strings used
# for the recursive timing, so the two measurements can be compared directly.
x = "AGTCGCGCTG"
y = "ATCAGCT"
editDistance(x, y)

CPU times: user 72 µs, sys: 0 ns, total: 72 µs
Wall time: 74.1 µs


4