# Jaccard Similarity
Jaccard(A, B) = |A ∩ B| / |A ∪ B|, i.e. the size of the intersection of A and B divided by the size of their union.

In [1]:
def jaccard(x: str, y: str) -> float:
    """Return the Jaccard similarity of two strings, compared word by word.

    Each string is split on whitespace and reduced to its set of distinct
    words. The similarity is the number of words the two sets share divided
    by the number of distinct words across both.

    Args:
        x (str): First input string.
        y (str): Second input string.

    Returns:
        float: A value between 0.0 and 1.0. Two strings that both contain no
        words are treated as identical and score 1.0.
    """
    words_x, words_y = set(x.split()), set(y.split())
    all_words = words_x | words_y

    # Guard the division: with no words on either side the union is empty.
    if len(all_words) == 0:
        return 1.0

    shared_words = words_x & words_y
    return len(shared_words) / len(all_words)

In [3]:
# Two part descriptions that differ only in the resistance value and the
# tolerance value; every other whitespace-separated word is the same.
x = "Resistor of 10k Ohm ±5% tolerance"
y = "Resistor of 5k Ohm ±10% tolerance"
similarity = jaccard(x, y)

# Report the score rounded to four decimal places.
print(f"Jaccard similarity between the two strings: {similarity:.4f}")

Jaccard similarity between the two strings: 0.5000


# W Shingling
W-shingling takes two or three consecutive words at a time (a sliding window of w words) instead of splitting the text into individual words.

In [8]:
# Sample description to shingle, split into whitespace-separated words.
a = "Resistor of 10 Ohm ±5% tolerance Cermamic type package"
split_a = a.split()
# Number of consecutive words that make up one shingle.
w_shingling_size = 2


In [11]:

# Slide a window of w_shingling_size words across split_a; each window,
# joined with single spaces, is one shingle. The last valid window starts at
# len(split_a) - w_shingling_size.
shingles_a = [
    ' '.join(split_a[start:start + w_shingling_size])
    for start in range(len(split_a) - w_shingling_size + 1)
]
print(shingles_a)

['Resistor of', 'of 10', '10 Ohm', 'Ohm ±5%', '±5% tolerance', 'tolerance Cermamic', 'Cermamic type', 'type package']


Now, instead of passing the raw string x, we pass the set of shingles generated from it.

In [13]:
def shingles(string: str, k: int) -> set:
    """Generate the set of k-word shingles of the input string.

    The string is split on whitespace, and every run of ``k`` consecutive
    words is joined back together with a single space to form one shingle.

    Args:
        string (str): Input string.
        k (int): Number of words per shingle; must be at least 1.

    Returns:
        set: The distinct k-word shingles. The set is empty when the string
        contains fewer than ``k`` words.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        # Without this guard k == 0 produces {''} and a negative k produces
        # wrapped-around slices, neither of which is a meaningful shingle.
        raise ValueError(f"k must be a positive integer, got {k}")

    words = string.split()
    # The last full window of k words starts at index len(words) - k.
    return {' '.join(words[i:i + k]) for i in range(len(words) - k + 1)}

def jaccard(x: set, y: set) -> float:
    """Return the Jaccard similarity of two sets.

    Args:
        x (set): First set of items (for example, shingles).
        y (set): Second set of items.

    Returns:
        float: Size of the intersection divided by the size of the union.
        Two empty sets are treated as identical and score 1.0.
    """
    shared = x.intersection(y)
    combined = x.union(y)

    # Guard the division: the union is empty only when both inputs are empty.
    if len(combined) == 0:
        return 1.0

    return len(shared) / len(combined)

In [14]:
# Build 2-word shingles for each description and compare the shingle sets
# rather than the individual words.
a = "Resistor of 10k Ohm ±5% tolerance" 
shingles_a = shingles(a, 2)
b = "Resistor of 5k Ohm ±10% tolerance"
shingles_b = shingles(b, 2)

# Report the score rounded to four decimal places.
print(f"W shingling Similarity: {jaccard(shingles_a, shingles_b):.4f}")

W shingling Similarity: 0.1111


# Levenshtein Similarity
Lev(A, B) is the minimum number of insert, delete, or replace operations needed to turn string A into string B.

Viewed broadly, it is a classic dynamic-programming (DP) problem of the kind found on LeetCode.

In [4]:
# First, let's follow the method we got from the lecture

import numpy as np

def levenshtein(a: str, b: str) -> int:
    """Compute the Levenshtein distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions, and substitutions needed to turn ``a`` into ``b``.

    Args:
        a (str): First input string.
        b (str): Second input string.

    Returns:
        int: Levenshtein distance between the two strings.
    """
    rows, cols = len(a) + 1, len(b) + 1

    # lev_matrix[i][j] is the distance between the first i characters of a
    # and the first j characters of b.
    lev_matrix = [[0] * cols for _ in range(rows)]

    # Turning a prefix into the empty string (or the reverse) costs one
    # deletion (or insertion) per character.
    for i in range(rows):
        lev_matrix[i][0] = i
    for j in range(cols):
        lev_matrix[0][j] = j

    for i in range(1, rows):
        for j in range(1, cols):
            # Deletion and insertion always cost one edit; only the
            # substitution step is free when the two characters match.
            cost_del = lev_matrix[i - 1][j] + 1  # Deletion
            cost_ins = lev_matrix[i][j - 1] + 1  # Insertion
            cost_sub = lev_matrix[i - 1][j - 1] + (a[i - 1] != b[j - 1])  # Substitution
            lev_matrix[i][j] = min(cost_del, cost_ins, cost_sub)

    return lev_matrix[len(a)][len(b)]

# Inside levenshtein(), lev_matrix[i][j] holds the minimum number of edits needed to
# convert the first i characters of a into the first j characters of b.

a = "Resistor of 10k Ohm ±5% tolerance"
b = "Resistor of 5k Ohm ±10% tolerance"
distance = levenshtein(a, b)
print(f"Levenshtein distance between the two strings: {distance}")

Levenshtein distance between the two strings: 4


In [5]:

def minDistance(word1, word2):
    """Return the minimum number of edits needed to turn word1 into word2.

    An edit is inserting, deleting, or replacing a single character.

    Args:
        word1 (str): Source string.
        word2 (str): Target string.

    Returns:
        int: The edit (Levenshtein) distance between the two strings.
    """
    m = len(word1)
    n = len(word2)

    # lev[i][j] is the minimum number of operations required to transform
    # the prefix word1[0...i-1] into the prefix word2[0...j-1], so the table
    # needs one extra row and column for the empty prefix.
    lev = [[0] * (n + 1) for _ in range(m + 1)]

    # Base cases. For example, with word1 = "rahul" and word2 = "rajul":
    # lev[0][2] is the cost of turning the empty string into "ra", which is
    # two insertions; lev[2][0] is the cost of turning "ra" into the empty
    # string, which is two deletions.
    for i in range(m + 1):
        lev[i][0] = i
    for j in range(n + 1):
        lev[0][j] = j

    # If word1[i-1] == word2[j-1], then lev[i][j] = lev[i-1][j-1]: no
    # operation is needed because the two characters already match.
    # Otherwise take the cheapest of:
    #   lev[i-1][j-1] + 1: replace word1[i-1] with word2[j-1]
    #   lev[i-1][j] + 1:   delete word1[i-1]
    #   lev[i][j-1] + 1:   insert word2[j-1] into word1
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if word1[i - 1] == word2[j - 1]:
                lev[i][j] = lev[i - 1][j - 1]
            else:
                lev[i][j] = min(lev[i - 1][j - 1], lev[i - 1][j], lev[i][j - 1]) + 1

    return lev[m][n]
            
    