## For numerical data

In [1]:
import numpy as np 

In [2]:
# Two equal-length sample vectors, held as NumPy arrays so that arithmetic
# between them is element-wise.
vector1 = np.array([1, 2, 3, 4, 5])
vector2 = np.array([6, 7, 8, 9, 10])

In [3]:
def euclidian_distance(v1, v2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Parameters
    ----------
    v1, v2 : array-like of numbers
        Vectors of the same shape (or shapes NumPy can broadcast). Plain
        lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((v1 - v2) ** 2))``.

    Raises
    ------
    ValueError
        If the two shapes cannot be broadcast together.
    """
    # Converting to float lets list inputs work and stops unsigned-integer
    # arrays from wrapping around when the difference is negative.
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
# Demo call on the two sample vectors defined in an earlier cell.
print("Euclidian distance:", euclidian_distance(vector1, vector2))

Euclidian distance: 11.180339887498949


In [4]:
def manhattan_dsitance(v1, v2):
    """Manhattan (L1, city-block) distance between two vectors.

    The result is the sum of the absolute element-wise differences.
    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    The misspelled function name is kept because existing calls use it.
    """
    delta = v1 - v2
    return np.sum(np.abs(delta))
# Demo call on the same two sample vectors used for the Euclidean example.
print("Manhattan distance:", manhattan_dsitance(vector1, vector2))

Manhattan distance: 25


In [5]:
# For p=2
def minkowski_distance(v1, v2, p=2):
    """Return the Minkowski distance of order ``p`` between two vectors.

    ``p=1`` gives the Manhattan distance, ``p=2`` (the default) the Euclidean
    distance, and ``p=float("inf")`` the supremum (Chebyshev) distance.

    Parameters
    ----------
    v1, v2 : array-like of numbers
        Vectors of identical shape. The reduction runs along the first axis,
        so 1-D vectors produce a scalar.
    p : int or float, default 2
        Order of the norm; must be positive (``inf`` is allowed).

    Returns
    -------
    numpy.float64
        ``(sum(|v1 - v2| ** p)) ** (1 / p)``, or ``max(|v1 - v2|)`` for ``p=inf``.

    Raises
    ------
    ValueError
        If ``p`` is not positive or the two inputs differ in shape.
    """
    if p <= 0:
        raise ValueError("p must be positive")

    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    # Reject mismatched inputs explicitly instead of silently ignoring the
    # surplus elements of the longer vector.
    if a.shape != b.shape:
        raise ValueError("v1 and v2 must have the same shape")

    gaps = np.abs(a - b)
    if np.isinf(p):
        # Limit of the p-norm as p grows: the largest single gap.
        # `initial` keeps the empty-vector case at 0 instead of raising.
        return np.max(gaps, axis=0, initial=0.0)
    return np.sum(gaps ** p, axis=0) ** (1.0 / p)
# No p is passed, so the default order p=2 is used for this demo.
print("Minkowski distance:", minkowski_distance(vector1, vector2))

Minkowski distance: 11.180339887498949


In [6]:
def supremum_distance(v1, v2):
    """Supremum (Chebyshev, L-infinity) distance between two vectors.

    The result is the largest absolute element-wise difference. Both
    arguments must support element-wise subtraction (e.g. NumPy arrays).
    """
    gaps = np.abs(v1 - v2)
    return np.max(gaps)
# Demo call on the same two sample vectors used for the other numeric distances.
print("Supremum distance:", supremum_distance(vector1, vector2))

Supremum distance: 5


## For binary data

In [9]:
# From here on vector1/vector2 are rebound to plain Python sets
# (they no longer refer to NumPy arrays).
vector1 = {1, 2, 3, 4, 5}
vector2 = {4, 5, 6, 7, 8}

In [10]:
def simple_matching_coefficient(v1, v2, universe=None):
    """Return the simple matching coefficient (SMC) of two attribute sets.

    Each set lists the attributes that are present (value 1) for an object;
    any attribute of ``universe`` missing from a set counts as absent (0).

        SMC = (present in both + absent from both) / total attributes

    Parameters
    ----------
    v1, v2 : set
        Attributes present in each object.
    universe : iterable, optional
        Every attribute under consideration. When omitted, only attributes
        present in at least one of the sets are considered, so nothing is
        "absent from both" and the value equals the Jaccard coefficient.

    Returns
    -------
    float
        A value in [0, 1]; 1.0 when there are no attributes at all.

    Raises
    ------
    ValueError
        If ``universe`` is given but lacks an element of ``v1`` or ``v2``.
    """
    present_in_any = v1.union(v2)
    if universe is None:
        all_attributes = present_in_any
    else:
        all_attributes = set(universe)
        if not present_in_any <= all_attributes:
            raise ValueError("universe must contain every element of v1 and v2")

    if not all_attributes:
        # No attributes to compare: the two (empty) objects agree everywhere.
        return 1.0

    both_present = len(v1.intersection(v2))
    # Attributes of the universe that appear in neither set (the 0/0 matches).
    both_absent = len(all_attributes) - len(present_in_any)
    return (both_present + both_absent) / len(all_attributes)
# vector1/vector2 are the two sets built in the preceding cell, not the NumPy arrays from the numerical section.
print("Simple matching coefficient:", simple_matching_coefficient(vector1, vector2))

Simple matching coefficient: 0.25


In [11]:
def jaccard_coeffecient(v1, v2):
    """Return the Jaccard coefficient |v1 ∩ v2| / |v1 ∪ v2| of two sets.

    Parameters
    ----------
    v1 : set
    v2 : set (or any iterable accepted by ``set.union``)

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets are treated as identical and give 1.0.

    The misspelled function name is kept because existing calls use it.
    """
    union_size = len(v1.union(v2))
    if union_size == 0:
        # 0/0 case: by the usual convention two empty sets are fully similar.
        return 1.0
    return len(v1.intersection(v2)) / union_size
# Evaluated on the same two sets as the simple-matching example.
print("Jaccard coefficient:", jaccard_coeffecient(vector1, vector2))

Jaccard coefficient: 0.25


## For textual data

In [14]:
# Represent each word by the set of distinct characters it contains.
vector1 = set('resin')
vector2 = set('resist')

def jaccard_similarity(v1, v2):
    """Jaccard similarity of two sets: shared elements divided by all distinct elements."""
    shared = v1.intersection(v2)
    combined = v1.union(v2)
    return len(shared) / len(combined)
# The inputs are the character sets of 'resin' and 'resist' built at the top of this cell.
print("Jaccard similarity:", jaccard_similarity(vector1, vector2))

Jaccard similarity: 0.6666666666666666


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents to compare with each other.
corpus = [
    "This is a restaurant.",
    "This is another restaurant.",
    "One more restaurant.",
]

# Bag-of-words term counts: one row per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# With a single argument, every row of X is compared against every row of X.
print("Cosine Similarity Matrix: \n", cosine_similarity(X))

Cosine Similarity Matrix: 
 [[1.         0.8660254  0.33333333]
 [0.8660254  1.         0.28867513]
 [0.33333333 0.28867513 1.        ]]


In [20]:
def edit_distance(str1, str2):
    """Levenshtein edit distance between two strings.

    Counts the minimum number of single-character insertions, deletions and
    substitutions needed to turn ``str1`` into ``str2``.
    """
    rows, cols = len(str1), len(str2)

    # Row 0 of the DP table: turning "" into str2[:j] takes j insertions.
    previous = list(range(cols + 1))

    for i in range(1, rows + 1):
        # Column 0: turning str1[:i] into "" takes i deletions.
        current = [i] + [0] * cols
        for j in range(1, cols + 1):
            if str1[i - 1] == str2[j - 1]:
                current[j] = previous[j - 1]
            else:
                current[j] = 1 + min(
                    previous[j],       # deletion
                    current[j - 1],    # insertion
                    previous[j - 1],   # substitution
                )
        previous = current

    return previous[cols]

# Worked example for the edit distance.
str1, str2 = "smitten", "sitting"
distance = edit_distance(str1, str2)
print("Edit distance between '{}' and '{}' is: {}".format(str1, str2, distance))

Edit distance between 'smitten' and 'sitting' is: 3


In [22]:
def jaro_distance(str1, str2):
    """Return the Jaro similarity of two strings.

    Despite the name, the returned value is a similarity score: identical
    strings give 1.0 and strings with no matching characters give 0.0.

    Two characters match when they are equal and their positions differ by
    at most ``max(len(str1), len(str2)) // 2 - 1`` (never less than 0).
    With ``m`` matches and ``t`` half the number of matched characters that
    appear in a different order (rounded down), the result is
    ``(m / len(str1) + m / len(str2) + (m - t) / m) / 3``.

    Parameters
    ----------
    str1, str2 : str

    Returns
    -------
    float
        A value in [0, 1]; 1.0 when both strings are empty.
    """
    if not (str1 or str2):
        return 1.0

    len1, len2 = len(str1), len(str2)
    # Clamp at 0: for one-character strings the raw formula gives -1, which
    # would make the search window empty and report no match even for
    # identical strings.
    match_distance = max(0, max(len1, len2) // 2 - 1)

    str1_matches = [False] * len1
    str2_matches = [False] * len2
    matches = 0

    # Pass 1: greedily pair each character of str1 with the first unused,
    # equal character of str2 inside the allowed window.
    for i in range(len1):
        start = max(0, i - match_distance)
        end = min(i + match_distance + 1, len2)
        for j in range(start, end):
            if not str2_matches[j] and str1[i] == str2[j]:
                str1_matches[i] = str2_matches[j] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Pass 2: walk the matched characters of both strings in order and count
    # the positions where they disagree.
    k = transpositions = 0
    for i in range(len1):
        if not str1_matches[i]:
            continue
        while not str2_matches[k]:
            k += 1
        if str1[i] != str2[k]:
            transpositions += 1
        k += 1

    return (matches / len1 + matches / len2 + (matches - transpositions // 2) / matches) / 3

# Example usage: compare a misspelling with the intended word.
str1, str2 = "alppe", "apple"
distance = jaro_distance(str1, str2)
print("Jaro distance between '{}' and '{}' is: {:.4f}".format(str1, str2, distance))

Jaro distance between 'alppe' and 'apple' is: 0.8667


In [16]:
from collections import Counter

def n_gram_distance(str1, str2, n):
    """Return an n-gram (multiset Jaccard) distance between two strings.

    Each string is split into its overlapping character n-grams, counted
    with multiplicity. The distance is ``1 - shared / total`` where
    ``shared`` is the size of the multiset intersection and ``total`` the
    size of the multiset union, so the result always lies in [0, 1].

    Parameters
    ----------
    str1, str2 : str
    n : int
        Length of the n-grams; must be at least 1.

    Returns
    -------
    float
        0.0 for identical n-gram multisets, 1.0 when nothing is shared.
        If neither string is long enough to contain an n-gram, returns
        0.0 when the strings are equal and 1.0 otherwise.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    count1 = Counter(str1[i:i + n] for i in range(len(str1) - n + 1))
    count2 = Counter(str2[i:i + n] for i in range(len(str2) - n + 1))

    # Numerator and denominator both count with multiplicity; mixing a
    # multiset intersection with a plain set union would give negative
    # distances for strings with repeated n-grams.
    shared = sum((count1 & count2).values())
    total = sum((count1 | count2).values())

    if total == 0:
        # Neither string contains an n-gram, so fall back to plain equality.
        return 0.0 if str1 == str2 else 1.0
    return 1 - shared / total

# Worked example: compare the two words by their character bigrams.
str1, str2, n = "resist", "resin", 2
distance = n_gram_distance(str1, str2, n)
print("N-gram distance between '{}' and '{}' ({}-grams) is: {:.4f}".format(str1, str2, n, distance))


N-gram distance between 'resist' and 'resin' (2-grams) is: 0.5000
