# Text Similarity Metrics

Exercise notebook

Course: Algorytmy Tekstowe at AGH University

## Preprocessing and vectorization

1. Preprocessing: Convert the text documents to lowercase and remove all punctuation marks (using regular expressions, for example).
2. Vocabulary creation: Create a vocabulary by taking all unique words from all text documents.
3. Word frequency vectors: For each text document, create a vector that holds the frequency of every vocabulary word in that document.

In [20]:
from collections import defaultdict
import re

def preprocess(text: str) -> str:
    """Lowercase *text* and replace punctuation with single spaces.

    Every run of characters that are not letters or digits (punctuation,
    whitespace, underscores, ...) is collapsed into one space, so the result
    can be tokenized with ``str.split()``. Leading or trailing punctuation
    therefore leaves a leading or trailing space.

    Letters and digits are recognized Unicode-wide, so accented words such as
    "zażółć" stay intact instead of being split at every non-ASCII letter.
    For plain ASCII input the result is the same as stripping everything
    outside ``a-z0-9``.
    """
    # \W matches anything that is not a Unicode word character. "_" counts as
    # a word character, so it is listed explicitly to keep it a separator.
    return re.sub(r'[\W_]+', ' ', text.lower())


def text_to_vec(docs: list[str]) -> list[list[int]]:
    """Turn documents into word-frequency vectors over a shared vocabulary.

    Each document is cleaned with ``preprocess()`` and split on whitespace.
    The vocabulary is every distinct word across all documents, indexed in
    order of first appearance. The result holds one vector per document (in
    input order); position ``i`` of a vector is how many times vocabulary
    word ``i`` occurs in that document.
    """
    tokenized = [preprocess(doc).split() for doc in docs]

    # Assign each new word the next free index; words already seen keep theirs.
    index_of: dict[str, int] = {}
    for tokens in tokenized:
        for token in tokens:
            index_of.setdefault(token, len(index_of))

    vectors = []
    for tokens in tokenized:
        counts = [0] * len(index_of)
        for token in tokens:
            counts[index_of[token]] += 1
        vectors.append(counts)
    return vectors


In [21]:
# Tests
text_a = "The quick brown fox jumped over the lazy dog."
text_b = "The lazy dog was jumped over by the quick brown fox."
vec_a, vec_b = text_to_vec([text_a, text_b])

# The order of the vocabulary is an implementation detail, so the counts are
# compared as sorted lists. Unlike set(), this ignores the order but still
# checks how many times each count occurs.
assert sorted(vec_a) == sorted([1, 1, 1, 2, 1, 1, 1, 1, 0, 0])
assert sorted(vec_b) == sorted([1, 1, 1, 2, 1, 1, 1, 1, 1, 1])

## Cosine similarity

$$
\begin{equation}
    \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}= \frac{\sum\limits_{i=1}^{n} A_i B_i}{\sqrt{\sum\limits_{i=1}^{n} A_i^2} \sqrt{\sum\limits_{i=1}^{n} B_i^2}}
    \qquad\begin{aligned}
    &\text{where:} \\
    &\mathbf{A}\text{ and }\mathbf{B} \text{ are the two vectors being compared}\\
    &n \text{ is the dimensionality of the vectors}\\
    &\theta \text{ represents the angle between two vectors } \mathbf{A} \text{ and } \mathbf{B} \text{ in a high-dimensional space}
    \end{aligned}
\end{equation}
$$

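The dot product of $\mathbf{A}$ and $\mathbf{B}$ is divided by the product of their Euclidean lengths, which normalizes the result to the range [-1, 1]. A value of 1 means the two vectors point in the same direction (the texts have the same relative word frequencies), 0 means they are orthogonal (the texts share no words), and -1 means they point in opposite directions. Word-frequency vectors have no negative components, so for texts the result always lies in [0, 1].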


In [27]:
import math

def cosine_similarity(text_a: str, text_b: str) -> float:
    """Return the cosine similarity of the word-frequency vectors of two texts.

    Both texts are vectorized together by ``text_to_vec`` so that they share
    one vocabulary. Word frequencies are never negative, so the result lies
    in [0, 1].

    Returns 0.0 when either text contains no words: the angle to a zero
    vector is undefined, and such a text shares nothing with the other one.
    """
    vec_a, vec_b = text_to_vec([text_a, text_b])

    dot_product = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))

    # Guard the division: a text without any words yields an all-zero vector.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot_product / (norm_a * norm_b)

In [28]:
# Tests
# Cosine similarity of the two sample sentences, checked to within 1e-4.
dist = cosine_similarity(text_a, text_b)
assert abs(dist - 0.91986) < 0.0001

## Dice coefficient / Sørensen-Dice Index

$$
\begin{equation}
    \text{Dice}(A, B) = \frac{2 |A \cap B|}{|A| + |B|} 
    \qquad\begin{aligned}
    &\text{where:} \\
    &A \text{ and } B \text{ represent the two sets being compared} \\
    &|A| \text{ and } |B| \text{ represent the cardinality (number of elements) of the sets} \\
    &\text{and } |A \cap B| \text{ represents the size of the intersection of the two sets}
    \end{aligned}
\end{equation}
$$


In [35]:
def dice(text_a: str, text_b: str) -> float:
    """Return the Sørensen-Dice coefficient of the word sets of two texts.

    Each text is cleaned with ``preprocess()``, split on whitespace and
    reduced to its set of distinct words. The coefficient is twice the number
    of shared words divided by the sum of both set sizes.

    Raises ZeroDivisionError when neither text contains any word.
    """
    tokens_a = set(preprocess(text_a).split())
    tokens_b = set(preprocess(text_b).split())
    shared = tokens_a & tokens_b
    return 2 * len(shared) / (len(tokens_a) + len(tokens_b))

# Show the Dice coefficient of the two sample sentences as the cell output.
dice(text_a, text_b)

0.8888888888888888

In [36]:
# Tests
# Dice coefficient of the two sample sentences, checked to within 1e-4.
dist = dice(text_a, text_b)
assert abs(dist - 0.88888) < 0.0001

## Euclidean distance

$$
\begin{equation}
    d(x,y) = \sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}
    \qquad\begin{aligned}
    &\text{where:} \\
    &d(x,y) \text{ is the Euclidean distance} \\
    &x_i, y_i \text{ are the values of the i-th dimension of vectors } x \text{ and } y \\
    &n \text{ is the number of dimensions in the vectors}
    \end{aligned}
\end{equation}
$$

In [41]:
def euclidean_distance(text_a: str, text_b: str) -> float:
    """Return the Euclidean distance between the word-frequency vectors of two texts.

    Both texts are vectorized together by ``text_to_vec`` so that they share
    one vocabulary; the distance is the square root of the summed squared
    differences of the per-word counts.
    """
    vec_a, vec_b = text_to_vec([text_a, text_b])
    squared_diffs = ((a - b) ** 2 for a, b in zip(vec_a, vec_b))
    return math.sqrt(sum(squared_diffs))

In [42]:
# Tests
# Euclidean distance of the two sample sentences, checked to within 1e-4.
dist = euclidean_distance(text_a, text_b)
assert abs(dist - 1.4142135) < 0.0001

## LCS - Longest Common Subsequence

Longest, common, continuous subsequence of two sequences, aka "the longest substring".

In [57]:
from typing import Any, Sequence

def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """Return the length of the longest common subsequence of two sequences.

    A common subsequence keeps the relative order of its elements in both
    inputs, but the elements do not have to be adjacent. Works on any
    sequences whose elements can be compared with ``==`` (strings, lists of
    words, ...). Returns 0 when either sequence is empty.

    Uses the classic dynamic-programming recurrence with two rolling rows,
    so memory use is proportional to ``len(seq_b)`` rather than to the full
    ``len(seq_a) * len(seq_b)`` table.
    """
    # previous[j] is the LCS length of the seq_a prefix processed so far and
    # seq_b[:j]; before any element of seq_a is consumed it is 0 everywhere.
    previous = [0] * (len(seq_b) + 1)
    for item_a in seq_a:
        current = [0]
        for col, item_b in enumerate(seq_b, start=1):
            if item_a == item_b:
                # Matching elements extend the best result of both shorter prefixes.
                current.append(previous[col - 1] + 1)
            else:
                # Otherwise drop the last element of one of the two prefixes.
                current.append(max(previous[col], current[col - 1]))
        previous = current
    return previous[-1]

def word_lcs(text_a: str, text_b: str) -> int:
    """Return the longest-common-subsequence length of two texts, counted in whole words.

    Both texts are cleaned with ``preprocess()`` and split on whitespace; the
    resulting word lists are compared with ``lcs()``.
    """
    return lcs(preprocess(text_a).split(), preprocess(text_b).split())


In [58]:
# Tests
# Character-level check: e.g. "anana" is a common subsequence of length 5.
assert lcs("banana", "ananas") == 5
# Word-level check on the two sample sentences.
assert word_lcs(text_a, text_b) == 4

## Levenshtein distance

The minimum number of operations that need to be performed in order to turn sequence A into sequence B.

Available operations:

* Replace element
* Remove element
* Add element

In [64]:

def levenshtein(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element replacements,
    removals and insertions needed to turn ``seq_a`` into ``seq_b``. Works on
    any sequences whose elements can be compared with ``!=``.
    """
    # previous[j] is the distance between the seq_a prefix processed so far
    # and seq_b[:j]; an empty prefix needs j insertions.
    previous = list(range(len(seq_b) + 1))
    for row, item_a in enumerate(seq_a, start=1):
        # Reaching an empty seq_b prefix takes `row` removals.
        current = [row]
        for col, item_b in enumerate(seq_b, start=1):
            replace_cost = 1 if item_a != item_b else 0
            current.append(min(
                previous[col - 1] + replace_cost,  # replace (or keep a match)
                current[col - 1] + 1,              # insert item_b
                previous[col] + 1,                 # remove item_a
            ))
        previous = current
    return previous[-1]

    


def word_levenshtein(text_a: str, text_b: str) -> int:
    """Return the Levenshtein distance between two texts, counted in whole words.

    Both texts are cleaned with ``preprocess()`` and split on whitespace; the
    resulting word lists are compared with ``levenshtein()``.
    """
    words_a, words_b = (preprocess(text).split() for text in (text_a, text_b))
    return levenshtein(words_a, words_b)


In [66]:
# Tests
# Character-level check: remove the leading "b" and append "s" (2 edits).
assert levenshtein("banana", "ananas") == 2
# Word-level check on the two sample sentences.
assert word_levenshtein(text_a, text_b) == 7