# Text similarity methods 

In [1]:
import re
import math
import numpy as np
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import fclusterdata

***

## 1. Zaimplementuj przynajmniej 3 "metryki" spośród wymienionych: cosinusowa, LCS, DICE, euklidesowa, Levenshteina.

### Cosine similarity

$$
\begin{equation}
    \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}= \frac{\sum\limits_{i=1}^{n} A_i B_i}{\sqrt{\sum\limits_{i=1}^{n} A_i^2} \sqrt{\sum\limits_{i=1}^{n} B_i^2}}
    \qquad\begin{aligned}
    &\text{where:} \\
    &\mathbf{A}\text{ and }\mathbf{B} \text{ are the two vectors being compared}\\
    &n \text{ is the dimensionality of the vectors}\\
    &\theta \text{ represents the angle between two vectors } \mathbf{A} \text{ and } \mathbf{B} \text{ in a high-dimensional space}
    \end{aligned}
\end{equation}
$$

The dot product of $\mathbf{A}$ and $\mathbf{B}$ is divided by the product of their Euclidean lengths to normalize the result to a range of [-1, 1]. A value of 1 indicates that the two vectors are identical, while a value of -1 indicates that they are completely dissimilar.


In [186]:
def cosine_similarity(vector_a, vector_b) -> float:
    if np.linalg.norm(vector_a) == 0 or np.linalg.norm(vector_b) == 0:
        return 2
    return -(np.dot(vector_a, vector_b) / (np.linalg.norm(vector_a) * np.linalg.norm(vector_b))) + 2

### Dice coefficient / Sørensen-Dice Index

$$
\begin{equation}
    \text{Dice}(A, B) = \frac{2 |A \cap B|}{|A| + |B|} 
    \qquad\begin{aligned}
    &\text{where:} \\
    &A \text{ and } B \text{ represent the two sets being compared} \\
    &|A| \text{ and } |B| \text{ represent the cardinality (number of elements) of the sets} \\
    &\text{and } |A \cap B| \text{ represents the size of the intersection of the two sets}
    \end{aligned}
\end{equation}
$$


In [303]:
def dice_coefficient(vector_a, vector_b) -> float:
    x = np.count_nonzero(vector_a)
    y = np.count_nonzero(vector_b)
    if  x+y == 0:
        return 2
    return -((2 * np.count_nonzero(vector_a * vector_b)) / (x+y)) + 2

### Euclidean distance

$$
\begin{equation}
    d(x,y) = \sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}
    \qquad\begin{aligned}
    &\text{where:} \\
    &d(x,y) \text{ is the Euclidean distance} \\
    &x_i, y_i \text{ are the values of the i-th dimension of vectors } x \text{ and } y \\
    &n \text{ is the number of dimensions in the vectors}
    \end{aligned}
\end{equation}
$$

In [6]:
def euclidean_distance( vector_a, vector_b) -> float:
    return np.linalg.norm(vector_a - vector_b)

## 2. Zaimplementuj przynajmniej 1 sposób oceny jakości klasteryzacji (np. indeks Daviesa-Bouldina).

In [479]:
def get_centroid(indexes, vectors):
    centroid = sum(vectors[indexes])
    centroid = centroid / len(indexes)
    return centroid

In [457]:
def get_avg_distance(centroid, vectors, indexes, metric):
    total = 0

    for i in range(len(indexes)):
        total += metric(centroid, vectors[indexes[i]])

    return total / len(indexes)

In [460]:
def davies_bouldin_index(vectors, clusters, metric):
    unique_clusters = set(clusters)
    
    centroids = {}
    for cluster_idx in unique_clusters:
        indexes = np.where(clusters == cluster_idx)[0]
        centroids[cluster_idx] = get_centroid(indexes, vectors)
    
    avg_distances = [0]
    for cluster_idx in unique_clusters:
        indexes = np.where(clusters == cluster_idx)[0]
        avg_distances.append(get_avg_distance(centroids[cluster_idx], vectors, indexes, metric))
    
    seperations = [[0 for _ in range(len(unique_clusters) + 1)] for _ in range(len(unique_clusters) + 1)]
    for i in unique_clusters:
        for j in unique_clusters:
            seperations[i][j] = metric(centroids[i], centroids[j])
    
    results = [[0 for _ in range(len(unique_clusters) + 1)] for _ in range(len(unique_clusters) + 1)]
    for i in unique_clusters:
        for j in unique_clusters:
            if i != j:
                results[i][j] = (avg_distances[i] + avg_distances[j]) / seperations[i][j]
    
    Ds = [0 for _ in range(len(unique_clusters) + 1)]
    for i in unique_clusters:
        result1 = results[i][:i]
        result2 = results[i][i+1:]
        if len(result1):
            Ds[i] = max(Ds[i], max(result1))
        if len(result2):
            Ds[i] = max(Ds[i], max(result2))
    
    return sum(Ds) / len(Ds)

## 3. Stwórz stoplistę najczęściej występujących słów i zastosuj ją jako pre-processing dla nazw. Algorytmy klasteryzacji powinny działać na dwóch wariantach: z pre-processingiem i bez pre-processingu.

In [7]:
class Words_Bagging:
    def __init__(self, texts: list[str]) -> None:
        self.texts_without_noises = self.remove_noises(texts)
        self.words_bag = self.find_words_bag()
        
    def remove_noises(self, texts) -> None:
        texts_without_noises = []
        for text in texts:
            text = text.lower()
            text = re.sub(r'[^\w\s]', '' , text)
            texts_without_noises.append(text)    
        
        return texts_without_noises
    
    def find_words_bag(self) -> dict[str, int]:
        words_bag = {}
        idx = 0

        for text in self.texts_without_noises:
            words = text.split()
            for word in words:
                if word not in words_bag:
                    words_bag[word] = idx
                    idx += 1

        return words_bag
    
    def get_words_bag(self) -> dict[str, int]:
        vectors = self.texts_to_vectors(self.words_bag)
        return self.words_bag, vectors
    
    def texts_to_vectors(self, words_bag: dict[str, int]) -> list[list[int]]:
        freq_vecs = []

        for text in self.texts_without_noises:
            vector = np.zeros(len(words_bag))
            words = text.split()
            for word in words:
                if word in words_bag:
                    idx = words_bag[word]
                    vector[idx] += 1
            
            freq_vecs.append(VectorText(text, vector))

        return freq_vecs
    
    def get_preprocessed_words_bag(self, number_of_most_popular_words_to_remove: int = None) -> dict[str, int]:
        if number_of_most_popular_words_to_remove is None:
            number_of_most_popular_words_to_remove = len(self.words_bag) // 5
            
        preprocessed_words_bag = self.preprocess(number_of_most_popular_words_to_remove)
        vectors = self.texts_to_vectors(preprocessed_words_bag)
        return preprocessed_words_bag, vectors
    
    def preprocess(self, number_of_most_popular_words_to_remove: int) -> str:
        stop_list = self.get_stop_list(number_of_most_popular_words_to_remove)
        preprocessed_words_bag = self.words_bag.copy()
        
        for stop_word in stop_list:
            preprocessed_words_bag.pop(stop_word)
        
        idx = 0
        for key in preprocessed_words_bag.keys():
            preprocessed_words_bag[key] = idx
            idx += 1
            
        return preprocessed_words_bag
    
    def get_stop_list(self, size: int) -> list[str]:
        stop_list = []
        
        words_frequency = self.get_words_frequency()    
        words_with_frequencies = [(word, words_frequency[index]) for (word, index) in self.words_bag.items()]
        sorted_words_with_frequencies = sorted(words_with_frequencies, key=lambda x: -x[1])
        
        stop_list = [sorted_words_with_frequencies[i][0] for i in range(size)]

        return stop_list
    
    def get_words_frequency(self) -> list[int]:
        words_frequency = [0 for _ in range(len(self.words_bag))]

        for text in self.texts_without_noises:
            available = [True for _ in range(len(self.words_bag))]
            words = text.split()
            for word in words:
                idx = self.words_bag[word]
                if available[idx]:
                    words_frequency[idx] += 1
                    available[idx] = False

        return words_frequency

In [8]:
class VectorText:
    def __init__(self, text, vector):
        self.text = text
        self.vector = vector

## 4. Wykonaj klasteryzację zawartości załączonego pliku (lines.txt) przy użyciu  metryk zaimplementowanych w pkt. 1. Każda linia to adres pocztowy firmy, różne sposoby zapisu tego samego adresu powinny się znaleźć w jednym klastrze.

In [431]:
with open("lines.txt") as file:
    lines = file.readlines()

lines = lines[:1000]
WB = Words_Bagging(lines)

***

In [432]:
words_bag_without_preprocessing, vectors_without_preprocessing = WB.get_words_bag()
vectors1 = np.array([vector.vector for vector in vectors_without_preprocessing])

In [433]:
result_cosine_similarity_without_preprocessing = fclusterdata(vectors1, t=5000, criterion="maxclust",metric=cosine_similarity)
result_cosine_similarity_without_preprocessing = np.array(result_cosine_similarity_without_preprocessing)

In [434]:
result_dice_coefficient_without_preprocessing = fclusterdata(vectors1, t=5000, criterion="maxclust",metric=dice_coefficient)
result_dice_coefficient_without_preprocessing = np.array(result_dice_coefficient_without_preprocessing)

In [435]:
result_euclidean_distance_without_preprocessing = fclusterdata(vectors1, t=5000, criterion="maxclust",metric=euclidean_distance)
result_euclidean_distance_without_preprocessing = np.array(result_euclidean_distance_without_preprocessing)

***

In [499]:
words_bag_with_preprocessing, vectors_with_preprocessing = WB.get_preprocessed_words_bag(400)
vectors2 = np.array([vector.vector for vector in vectors_with_preprocessing])

In [500]:
result_cosine_similarity_with_preprocessing = fclusterdata(vectors2, t=5000, criterion="maxclust",metric=cosine_similarity)
result_cosine_similarity_with_preprocessing = np.array(result_cosine_similarity_with_preprocessing)

In [501]:
result_dice_coefficient_with_preprocessing = fclusterdata(vectors2, t=5000, criterion="maxclust",metric=dice_coefficient)
result_dice_coefficient_with_preprocessing = np.array(result_dice_coefficient_with_preprocessing)

In [502]:
result_euclidean_distance_with_preprocessing = fclusterdata(vectors2, t=5000, criterion="maxclust",metric=euclidean_distance)
result_euclidean_distance_with_preprocessing = np.array(result_euclidean_distance_with_preprocessing)

## 5. Porównaj jakość wyników sposobami zaimplementowanymi w pkt. 2.

### Metryka - cosinus similarity bez stoplisty

In [491]:
davies_bouldin_index(vectors1, result_cosine_similarity_without_preprocessing, cosine_similarity)

1.4772602942089679

### Metryka - cosinus similarity ze stoplistą

In [503]:
davies_bouldin_index(vectors2, result_cosine_similarity_with_preprocessing, cosine_similarity)

1.584115819992787

### Metryka - dice coefficient bez stoplisty

In [493]:
davies_bouldin_index(vectors1, result_dice_coefficient_without_preprocessing, dice_coefficient)

1.4354596505140353

### Metryka - dice coefficient ze stoplistą

In [504]:
davies_bouldin_index(vectors2, result_dice_coefficient_with_preprocessing, dice_coefficient)

1.5490550850985179

### Metryka - euclidean distance bez stoplisty

In [495]:
davies_bouldin_index(vectors1, result_euclidean_distance_without_preprocessing, euclidean_distance)

0.0

### Metryka - euclidean distance ze stoplistą

In [505]:
davies_bouldin_index(vectors2, result_euclidean_distance_with_preprocessing, euclidean_distance)

0.0

## 6. Czy masz jakiś pomysł na poprawę jakości klasteryzacji w tym zadaniu?

1. Możnaby lepiej dobrać liczbę najpopularniejszych słów do usunięcia z tekstów, oraz usunąć rzadko pojawiające się słowa
2. Lepsze dane - na danych które mieliśmy nasze metryki mogły pomijać rezultaty w których spacje są wstawione w środku zdań, możnaby zastosować dodatkowy preprocessing.
3. Uwzględnienie tego by wyrazy które się w sobie zawierają były bliżej lub nawet były traktowane jako to samo.