# In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
import numpy as np
from scipy.spatial.distance import cdist
import itertools
import torch

""" Importing and initiating the model """
# Build the 'paraphrase-MiniLM-L6-v2' SentenceTransformer once at module level;
# the same `model` instance is reused by every `model.encode(...)` call below.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

""" Creating a function for parsing through the "filtered_phrases.txt", adding it all to the list and returning the list """
def parse_phrases_to_list(input_file):
    """Read a text file and return its non-blank lines as a list of phrases.

    Every line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are skipped. The file is decoded explicitly as UTF-8
    so the result does not depend on the platform's default encoding.

    Args:
        input_file: Path of the text file to read, one phrase per line.

    Returns:
        A list with the stripped, non-empty lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [phrase for phrase in stripped_lines if phrase]

input_file = 'Filtered_Phrases.txt'

phrases = parse_phrases_to_list(input_file)
if not phrases:
    # Fail early with a clear message instead of an obscure error from the
    # encoder or from DBSCAN further down.
    raise ValueError(f"No phrases found in {input_file!r}; nothing to cluster.")

# Generate one embedding vector per phrase (returned as a torch tensor).
embeddings = model.encode(phrases, convert_to_tensor=True)

# Pairwise cosine similarity between all phrases, as a square matrix.
cosine_sim_matrix = util.cos_sim(embeddings, embeddings)

# DBSCAN groups points that are *close*, so turn similarity into distance.
# The tensor is moved to host memory first: `.numpy()` raises for tensors that
# live on a GPU, which can happen when the model itself runs on one.
cosine_dist_matrix = 1 - cosine_sim_matrix.cpu().numpy()

# Floating-point rounding can yield similarities slightly above 1, i.e. tiny
# negative distances; a precomputed distance matrix must be non-negative.
cosine_dist_matrix = np.clip(cosine_dist_matrix, 0, None)

# eps is the maximum cosine distance at which two phrases count as neighbours.
# With min_samples=1 every phrase is a core point, so nothing is labelled noise.
db = DBSCAN(metric='precomputed', eps=0.3, min_samples=1)
labels = db.fit_predict(cosine_dist_matrix)

""" Selecting the correct canonical phrases, canonical meaning the "keys" in our dictionary of tags. """
unique_labels = set(labels)
grouped_phrases = {}

# NumPy and SciPy need host-memory arrays. Convert once, outside the loop, and
# go through `.cpu()` so this also works when the embeddings live on a GPU.
embeddings_np = embeddings.detach().cpu().numpy()

# Iterate labels in sorted order so the resulting dictionary order is stable.
for label in sorted(unique_labels):
    # Positions (into `phrases` / `embeddings_np`) of this cluster's members.
    indices = np.where(labels == label)[0]
    cluster_embeddings_np = embeddings_np[indices]

    # The canonical phrase is the member closest (in cosine distance) to the
    # mean embedding of the cluster.
    centroid = cluster_embeddings_np.mean(axis=0)
    distances = cdist([centroid], cluster_embeddings_np, metric='cosine')[0]
    canonical_index = indices[np.argmin(distances)]
    canonical_phrase = phrases[canonical_index]

    # Key: canonical phrase; value: every phrase in the cluster (itself included).
    grouped_phrases[canonical_phrase] = [phrases[i] for i in indices]

""" Making sure the canonical phrases are unique"""
# Re-embed only the canonical phrases and compare them pairwise to check that
# no two dictionary keys are near-duplicates of each other.
canonical_phrases = list(grouped_phrases)
canonical_embeddings = np.array(
    model.encode(canonical_phrases, convert_to_tensor=False)
)

pairwise_sim = util.cos_sim(canonical_embeddings, canonical_embeddings).numpy()

# Two canonical phrases whose cosine similarity exceeds this are reported.
uniqueness_threshold = 0.8
similar_pairs = [
    (first, second)
    for (row, first), (col, second) in itertools.combinations(
        enumerate(canonical_phrases), 2
    )
    if pairwise_sim[row][col] > uniqueness_threshold
]

# Diagnostic output only: similar pairs are reported at run time, not merged.
if not similar_pairs:
    print("All canonical phrases are semantically unique.")
else:
    print("Some canonical phrases are too similar:")
    for first, second in similar_pairs:
        print(f"- {first} and {second}")


"""Outputting the grouped dictionary that we have got."""
output_file = "Grouped_Output_bert_and_clustered.txt"

# Write one "canonical: [members]" line per cluster. The encoding is explicit so
# non-ASCII phrases are written the same way on every platform instead of
# depending on (and possibly failing under) the locale's default encoding.
with open(output_file, 'w', encoding='utf-8') as f_out:
    for canonical, group in grouped_phrases.items():
        f_out.write(f"{canonical}: {group}\n")

print("Grouped Tags:")

# Echo the same grouping to stdout to help spot issues at run time.
for canonical, group in grouped_phrases.items():
    print(f"{canonical}: {group}")
