In [1]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
original1 = "Thank your message to show our words to the doctor, as his next contract checking, to all of us."
reconstructed1 = "Thank your note to show our words to the supervisor, as his next agreement checking, to all of us."
original2 = "Also, kindly remind me please, if the doctor still plan for the acknowledgments section edit before he sending again."
reconstructed2 = "Also, kindly let me know me please, if the supervisor still intend for the thank you note part change before he sends it again."

In [3]:
sentences = [original1, reconstructed1, original2, reconstructed2]
tokenized_sentences = [simple_preprocess(s) for s in sentences]
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=2)

In [4]:
def average_similarity(words1, words2, model):
    similarities = []
    for w1 in words1:
        if w1 in model.wv:
            max_sim = max([
                cosine_similarity([model.wv[w1]], [model.wv[w2]])[0][0]
                for w2 in words2 if w2 in model.wv
            ], default=0)
            similarities.append(max_sim)
    return np.mean(similarities) if similarities else 0

In [5]:
def compare_sentences(original, reconstructed):
    tokens1 = simple_preprocess(original)
    tokens2 = simple_preprocess(reconstructed)
    sim1 = average_similarity(tokens1, tokens2, model)
    sim2 = average_similarity(tokens2, tokens1, model)
    return (sim1 + sim2) / 2

In [6]:
similarity_1 = compare_sentences(original1, reconstructed1)
similarity_2 = compare_sentences(original2, reconstructed2)
print(f"Similarity score (original1 vs reconstructed1): {similarity_1:.4f}")
print(f"Similarity score (original2 vs reconstructed2): {similarity_2:.4f}")


Similarity score (original1 vs reconstructed1): 0.8700
Similarity score (original2 vs reconstructed2): 0.6652
