In [5]:
import numpy as np

from langchain_huggingface import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Model identifier handed to HuggingFaceEmbeddings as `model_name`; change it
# here to try a different embedding model throughout the notebook.
DEFAULT_EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"

In [7]:
# Build the embedding client from the configured model name, then echo the
# name it reports as a quick confirmation of which model is in use.
embedding_model = HuggingFaceEmbeddings(model_name=DEFAULT_EMBEDDING_MODEL)
print(embedding_model.model_name)

all-MiniLM-L6-v2


In [25]:
# --- B. Define Test Sentences ---
# Three probe sentences: one in-domain, one on a related health topic, and one
# deliberately off-topic, so their embeddings can be compared afterwards.

# In-domain sentence (PCOS).
sentence_pcos = (
    "Polycystic Ovary Syndrome (PCOS) is primarily characterized by hormonal imbalance."
)

# Related sentence: a different condition, but still a health topic.
sentence_diabetes = (
    "Diabetes Mellitus is a metabolic disorder characterized by high blood sugar levels."
)

# Off-topic sentence with no medical content.
# NOTE(review): the plural `sentences_` prefix is inconsistent with the two
# names above; it is kept because the embedding cell reads this exact name.
sentences_computer = (
    "The Python programming language is excellent for machine learning and web development."
)

In [None]:
# Embed the two health-related sentences, in order, with the shared model.
# NOTE(review): this cell has no execution count, yet later cells read
# vector_pcos and vector_diabetes -- run it before them on a fresh kernel.
vector_pcos, vector_diabetes = (
    embedding_model.embed_query(text)
    for text in (sentence_pcos, sentence_diabetes)
)




In [26]:
# Embed the off-topic (programming) sentence with the same model so it can be
# compared against the health-related vectors in the similarity cell.
vector_computer = embedding_model.embed_query(sentences_computer)

In [27]:
# Dump the three raw embeddings, one per line (PCOS, diabetes, computer).
# NOTE(review): this prints every component of each vector; slicing (e.g.
# vector_pcos[:5]) plus len() would keep the output readable.
print(vector_pcos, vector_diabetes, vector_computer, sep="\n")

[0.015075505711138248, -0.004277346655726433, -0.0518186017870903, 0.05898018926382065, 0.03557907044887543, -0.061084598302841187, 0.08376295864582062, 0.04922734573483467, 0.06900820881128311, 0.0053519695065915585, -0.07897625118494034, 0.10570316761732101, -0.041899748146533966, -0.011132310144603252, 0.03325052186846733, 0.10740663856267929, 0.025309143587946892, 0.03592119365930557, 0.03552217781543732, 0.0735660195350647, 0.04933444410562515, -0.07086186110973358, -0.041451770812273026, 0.05140148848295212, -0.09363303333520889, -0.08061583340167999, 0.03728669136762619, 0.04091250151395798, -0.05874478444457054, 0.06742937117815018, -0.052063122391700745, 0.07978279888629913, 0.008350413292646408, 0.03363165631890297, 0.026623617857694626, 0.06374864280223846, -0.03526676446199417, -0.03698297590017319, -0.061514586210250854, -0.0437050387263298, 0.07375186681747437, -0.042866867035627365, -0.010750862769782543, 0.0863417536020279, 0.008700708858668804, 0.06442583352327347, -0.

In [28]:
# Stack the three embeddings into one 2-D array (one row per sentence) so a
# single cosine_similarity call returns every pairwise score at once, instead
# of reshaping and calling it separately for each pair.
vectors_to_compare = [vector_pcos, vector_diabetes, vector_computer]
similarity_matrix = cosine_similarity(np.array(vectors_to_compare))

# Rows and columns follow the order of vectors_to_compare:
# index 0 = PCOS, 1 = diabetes, 2 = computer.
similarity_pcos_diabetes = similarity_matrix[0, 1]
similarity_pcos_computer = similarity_matrix[0, 2]

# Report the PCOS sentence against each of the other two, to 4 decimals.
# Note: cosine similarity can in general be negative (down to -1.0); the
# 0.0-1.0 legend below is a simplification for this demo.
print("\n--- Semantic Similarity Scores (0.0 = different, 1.0 = same) ---")
print(f"PCOS vs. Diabetes (Similar meaning): {similarity_pcos_diabetes:.4f}")
print(f"PCOS vs. Computer (Unrelated meaning): {similarity_pcos_computer:.4f}")


--- Semantic Similarity Scores (0.0 = different, 1.0 = same) ---
PCOS vs. Diabetes (Similar meaning): 0.1970
PCOS vs. Computer (Unrelated meaning): 0.0370
