In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from gensim import downloader as gensim_api
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np

# 📜 Instructions

Define two sentences with an ambiguous word, like "bank":
```python
sentence_a = "He sat on the river bank."
sentence_b = "He went to the bank to deposit money."
```

## Word2Vec (Static):

1. Load a pre-trained static word-embedding model (e.g., the GloVe vectors `glove-wiki-gigaword-100` from gensim.downloader, used here as a stand-in for Word2Vec).
2. Get the vector for the word "bank".
3. **Observation:** Notice that you can only get one vector for "bank," regardless of context.

## BERT (Contextual):

1. Load a pre-trained BERT model and tokenizer (e.g., `bert-base-uncased`).
2. Tokenize sentence_a and pass it to the model to get its hidden states (embeddings).
3. Find the token index corresponding to the word "bank" in sentence_a and extract its vector from the last hidden state.
4. Repeat the process for sentence_b.
5. You now have two different vectors for "bank": `bank_vector_a` and `bank_vector_b`.
6. Calculate the cosine similarity between `bank_vector_a` and `bank_vector_b`.

In [3]:
# Two sentences that use the same surface word in different contexts
# (a river in sentence_a, depositing money in sentence_b).
sentence_a = "He sat on the river bank."
sentence_b = "He went to the bank to deposit money."
# The ambiguous word whose embedding is looked up in both sentences.
target_word = "bank"

In [4]:
# Word2Vec (Static)
# NOTE: the section is labelled Word2Vec, but the model name requested below
# refers to GloVe vectors; it plays the role of the static embedding here.

glove = gensim_api.load("glove-wiki-gigaword-100") # load only once; later cells reuse `glove`

In [5]:
def tokenize(text):
    """Split ``text`` on whitespace into lowercase tokens without edge punctuation.

    Words that consist only of punctuation (e.g. a free-standing "...") are
    dropped instead of being returned as empty strings, and the stripped
    character set matches the `tokenize` defined later in this notebook, so
    both definitions behave the same regardless of which cell ran last.

    Args:
        text: Sentence to tokenize.

    Returns:
        List of non-empty, lowercased tokens in their original order.
    """
    punctuation = ".,!?;:()\"'"
    stripped = (word.strip(punctuation) for word in text.split())
    return [word.lower() for word in stripped if word]

# Tokenize both sentences with the simple whitespace tokenizer defined above.
tokens_a = tokenize(sentence_a)
tokens_b = tokenize(sentence_b)

# Index of the first occurrence of the target word in each token list.
# The trailing [0] raises IndexError if the word does not occur at all.
idx_a = [i for i, t in enumerate(tokens_a) if t == target_word][0]
idx_b = [i for i, t in enumerate(tokens_b) if t == target_word][0]

print(f"Sentence A tokens: {tokens_a}")
print(f"Sentence B tokens: {tokens_b}")
print()
print(f"Positions of '{target_word}' in A: {idx_a}")
print(f"Positions of '{target_word}' in B: {idx_b}")
print()

# Read the word back from each sentence at the position just found.
bank_a = tokens_a[idx_a]
bank_b = tokens_b[idx_b]

# Static lookup: the vector is fetched by the word string alone, so the
# surrounding sentence plays no part in what is returned.
bank_vec_in_a = glove.get_vector(bank_a)
bank_vec_in_b = glove.get_vector(bank_b)

print(f"First 5 values of 'bank' in sentence a: {bank_vec_in_a[:5]}")
print(f"First 5 values of 'bank' in sentence b: {bank_vec_in_b[:5]}")
print()

# Element-wise comparison (within np.allclose tolerance) of the two lookups.
same_values = np.allclose(bank_vec_in_a, bank_vec_in_b)
print(f"'bank' vectors across the sentences are {'' if same_values else 'not '}the same")

Sentence A tokens: ['he', 'sat', 'on', 'the', 'river', 'bank']
Sentence B tokens: ['he', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money']

Positions of 'bank' in A: 5
Positions of 'bank' in B: 4

First 5 values of 'bank' in sentence a: [ 0.41869  -0.92211   0.048684  0.11798   0.22062 ]
First 5 values of 'bank' in sentence b: [ 0.41869  -0.92211   0.048684  0.11798   0.22062 ]

'bank' vectors across the sentences are the same


In [None]:
# BERT (Contextual)

# run only once: `tokenizer` and `model` are module-level objects that the
# helper functions in later cells (get_token_vector, encode_mean) read directly.

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Put the model in evaluation mode; it is only used for forward passes below.
model.eval()

In [7]:
def get_token_vector(sentence, target):
    """Return the last-layer embedding of ``target`` as it appears in ``sentence``.

    Relies on the module-level ``tokenizer`` and ``model``. The tokenized
    sentence is printed so the reader can see which tokens were produced.

    Args:
        sentence: Text to encode.
        target: Word to locate among the tokens produced by the tokenizer.

    Returns:
        NumPy array with the last-hidden-state row of the first matching token.

    Raises:
        ValueError: If no token matches ``target``.
    """
    tokens = tokenizer(sentence, return_tensors="pt")
    # Inference only: skip autograd bookkeeping, as encode_mean does.
    with torch.no_grad():
        outputs = model(**tokens)
    last_hidden_state = outputs.last_hidden_state.squeeze(0)

    token_strs = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
    print(f"\nTokens for '{sentence}':")
    print(token_strs)

    # Prefer an exact token match so that a longer token merely containing the
    # target (e.g. "banker" for "bank") is not picked ahead of the real word.
    target_indices = [i for i, tok in enumerate(token_strs) if tok == target]
    if not target_indices:
        # Fall back to the substring test so inputs that only matched by
        # containment before still resolve to the same token.
        target_indices = [i for i, tok in enumerate(token_strs) if target in tok]
    if not target_indices:
        raise ValueError(f"Target '{target}' not found in tokenized output.")

    idx = target_indices[0]
    vec = last_hidden_state[idx].detach().numpy()
    return vec

# Contextual embedding of the target word in each sentence; each call also
# prints the tokens produced for its sentence.
vec_a = get_token_vector(sentence_a, target_word)
vec_b = get_token_vector(sentence_b, target_word)

def cosine_similarity(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v`` as a Python float.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case (the same convention as `cosine` in this notebook)
    instead of dividing by zero and producing nan.

    Args:
        u: First 1-D vector.
        v: Second 1-D vector, same length as ``u``.

    Returns:
        dot(u, v) / (||u|| * ||v||), or 0.0 when the denominator is zero.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return float(np.dot(u, v) / norm_product)

# Compare the two contextual vectors obtained for the same word.
similarity = cosine_similarity(vec_a, vec_b)

print(f"\nCosine similarity between 'bank' in two contexts: {similarity:.6f}")


Tokens for 'He sat on the river bank.':
['[CLS]', 'he', 'sat', 'on', 'the', 'river', 'bank', '.', '[SEP]']

Tokens for 'He went to the bank to deposit money.':
['[CLS]', 'he', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.', '[SEP]']

Cosine similarity between 'bank' in two contexts: 0.472152


## 📜 Instructions

Define a query sentence and a list of candidate sentences:
```python
query = "A man is playing a guitar."
candidates = [
    "A person is strumming a musical instrument.",  # High similarity, different words
    "A woman is playing a guitar.",                 # High similarity, different subject
    "A man is eating a sandwich.",                  # Low similarity, same subject
    "The guitar is being played by a man.",         # High similarity, passive voice
    "Dogs are chasing a cat."                       # No similarity
]
```

### Method 1: Word2Vec (with Averaging)

For the query and each candidate sentence:
1. Get the Word2Vec vector for each word (you may want to lowercase and remove stop words).
2. Create a single "sentence vector" by averaging the vectors of all words in the sentence.
3. Calculate the cosine similarity between the query vector and each candidate vector.
4. Rank the candidates from most to least similar to the query.

### Method 2: BERT (with Mean Pooling)

For the query and each candidate sentence:
1. Get the BERT token embeddings (last hidden state) for the sentence.
2. Create a single "sentence vector" by averaging all the token embeddings (this is a common but often sub-optimal strategy called "mean pooling").
3. Calculate the cosine similarity between the query vector and each candidate vector.
4. Rank the candidates.

### Method 3: Sentence-BERT (The Right Tool)

1. Load a pre-trained Sentence-BERT model (e.g., `all-MiniLM-L6-v2`).
2. Use the `model.encode()` function to get a single, high-quality sentence embedding for the query and each candidate.
3. Calculate the cosine similarity between the query embedding and each candidate embedding.
4. Rank the candidates.

In [8]:
# Sentence every candidate is compared against.
query = "A man is playing a guitar."
# Candidate sentences; each trailing comment states the similarity to the
# query that a good sentence embedding is expected to report.
candidates = [
    "A person is strumming a musical instrument.",  # High similarity, different words
    "A woman is playing a guitar.",                 # High similarity, different subject
    "A man is eating a sandwich.",                  # Low similarity, same subject
    "The guitar is being played by a man.",         # High similarity, passive voice
    "Dogs are chasing a cat."                       # No similarity
]

def cosine(u, v):
    """Cosine similarity of ``u`` and ``v``, computed on float32 copies.

    Args:
        u: First vector (anything ``np.asarray`` accepts).
        v: Second vector, same length as ``u``.

    Returns:
        The similarity as a Python float, or 0.0 when the product of the
        two norms is zero.
    """
    lhs, rhs = (np.asarray(arr, dtype=np.float32) for arr in (u, v))
    dot = float(np.dot(lhs, rhs))
    norm_product = float(np.linalg.norm(lhs) * np.linalg.norm(rhs))
    if norm_product == 0.0:
        return 0.0
    return dot / norm_product

def rank_by_similarity(query_vec, cand_vecs, labels):
    """Score each candidate against the query and list them best-first.

    Args:
        query_vec: Vector for the query.
        cand_vecs: Iterable of candidate vectors.
        labels: Sequence indexed like ``cand_vecs``; supplies the label
            reported for each candidate.

    Returns:
        List of ``(label, score)`` tuples in descending score order, where
        the score comes from `cosine`.
    """
    similarities = [cosine(query_vec, candidate) for candidate in cand_vecs]
    # argsort is ascending, so walk it back-to-front for best-first order.
    best_first = np.argsort(similarities)[::-1]
    ranking = []
    for position in best_first:
        ranking.append((labels[position], similarities[position]))
    return ranking

In [9]:
# Word2Vec (with Averaging)

# Lowercase words that sentence_vec leaves out before averaging word vectors:
# articles, forms of "to be", common prepositions/conjunctions and pronouns.
stopwords = {
    "a","an","the","is","am","are","was","were","be","been","being",
    "to","of","and","or","for","in","on","at","by","with","as","that","this",
    "it","its","his","her","their","he","she","they","you","i"
}

def tokenize(text):
    """Split ``text`` on whitespace into lowercase tokens without edge punctuation.

    Leading and trailing punctuation, brackets and quotes are stripped from
    each word; words that become empty after stripping are dropped.

    Note: this redefines the `tokenize` from the earlier static-embedding cell.

    Args:
        text: Sentence to tokenize.

    Returns:
        List of non-empty, lowercased tokens in their original order.
    """
    punctuation = ".,!?;:()\"'"
    stripped = (word.strip(punctuation) for word in text.split())
    return [word.lower() for word in stripped if word]

def sentence_vec(text):
    """Average the static word vectors of the content words in ``text``.

    Tokens are produced by `tokenize`; stop words and words missing from the
    `glove` vocabulary are skipped.

    Args:
        text: Sentence to embed.

    Returns:
        1-D array: the mean of the remaining word vectors. When no token
        survives the filtering (empty text, only stop words, or only
        out-of-vocabulary words) a zero vector of length ``glove.vector_size``
        is returned rather than raising from ``np.stack`` on an empty list.
    """
    tokens = [t for t in tokenize(text) if t not in stopwords and t in glove.key_to_index]
    if not tokens:
        # Nothing to average; a zero vector has zero norm, which `cosine`
        # scores as 0.0 against any other vector.
        return np.zeros(glove.vector_size, dtype=np.float32)
    token_vectors = [glove.get_vector(t) for t in tokens]
    matrix = np.stack(token_vectors, axis=0)
    return matrix.mean(axis=0)

# Embed the query and every candidate with the averaged static vectors, then
# rank the candidates by cosine similarity to the query.
# NOTE(review): `query_vector`, `candidate_vectors` and `ranked` are assigned
# again by the BERT mean-pooling cell below, so their values depend on which
# of the two cells ran last.
query_vector = sentence_vec(query)
candidate_vectors = [sentence_vec(s) for s in candidates]
ranked = rank_by_similarity(query_vector, candidate_vectors, candidates)

# One line per candidate: rank, score to 4 decimals, sentence (best first).
for i, (sent, score) in enumerate(ranked, 1):
    print(f"{i}. {score:.4f}  |  {sent}")

1. 0.9838  |  The guitar is being played by a man.
2. 0.9730  |  A woman is playing a guitar.
3. 0.7859  |  A person is strumming a musical instrument.
4. 0.5927  |  A man is eating a sandwich.
5. 0.4672  |  Dogs are chasing a cat.


In [10]:
# BERT (with Mean Pooling)

def encode_mean(sentence):
    """Embed ``sentence`` by mask-weighted averaging of BERT's last hidden state.

    Relies on the module-level ``tokenizer`` and ``model``; the forward pass
    runs under ``torch.no_grad()``.

    Args:
        sentence: Text to encode (tokenized with ``truncation=True``).

    Returns:
        NumPy array: the sum of the token embeddings selected by the attention
        mask divided by the number of selected tokens (at least 1), with the
        leading batch dimension squeezed away.
    """
    encoded = tokenizer(sentence, truncation=True, return_tensors="pt")

    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Trailing axis added so the mask broadcasts across the embedding dimension.
    keep = encoded["attention_mask"].unsqueeze(-1)

    masked_total = (token_states * keep).sum(dim=1)
    # Clamp guards against dividing by zero if the mask selects no token.
    kept_count = keep.sum(dim=1).clamp(min=1)
    sentence_embedding = masked_total / kept_count

    return sentence_embedding.squeeze(0).cpu().numpy()

# Embed the query and all candidates with mean-pooled BERT; the candidate
# embeddings are stacked row-wise into a single 2-D array.
# NOTE(review): this reassigns `query_vector`, `candidate_vectors` and `ranked`
# from the static-embedding ranking cell above.
query_vector = encode_mean(query)
candidate_vectors = np.vstack([encode_mean(s) for s in candidates])

# Iterating the stacked array inside rank_by_similarity yields one row per candidate.
ranked = rank_by_similarity(query_vector, candidate_vectors, candidates)

print("\nBERT (Mean Pooling) similarity results:")
# One line per candidate: rank, score to 4 decimals, sentence (best first).
for i, (sent, score) in enumerate(ranked, 1):
    print(f"{i}. {score:.4f}  |  {sent}")


BERT (Mean Pooling) similarity results:
1. 0.9841  |  A woman is playing a guitar.
2. 0.8898  |  The guitar is being played by a man.
3. 0.8878  |  A man is eating a sandwich.
4. 0.7919  |  Dogs are chasing a cat.
5. 0.7911  |  A person is strumming a musical instrument.
