# Download embeddings model from hugging face

In [77]:
import sys
print(sys.executable)

/root/MLX_Week2-1/venv/bin/python


## get model for embeddings

In [78]:
from huggingface_hub import hf_hub_download
import torch
from model import CBOW

# model withou title hackers
model_path = hf_hub_download(repo_id="cocoritzy/cbow-upvotes_model", filename="cbow_model.pt")

## retreive the checkpoints --> A checkpoint is a file that saves the state of your model

In [79]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(model_path, map_location=device) #A checkpoint is a file that saves the state of your model (


In [80]:
print(checkpoint["model_state_dict"])

OrderedDict([('embeddings.weight', tensor([[-6.4979e-01, -5.7941e-01,  6.6106e-01,  ..., -2.2381e+00,
         -4.7060e-04,  1.5041e-01],
        [-4.9834e-01, -1.3562e+00,  5.9971e-02,  ...,  2.2810e-02,
         -4.2601e+00,  1.6657e+00],
        [-3.5826e-01,  1.8044e+00,  3.4292e-01,  ..., -4.9925e-01,
          6.3616e-02, -2.0757e-01],
        ...,
        [ 3.0328e+00,  2.2346e+00, -4.0947e+00,  ...,  1.5389e+00,
         -3.1774e+00,  1.5126e+00],
        [ 9.5289e-01, -7.5809e-01, -1.0489e+00,  ...,  2.7495e+00,
         -2.2415e+00,  6.3896e-01],
        [ 6.0927e-01, -3.6017e-01, -2.4570e+00,  ...,  2.5984e+00,
         -1.5189e+00, -1.4678e+00]], device='cuda:0')), ('linear.weight', tensor([[-0.9190, -3.4283, -1.5246,  ...,  0.7419,  2.3657, -0.5639],
        [-0.9271, -3.3398, -1.2597,  ...,  0.3505,  2.7461, -0.6402],
        [-1.0051, -3.6519, -1.2860,  ...,  0.3628,  2.5529, -0.4234],
        ...,
        [-1.0449, -2.6130, -1.2064,  ...,  0.3532,  2.4671, -0.5399],
   

## load the dimension - for embedding dim and vocab_list

In [81]:

token_to_index = checkpoint["token_to_index"]
embedding_dim= checkpoint["embedding_dim"]
vocab_size = len(token_to_index)  # fill in actual size


# Load the model architecture

In [82]:

model = CBOW(voc=vocab_size, emb=embedding_dim)

## load model parameters

In [83]:

model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
model.eval()# it contains the model's parameters and other information needed to resume training or make predictions.

CBOW(
  (embeddings): Embedding(30000, 100)
  (linear): Linear(in_features=100, out_features=30000, bias=True)
)

# Load query and title documents - text hard and soft

In [84]:

from datasets import load_dataset

df_hn = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")
df_sn = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
# dataset = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives", split="train[:10%]") # 10% of the data


split_data_hn = df_hn["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test
split_data_sn = df_sn["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test

df_hn = split_data_hn["train"].to_pandas()
df_sf = split_data_sn["train"].to_pandas()


In [114]:
df_hn['query'].describe()

count                           63760
unique                          63760
top       why do atoms have no charge
freq                                1
Name: query, dtype: object

# Convert title to embedding

In [86]:
embedding_layer = model.embeddings

In [87]:
def title_to_embedding(words):
    tokens = words.lower().split()
    indices = [token_to_index.get(tok, 0) for tok in tokens]  # 0 for unknowns - get the value associated with the words

    indices_tensor = torch.tensor(indices, dtype=torch.long, device=device) # converts the list indices into a PyTorch tensors

    with torch.no_grad(): # This makes the code faster and uses less memory, because you're not training, just extracting embeddings.
        embeds = embedding_layer(indices_tensor) # [num_tokens, embedding_dim]
        return embeds.mean(dim=0) # average pooling


## little test on the side 

In [88]:
import torch.nn.functional as F

query = df_hn.iloc[12]["query"]
negative = df_hn.iloc[12]["negative_passage"]
postive = df_hn.iloc[12]["positive_passage"]

query_emb = title_to_embedding("bad")
negative_emb = title_to_embedding("good")
positive_emb = title_to_embedding("bad")

def cosine_similarity(x, y):
    return F.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)).item()

sim_pos = cosine_similarity(query_emb, positive_emb)
sim_neg = cosine_similarity(query_emb, negative_emb)
print(sim_pos, sim_neg)

0.9999998807907104 0.6890109777450562


## apply it on all the datasets

In [89]:
import pandas as pd

def compute_similarity_from_row(row):
    query_vec = title_to_embedding(row['query'])
    pos_vec = title_to_embedding(row['positive_passage'])
    neg_vec = title_to_embedding(row['negative_passage'])

    sim_pos = F.cosine_similarity(query_vec.unsqueeze(0), pos_vec.unsqueeze(0)).item()
    sim_neg = F.cosine_similarity(query_vec.unsqueeze(0), neg_vec.unsqueeze(0)).item()

    return pd.Series({'sim_pos': sim_pos, 'sim_neg': sim_neg})


In [90]:
df_hn[["sim_pos", "sim_neg"]] = df_hn.apply(compute_similarity_from_row, axis=1)

In [91]:
df_hn["correct"] = df_hn["sim_pos"] > df_hn["sim_neg"]
accuracy = df_hn["correct"].mean()
print("Accuracy:", accuracy)


Accuracy: 0.5690401505646173


In [92]:
df_hn.head()

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_index_in_group,sim_pos,sim_neg,correct
0,22392,is it okay to mix magnesium and cranberry when...,Cranberry has relatively high levels of oxalat...,"However, studies suggest that cranberry does n...",8,0.609932,0.633598,False
1,41323,salary range associate professor,Associate Professor Salary. Associate Professo...,Professors in the United States are often vete...,1,0.748564,0.125579,True
2,42361,weather in tenerife in may,Below is the typical weather in Tenerife in Ma...,Weather in Tenerife in May. Home to some of Eu...,1,0.277476,0.365389,False
3,80083,what is morocco main language,The official language of Morocco is Arabic whi...,"Arabic, along with Berber, is one of two Moroc...",4,0.316427,0.051429,True
4,54985,tdap vaccine age limit,DTaP is approved for children under age 7. Tda...,1 Krishnarajah G. Cost-effectiveness analysis ...,1,0.451703,0.189826,True


In [93]:
df_hn['sim_neg'].mean()

np.float64(0.4157390201678429)

In [94]:
df_hn['sim_pos'].mean()

np.float64(0.4431712256025563)

In [95]:
df_hn.describe()

Unnamed: 0,query_id,negative_index_in_group,sim_pos,sim_neg
count,63760.0,63760.0,63760.0,63760.0
mean,60827.147114,4.601239,0.443171,0.415739
std,23756.762111,2.496681,0.166992,0.172019
min,19699.0,1.0,-0.324485,-0.422133
25%,40242.75,2.0,0.336305,0.305455
50%,60815.0,4.0,0.454442,0.42805
75%,81290.25,7.0,0.562367,0.539422
max,102128.0,10.0,0.957413,0.958487


## try better encoding 

# Instantiate towers

In [118]:
import torch
import torch.nn as nn

class QryTower(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(100, 1)

    def forward(self, x):
        return self.fc(x)

class DocTower(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(100, 1)

    def forward(self, x):
        return self.fc(x)

qryTower = QryTower()
docTower = DocTower()


In [119]:
def title_to_embedding(words):
    tokens = words.lower().split()
    indices = [token_to_index.get(tok, 0) for tok in tokens]  # 0 for unknowns - get the value associated with the words

    indices_tensor = torch.tensor(indices, dtype=torch.long, device=device) # converts the list indices into a PyTorch tensors

    with torch.no_grad(): # This makes the code faster and uses less memory, because you're not training, just extracting embeddings.
        embeds = embedding_layer(indices_tensor) # [num_tokens, embedding_dim]
        return embeds.mean(dim=0) # average pooling

In [120]:
query_vecs = []
pos_vecs = []
neg_vecs = []

for _, row in df_hn.iterrows():
    query_vecs.append(title_to_embedding(row["query"]))
    pos_vecs.append(title_to_embedding(row["positive_passage"]))
    neg_vecs.append(title_to_embedding(row["negative_passage"]))

# Stack into tensors: [N, D]
query_vecs = torch.stack(query_vecs)
pos_vecs = torch.stack(pos_vecs)
neg_vecs = torch.stack(neg_vecs)


In [121]:
pos_vecs[12].shape

torch.Size([100])

## Initialise the model - Two neural nets (QryTower, DocTower) that learn to project those embeddings into a new space

In [None]:
qryTower = QryTower().to(device) #Initialize and move models to the right device
docTower = DocTower().to(device)


## Pass embeddings through the models

In [None]:

qry = qryTower(query_vecs)   # [N, 1] 
pos = docTower(pos_vecs)     # [N, 1]
neg = docTower(neg_vecs)     # [N, 1]

# Cosine similarities along dim=1 (batch)
dst_pos = F.cosine_similarity(qry, pos, dim=1)
dst_neg = F.cosine_similarity(qry, neg, dim=1)

dst_dif = dst_pos - dst_neg
margin = 0.2
loss = torch.clamp(margin - dst_dif, min=0).mean()

loss.backward()


## define optimizer

In [124]:
import torch.optim as optim

optimizer = optim.Adam(
    list(qryTower.parameters()) + list(docTower.parameters()), 
    lr=1e-3
)


In [None]:
qryTower.train() # Puts both models into training mode
docTower.train()

for epoch in range(1):  # you can increase epochs
    total_loss = 0

    for i, row in df_hn.iterrows():
        query_vec = title_to_embedding(row["query"]).unsqueeze(0).to(device)
        pos_vec = title_to_embedding(row["positive_passage"]).unsqueeze(0).to(device)
        neg_vec = title_to_embedding(row["negative_passage"]).unsqueeze(0).to(device)

        qry = qryTower(query_vec)
        pos = docTower(pos_vec)
        neg = docTower(neg_vec)

        dst_pos = F.cosine_similarity(qry, pos)
        dst_neg = F.cosine_similarity(qry, neg)
        margin = 0.2
        loss = torch.clamp(margin - (dst_pos - dst_neg), min=0).mean() #ensures the loss is never negative

        optimizer.zero_grad() # clears old gradients
        loss.backward() # computes new gradients from the loss
        optimizer.step() #updates model weights to reduce the loss

        total_loss += loss.item() # Accumulate the scalar loss so you can report the average later

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(df_hn):.4f}")


Epoch 1, Loss: 0.3508


In [None]:
from torch.utils.data import Dataset

class TripletDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        query = title_to_embedding(row["query"])
        pos = title_to_embedding(row["positive_passage"])
        neg = title_to_embedding(row["negative_passage"])
        return query, pos, neg


In [None]:
from torch.utils.data import DataLoader

dataset = TripletDataset(df_hn)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
