Download embedding model 

In [117]:
from huggingface_hub import hf_hub_download
import torch
from model import CBOW

# model withou title hackers
model_path = hf_hub_download(repo_id="cocoritzy/cbow-upvotes_model", filename="cbow_model.pt")

Load model architecture and parameters

In [118]:
# Retrieve checkpoint 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(model_path, map_location=device) #A checkpoint is a file that saves the state of your model (
token_to_index = checkpoint["token_to_index"]
embedding_dim= checkpoint["embedding_dim"]
vocab_size = len(token_to_index)  # fill in actual size

In [119]:

model = CBOW(voc=vocab_size, emb=embedding_dim)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
model.eval()# it contains the model's parameters and other information needed to resume training or make predictions.
embedding_layer = model.embeddings


Load query and title documents - text hard and soft

In [120]:
from datasets import load_dataset

df_hn = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")
df_sn = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
# dataset = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives", split="train[:10%]") # 10% of the datab
df_hn = df_hn["train"].to_pandas()
df_sn = df_sn["train"].to_pandas()

In [121]:
df_sn

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...",The Pacific Ocean lies to the east while the S...,66360
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"Probiotics are found in foods such as yogurt, ...",88507
3,19702,price to install tile in shower,1 Install ceramic tile floor to match shower-A...,Iodine is critical to thyroid health and funct...,87550
4,19703,why conversion observed in body,Conversion disorder is a type of somatoform di...,The answer to the question how much does it co...,61479
...,...,...,...,...,...
79699,102124,meaning of propagation,definition of propagation the act or action of...,A minimum of two credits of laboratory science...,21857
79700,102125,do you have to do a phd to be a clinical psych...,The goal you choose will determine your path. ...,1 The mitochondria of eukaryotes evolved from ...,28764
79701,102126,what wine goes with oysters,You may also enjoy these other types of wine w...,Raynaud's (say ray-NOHZ) phenomenon is a probl...,42284
79702,102127,what strengths does lithium come in,"Lithium 150 mg. Lithium (Eskalith ® , Eskalith...",While kids feel like they’ve been grownups for...,42891


split train/test data

In [122]:
import random
from sklearn.model_selection import train_test_split
# Split into training and test sets (e.g. 20% train, 2% test = 90% drop)

train_df, test_df = train_test_split(
    df_sn,
    train_size=0.20,
    test_size=0.02,
    random_state=42
)


Instantiate towers

In [123]:
import torch
import torch.nn as nn

class QryTower(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(100, 1)

    def forward(self, x):
        return self.fc(x)

class DocTower(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(100, 1)

    def forward(self, x):
        return self.fc(x)

qryTower = QryTower().to(device) #Initialize and move models to the right device
docTower = DocTower().to(device)


In [124]:
import torch.nn as nn

initiate class for data laoders

In [125]:
# # query/passage to average embedding 
# def title_to_embedding(words):
#     tokens = words.lower().split()
#     indices = [token_to_index.get(tok, 0) for tok in tokens]  # 0 for unknowns - get the value associated with the words

#     indices_tensor = torch.tensor(indices, dtype=torch.long, device=device) # converts the list indices into a PyTorch tensors

#     with torch.no_grad(): # This makes the code faster and uses less memory, because you're not training, just extracting embeddings.
#         embeds = embedding_layer(indices_tensor) # [num_tokens, embedding_dim]
#         return embeds.mean(dim=0) # average pooling

In [126]:
from torch.utils.data import Dataset

class TripletDataset(Dataset):
    def __init__(self, df, token_to_index, embedding_layer, device):
        self.df = df
        self.token_to_index = token_to_index
        self.embedding_layer = embedding_layer
        self.device = device

    def title_to_embedding(self, words):
        tokens = words.lower().split()
        indices = [self.token_to_index.get(tok, 0) for tok in tokens]
        indices_tensor = torch.tensor(indices, dtype=torch.long, device=self.device)
        #with torch.no_grad():
        embeds = self.embedding_layer(indices_tensor)
        return embeds.mean(dim=0)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        query = self.title_to_embedding(row["query"])
        pos = self.title_to_embedding(row["positive_passage"])
        neg = self.title_to_embedding(row["negative_passage"])
        return query, pos, neg



training data

In [127]:
from torch.utils.data import DataLoader

triplet_dataset = TripletDataset(train_df, token_to_index, embedding_layer, device)
dataloader = DataLoader(triplet_dataset, batch_size=128, shuffle=True)



In [128]:
for query, pos, neg in dataloader:
    print("query shape:", query.shape)
    print("positive shape:", pos.shape)
    print("negative shape:", neg.shape)
    break
len(dataloader)


query shape: torch.Size([128, 100])
positive shape: torch.Size([128, 100])
negative shape: torch.Size([128, 100])


125

Pass embeddings through the models

define loss function and optimiser

In [129]:
# import torch.nn.functional as F

# # Cosine similarities along dim=1 (batch)
# dst_pos = F.cosine_similarity(query, pos, dim=1)
# dst_neg = F.cosine_similarity(query, neg, dim=1)

# dst_dif = dst_pos - dst_neg
# margin = 0.2

# loss = torch.clamp(margin - dst_dif, min=0).mean()
# #loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

training 

In [130]:
import torch.nn.functional as F

# Set models to training mode
qryTower.train()
docTower.train()

for epoch in range(20):
    total_loss = 0
    for query_vecs, pos_vecs, neg_vecs in dataloader:
        query_vecs = query_vecs.to(device)
        pos_vecs = pos_vecs.to(device)
        neg_vecs = neg_vecs.to(device)

        qry = qryTower(query_vecs)
        pos = docTower(pos_vecs)
        neg = docTower(neg_vecs)

        dst_pos = F.cosine_similarity(qry, pos, dim=1)
        dst_neg = F.cosine_similarity(qry, neg, dim=1)

        margin = 0.2
        loss = torch.clamp(margin - (dst_pos - dst_neg), min=0).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")



Epoch 1, Loss: 0.6182
Epoch 2, Loss: 0.6158
Epoch 3, Loss: 0.6161
Epoch 4, Loss: 0.6185
Epoch 5, Loss: 0.6196
Epoch 6, Loss: 0.6201
Epoch 7, Loss: 0.6219
Epoch 8, Loss: 0.6237
Epoch 9, Loss: 0.6233
Epoch 10, Loss: 0.6166
Epoch 11, Loss: 0.6136
Epoch 12, Loss: 0.6122
Epoch 13, Loss: 0.6050
Epoch 14, Loss: 0.6064
Epoch 15, Loss: 0.6070
Epoch 16, Loss: 0.6076
Epoch 17, Loss: 0.6084
Epoch 18, Loss: 0.6075
Epoch 19, Loss: 0.6125
Epoch 20, Loss: 0.6136


In [131]:
qryTower.eval()
docTower.eval()

correct = 0
total = 0

with torch.no_grad():
    for query_vecs, pos_vecs, neg_vecs in dataloader:
        query_vecs = query_vecs.to(device)
        pos_vecs = pos_vecs.to(device)
        neg_vecs = neg_vecs.to(device)

        qry = qryTower(query_vecs)
        pos = docTower(pos_vecs)
        neg = docTower(neg_vecs)

        dst_pos = F.cosine_similarity(qry, pos, dim=1)
        dst_neg = F.cosine_similarity(qry, neg, dim=1)

        correct += (dst_pos > dst_neg).sum().item()
        total += query_vecs.size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.1882


In [134]:
import torch.nn.functional as F

def word_cosine_similarity(word1, word2, token_to_index, embedding_layer, device):
    idx1 = torch.tensor([token_to_index.get(word1.lower(), 0)], dtype=torch.long, device=device)
    idx2 = torch.tensor([token_to_index.get(word2.lower(), 0)], dtype=torch.long, device=device)

    emb1 = embedding_layer(idx1)  # shape [1, embedding_dim]
    emb2 = embedding_layer(idx2)

    similarity = F.cosine_similarity(emb1, emb2).item()  # scalar
    return similarity


In [137]:
sim = word_cosine_similarity("good", "bad", token_to_index, embedding_layer, device)
print(f"Cosine similarity between 'cat' and 'dog': {sim:.4f}")


Cosine similarity between 'cat' and 'dog': 0.6890
