In [99]:
from huggingface_hub import hf_hub_download
import torch
from model import CBOW

## retrieve the model 

In [100]:

# model 
model_path = hf_hub_download(repo_id="cocoritzy/cbow-upvotes_model", filename="cbow_model.pt")

# Retrieve checkpoint 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(model_path, map_location=device) #A checkpoint is a file that saves the state of your model (

embedding_dim= checkpoint["embedding_dim"]
token_to_index = checkpoint["token_to_index"]
vocab_size = len(token_to_index)  # fill in actual size

model = CBOW(voc=vocab_size, emb=embedding_dim)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)

embedding_layer = model.embeddings

## retreive datasets

In [101]:
from datasets import load_dataset

df_hn = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")
df_sn = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
# dataset = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives", split="train[:10%]") # 10% of the datab
df_hn = df_hn["train"].to_pandas()
df_sn = df_sn["train"].to_pandas()

In [102]:
df_sn

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...",The Pacific Ocean lies to the east while the S...,66360
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"Probiotics are found in foods such as yogurt, ...",88507
3,19702,price to install tile in shower,1 Install ceramic tile floor to match shower-A...,Iodine is critical to thyroid health and funct...,87550
4,19703,why conversion observed in body,Conversion disorder is a type of somatoform di...,The answer to the question how much does it co...,61479
...,...,...,...,...,...
79699,102124,meaning of propagation,definition of propagation the act or action of...,A minimum of two credits of laboratory science...,21857
79700,102125,do you have to do a phd to be a clinical psych...,The goal you choose will determine your path. ...,1 The mitochondria of eukaryotes evolved from ...,28764
79701,102126,what wine goes with oysters,You may also enjoy these other types of wine w...,Raynaud's (say ray-NOHZ) phenomenon is a probl...,42284
79702,102127,what strengths does lithium come in,"Lithium 150 mg. Lithium (Eskalith ® , Eskalith...",While kids feel like they’ve been grownups for...,42891


In [103]:
df_sn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79704 entries, 0 to 79703
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   query_id                79704 non-null  int64 
 1   query                   79704 non-null  object
 2   positive_passage        79704 non-null  object
 3   negative_passage        79704 non-null  object
 4   negative_from_query_id  79704 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 3.0+ MB


## split the training and testing datasets 

In [104]:
import random
from sklearn.model_selection import train_test_split

train_sn, test_sn = train_test_split(
    df_sn,
    train_size=0.80,
    test_size=0.2,
    random_state=42
)

train_sn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63763 entries, 5007 to 15795
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   query_id                63763 non-null  int64 
 1   query                   63763 non-null  object
 2   positive_passage        63763 non-null  object
 3   negative_passage        63763 non-null  object
 4   negative_from_query_id  63763 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.9+ MB


## count the max words in negative, p and query

In [105]:
def count_max_words(column_name):
    token_lists = train_sn[column_name].apply(lambda x: x.split())
    word_counts = token_lists.apply(len)
    return print(word_counts.max())

count_max_words("negative_passage")
count_max_words("positive_passage")
count_max_words("query")

199
201
26


In [106]:
from torch.utils.data import Dataset

class TripletDataset(Dataset): #Defines a new dataset class that returns triplets: query, positive doc, negative doc.

    def __init__(self, df, token_to_index, embedding_layer, device):
        self.df = df
        self.token_to_index = token_to_index
        self.embedding_layer = embedding_layer
        self.device = device

        self.query_max_len = max(len(text.lower().split()) for text in df["query"]) #Calculates the maximum length of the query text in the dataset.
        all_docs = df["positive_passage"].tolist() + df["negative_passage"].tolist() # Combines all documents (positive and negative) into one list.
        self.doc_max_len = max(len(text.lower().split()) for text in all_docs)
        
    
    def embed_and_pad(self, text, max_len):
        tokens = text.lower().split()
        indices = [self.token_to_index.get(tok, 0) for tok in tokens[:max_len]]  # Converts the list of indices into a PyTorch tensor and moves it to the correct device (CPU/GPU).
        indices += [0] * (max_len - len(indices))  # pad with 0s
        indices_tensor = torch.tensor(indices, dtype=torch.long, device=self.device) #Passes the tensor through the embedding layer.  
        embedded = self.embedding_layer(indices_tensor)  # [max_len, embedding_dim]

        return embedded
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        
        query = self.embed_and_pad(row["query"], self.query_max_len)
        pos   = self.embed_and_pad(row["positive_passage"], self.doc_max_len)
        neg   = self.embed_and_pad(row["negative_passage"], self.doc_max_len)
    

        return query, pos, neg



In [107]:
from torch.utils.data import DataLoader

triplet_dataset = TripletDataset(train_sn, token_to_index, embedding_layer, device)
dataloader = DataLoader(triplet_dataset, batch_size=128, shuffle=True)

for query, pos, neg in dataloader:
    print("\nBatch shapes:")
    print("Query batch:", query.shape)
    print("Pos batch:", pos.shape)
    print("Neg batch:", neg.shape)
    break




Batch shapes:
Query batch: torch.Size([128, 26, 100])
Pos batch: torch.Size([128, 201, 100])
Neg batch: torch.Size([128, 201, 100])


## similarity test after embedding 

In [108]:
import torch.nn.functional as F

# Récupère un batch
for query_batch, _, _ in dataloader:
    # Prends deux vecteurs identiques du batch
    q1 = query_batch[0]  # (seq_len, emb_dim)
    q2 = query_batch[0]  # même vecteur que q1

    # Moyenne des embeddings pour obtenir un vecteur global de la phrase
    q1_vec = q1.mean(dim=0).unsqueeze(0)  # (1, emb_dim)
    q2_vec = q2.mean(dim=0).unsqueeze(0)

    # Similarité cosinus
    similarity = F.cosine_similarity(q1_vec, q2_vec)
    print(f"✅ Similarité cosinus entre deux requêtes identiques : {similarity.item():.4f}")
    break


✅ Similarité cosinus entre deux requêtes identiques : 1.0000


## create the model 

In [109]:
# import torch
# import torch.nn as nn
# # from model_average import QryTower, DocTower

# class QryTower(nn.Module):
#     def forward(self, x):
#         return x.mean(dim=1)  # [batch_size, embedding_dim]

# class DocTower(nn.Module):
#     def forward(self, x):
#         return x.mean(dim=1)  # [batch_size, embedding_dim]


In [110]:
import torch.nn as nn

class QryTower(nn.Module):
    def __init__(self, embedding_dim=100, hidden_dim=64):
        super().__init__()
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x):  # x : [batch_size, seq_len, emb_dim]
        _, hidden = self.rnn(x)
        return hidden.squeeze(0)  # [batch_size, hidden_dim]

class DocTower(nn.Module):
    def __init__(self, embedding_dim=100, hidden_dim=64):
        super().__init__()
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x):  # x : [batch_size, seq_len, emb_dim]
        _, hidden = self.rnn(x)
        return hidden.squeeze(0)  # [batch_size, hidden_dim]


## model initialisation 

In [111]:

qryTower = QryTower().to(device)
docTower = DocTower().to(device)

In [112]:
## define model parameters

In [113]:
import torch.optim as optim

# Combine parameters from both towers
params = list(qryTower.parameters()) + list(docTower.parameters())

# Adam optimizer
optimizer = optim.Adam(params, lr=1e-3)

In [114]:
## train model 

In [115]:
qryTower.train()
docTower.train()

margin = 0.5  # marge pour la triplet loss

for epoch in range(20):
    total_loss = 0

    for qry, pos, neg in dataloader:
        qry, pos, neg = qry.to(device), pos.to(device), neg.to(device)

        # 1. Forward pass
        qry_vec = qryTower(qry)  # [batch, emb_dim]
        pos_vec = docTower(pos)
        neg_vec = docTower(neg)

        # 2. Cosine similarity
        sim_pos = torch.nn.functional.cosine_similarity(qry_vec, pos_vec, dim=1)  # [batch]
        sim_neg = torch.nn.functional.cosine_similarity(qry_vec, neg_vec, dim=1)

        # 3. Triplet loss : max(0, margin - (sim_pos - sim_neg))
        loss = torch.relu(margin - (sim_pos - sim_neg)).mean()

        # 4. Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Avg Loss: {avg_loss:.4f}")


Epoch 1, Avg Loss: 0.4278
Epoch 2, Avg Loss: 0.2585
Epoch 3, Avg Loss: 0.2177
Epoch 4, Avg Loss: 0.1813
Epoch 5, Avg Loss: 0.1563
Epoch 6, Avg Loss: 0.1387
Epoch 7, Avg Loss: 0.1262
Epoch 8, Avg Loss: 0.1160
Epoch 9, Avg Loss: 0.1083
Epoch 10, Avg Loss: 0.1012
Epoch 11, Avg Loss: 0.0950
Epoch 12, Avg Loss: 0.0901
Epoch 13, Avg Loss: 0.0844
Epoch 14, Avg Loss: 0.0802
Epoch 15, Avg Loss: 0.0762
Epoch 16, Avg Loss: 0.0725
Epoch 17, Avg Loss: 0.0688
Epoch 18, Avg Loss: 0.0664
Epoch 19, Avg Loss: 0.0640
Epoch 20, Avg Loss: 0.0604


In [116]:
test_dataset = TripletDataset(test_sn, token_to_index, embedding_layer, device)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)




In [117]:
qryTower.eval()
docTower.eval()

correct = 0
total = 0

with torch.no_grad():
    for qry, pos, neg in test_loader:
        qry, pos, neg = qry.to(device), pos.to(device), neg.to(device)

        qry_vec = qryTower(qry)
        pos_vec = docTower(pos)
        neg_vec = docTower(neg)

        sim_pos = torch.nn.functional.cosine_similarity(qry_vec, pos_vec, dim=1)
        sim_neg = torch.nn.functional.cosine_similarity(qry_vec, neg_vec, dim=1)

        # compte si le modèle préfère le doc positif
        correct += (sim_pos > sim_neg).sum().item()
        total += qry.size(0)

accuracy = correct / total
print(f"✅ Accuracy sur le set de test : {accuracy:.2%}")




✅ Accuracy sur le set de test : 86.46%


## save model 

In [118]:
torch.save({
    "qryTower": qryTower.state_dict(),
    "docTower": docTower.state_dict(),
    "token_to_index": token_to_index
}, "two_tower_model_GRU.pt")

In [119]:
my_query = "how to start learning deep learning"
my_passage = "how to start learning deep learning"


In [120]:
qryTower.eval()
docTower.eval()

with torch.no_grad():
    # Tokenise et embed
    qry_tensor = triplet_dataset.embed_and_pad(my_query, triplet_dataset.query_max_len).unsqueeze(0)
    doc_tensor = triplet_dataset.embed_and_pad(my_passage, triplet_dataset.doc_max_len).unsqueeze(0)

    # Encode avec les deux tours
    qry_vec = qryTower(qry_tensor)
    doc_vec = docTower(doc_tensor)

    # Calcule la similarité
    similarity = torch.nn.functional.cosine_similarity(qry_vec, doc_vec, dim=1)  # batch=1 → résultat = [1]
    print(f"\n🔍 Similarité query-passage : {similarity.item():.4f}")



🔍 Similarité query-passage : 0.4976


## same words, same tower, same length

In [121]:
query = "hello world"
tensor1 = triplet_dataset.embed_and_pad(query, triplet_dataset.query_max_len).unsqueeze(0)
tensor2 = triplet_dataset.embed_and_pad(query, triplet_dataset.query_max_len).unsqueeze(0)

vec1 = qryTower(tensor1)
vec2 = qryTower(tensor2)

sim = torch.nn.functional.cosine_similarity(vec1, vec2, dim=1)
print("🔁 Similarité entre deux encodages identiques :", sim.item())



🔁 Similarité entre deux encodages identiques : 1.0000001192092896
