In [1]:
from huggingface_hub import hf_hub_download
import torch
from model import CBOW

  from .autonotebook import tqdm as notebook_tqdm


## retrieve the model 

In [27]:
import torch
import torch.nn as nn
import csv
from huggingface_hub import hf_hub_download

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Repo Hugging Face
repo_id = "nodozi/MLX_Week2"

# 1. Download embeddings
embedding_path = hf_hub_download(
    repo_id=repo_id,
    filename="glove_embeddings.pt",
    repo_type="dataset"
)

embeddings = torch.load(embedding_path, map_location=device)  # [vocab_size, embedding_dim]

# 2. Download vocab
vocab_path = hf_hub_download(
    repo_id=repo_id,
    filename="glove_ids_to_words.csv",
    repo_type="dataset"
)


In [28]:
# Ouvre le fichier et affiche les 5 premières lignes
with open(vocab_path, mode='r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(row)
        if i == 4:
            break


['Token_ID', 'Word']
['1', 'the']
['2', ',']
['3', '.']
['4', 'of']


In [29]:
def load_vocabulary(csv_path):
    word_to_idx = {}
    with open(csv_path, mode='r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # 🟢 saute la première ligne
        for row in reader:
            if len(row) == 2:
                idx, word = row
                word_to_idx[word] = int(idx)
    return word_to_idx


token_to_index = load_vocabulary(vocab_path)


In [30]:
for token in ["<pad>", "<unk>", "PAD", "UNK", "[PAD]", "[UNK]"]:
    if token in token_to_index:
        print(f"✅ Le token spécial '{token}' existe avec l’index : {token_to_index[token]}")
    else:
        print(f"❌ Le token '{token}' n’est pas dans le vocabulaire.")

❌ Le token '<pad>' n’est pas dans le vocabulaire.
❌ Le token '<unk>' n’est pas dans le vocabulaire.
❌ Le token 'PAD' n’est pas dans le vocabulaire.
❌ Le token 'UNK' n’est pas dans le vocabulaire.
❌ Le token '[PAD]' n’est pas dans le vocabulaire.
❌ Le token '[UNK]' n’est pas dans le vocabulaire.


In [31]:

# 4. Embedding config
embedding_dim = embeddings.shape[1]
vocab_size = embeddings.shape[0]

embedding_layer = nn.Embedding.from_pretrained(embeddings, freeze=True).to(device)

## retreive datasets

In [7]:
from datasets import load_dataset

df_hn = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")
df_sn = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
# dataset = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives", split="train[:10%]") # 10% of the datab
df_hn = df_hn["train"].to_pandas()
df_sn = df_sn["train"].to_pandas()

In [8]:
df_sn

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...",The Pacific Ocean lies to the east while the S...,66360
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"Probiotics are found in foods such as yogurt, ...",88507
3,19702,price to install tile in shower,1 Install ceramic tile floor to match shower-A...,Iodine is critical to thyroid health and funct...,87550
4,19703,why conversion observed in body,Conversion disorder is a type of somatoform di...,The answer to the question how much does it co...,61479
...,...,...,...,...,...
79699,102124,meaning of propagation,definition of propagation the act or action of...,A minimum of two credits of laboratory science...,21857
79700,102125,do you have to do a phd to be a clinical psych...,The goal you choose will determine your path. ...,1 The mitochondria of eukaryotes evolved from ...,28764
79701,102126,what wine goes with oysters,You may also enjoy these other types of wine w...,Raynaud's (say ray-NOHZ) phenomenon is a probl...,42284
79702,102127,what strengths does lithium come in,"Lithium 150 mg. Lithium (Eskalith ® , Eskalith...",While kids feel like they’ve been grownups for...,42891


In [9]:
df_sn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79704 entries, 0 to 79703
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   query_id                79704 non-null  int64 
 1   query                   79704 non-null  object
 2   positive_passage        79704 non-null  object
 3   negative_passage        79704 non-null  object
 4   negative_from_query_id  79704 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 3.0+ MB


## split the training and testing datasets 

In [10]:
import random
from sklearn.model_selection import train_test_split

train_sn, test_sn = train_test_split(
    df_sn,
    train_size=0.80,
    test_size=0.2,
    random_state=42
)

train_sn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63763 entries, 5007 to 15795
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   query_id                63763 non-null  int64 
 1   query                   63763 non-null  object
 2   positive_passage        63763 non-null  object
 3   negative_passage        63763 non-null  object
 4   negative_from_query_id  63763 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.9+ MB


## count the max words in negative, p and query

In [11]:
def count_max_words(column_name):
    token_lists = train_sn[column_name].apply(lambda x: x.split())
    word_counts = token_lists.apply(len)
    return print(word_counts.max())

count_max_words("negative_passage")
count_max_words("positive_passage")
count_max_words("query")

199
201
26


In [12]:
from torch.utils.data import Dataset
import torch

class TripletDataset(Dataset):
    def __init__(self, df, token_to_index, embedding_layer, device):
        self.df = df
        self.token_to_index = token_to_index
        self.embedding_layer = embedding_layer
        self.device = device

        self.embedding_dim = embedding_layer.embedding_dim
        self.oov_embeddings = {}  # For storing fixed random vectors for OOV tokens

        self.query_max_len = max(len(text.lower().split()) for text in df["query"])
        all_docs = df["positive_passage"].tolist() + df["negative_passage"].tolist()
        self.doc_max_len = max(len(text.lower().split()) for text in all_docs)

    def embed(self, token):
        """Return embedding for token: from vocab or generate fixed OOV vector."""
        if token in self.token_to_index:
            idx = self.token_to_index[token]
            return self.embedding_layer(torch.tensor(idx, device=self.device))
        else:
            if token not in self.oov_embeddings:
                self.oov_embeddings[token] = torch.randn(self.embedding_dim, device=self.device) * 0.1
            return self.oov_embeddings[token]
    
    def embed_text(self, text, max_len):
        tokens = text.lower().split()
        embedded_tokens = []

        for tok in tokens[:max_len]:
            emb = self.embed(tok)
            embedded_tokens.append(emb)

        true_len = len(embedded_tokens)

        # Padding with vector at index 0
        pad_len = max_len - true_len
        if pad_len > 0:
            pad_vec = self.embedding_layer(torch.tensor(0, device=self.device))  # index 0 used for padding
            embedded_tokens.extend([pad_vec] * pad_len)

        embedded = torch.stack(embedded_tokens)
        return embedded, true_len


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        query, q_len = self.embed_text(row["query"], self.query_max_len)
        pos, p_len   = self.embed_text(row["positive_passage"], self.doc_max_len)
        neg, n_len   = self.embed_text(row["negative_passage"], self.doc_max_len)

        return query, q_len, pos, p_len, neg, n_len


In [13]:
def collate_fn(batch):
    # Unpack everything from the batch
    queries, q_lens, pos, p_lens, neg, n_lens = zip(*batch)

    # Stack the tensors and lengths
    return (
        torch.stack(queries), torch.tensor(q_lens),
        torch.stack(pos),     torch.tensor(p_lens),
        torch.stack(neg),     torch.tensor(n_lens)
    )


In [14]:
from torch.utils.data import DataLoader

triplet_dataset = TripletDataset(train_sn, token_to_index, embedding_layer, device)

dataloader = DataLoader(
    triplet_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=collate_fn
)


# for query, pos, neg in dataloader:
#     print("\nBatch shapes:")
#     print("Query batch:", query.shape)
#     print("Pos batch:", pos.shape)
#     print("Neg batch:", neg.shape)
#     break



## create the model 

In [15]:
# import torch
# import torch.nn as nn
# # from model_average import QryTower, DocTower

# class QryTower(nn.Module):
#     def forward(self, x):
#         return x.mean(dim=1)  # [batch_size, embedding_dim]

# class DocTower(nn.Module):
#     def forward(self, x):
#         return x.mean(dim=1)  # [batch_size, embedding_dim]


In [16]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class QryTower(nn.Module):
    def __init__(self, embedding_dim=100, hidden_dim=64):
        super().__init__()
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x, lengths):
        # 1. Sort by descending length
        lengths, sort_idx = lengths.sort(descending=True)
        x = x[sort_idx]

        # 2. Pack
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True)

        # 3. GRU
        _, hidden = self.rnn(packed)  # hidden: [1, batch_size, hidden_dim]

        # 4. Unsort
        _, unsort_idx = sort_idx.sort()
        hidden = hidden.squeeze(0)[unsort_idx]  # [batch_size, hidden_dim]

        return hidden

class DocTower(nn.Module):
    def __init__(self, embedding_dim=100, hidden_dim=64):
        super().__init__()
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x, lengths):
        # 1. Sort by descending length
        lengths, sort_idx = lengths.sort(descending=True)
        x = x[sort_idx]

        # 2. Pack
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True)

        # 3. GRU
        _, hidden = self.rnn(packed)  # hidden: [1, batch_size, hidden_dim]

        # 4. Unsort
        _, unsort_idx = sort_idx.sort()
        hidden = hidden.squeeze(0)[unsort_idx]  # [batch_size, hidden_dim]

        return hidden

# class QryTower(nn.Module):
#     def __init__(self, embedding_dim=100, hidden_dim=64):
#         super().__init__()
#         self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

#     def forward(self, x):  # x : [batch_size, seq_len, emb_dim]
#         _, hidden = self.rnn(x)
#         return hidden.squeeze(0)  # [batch_size, hidden_dim]

# class DocTower(nn.Module):
#     def __init__(self, embedding_dim=100, hidden_dim=64):
#         super().__init__()
#         self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

#     def forward(self, x):  # x : [batch_size, seq_len, emb_dim]
#         _, hidden = self.rnn(x)
#         return hidden.squeeze(0)  # [batch_size, hidden_dim]


## model initialisation 

In [17]:

qryTower = QryTower().to(device)
docTower = DocTower().to(device)

In [18]:
## define model parameters

In [19]:
import torch.optim as optim

# Combine parameters from both towers
params = list(qryTower.parameters()) + list(docTower.parameters())

# Adam optimizer
optimizer = optim.Adam(params, lr=1e-3)

In [20]:
## train model 

In [21]:
qryTower.train()
docTower.train()

margin = 0.5  # Triplet loss margin

for epoch in range(20):
    total_loss = 0

    for qry, q_len, pos, p_len, neg, n_len in dataloader:
        # Move to device
        qry, q_len = qry.to(device), q_len.to(device)
        pos, p_len = pos.to(device), p_len.to(device)
        neg, n_len = neg.to(device), n_len.to(device)

        # 1. Forward pass using RNN with packed sequences
        qry_vec = qryTower(qry, q_len)  # [batch_size, hidden_dim]
        pos_vec = docTower(pos, p_len)
        neg_vec = docTower(neg, n_len)

        # 2. Cosine similarity
        sim_pos = torch.nn.functional.cosine_similarity(qry_vec, pos_vec, dim=1)
        sim_neg = torch.nn.functional.cosine_similarity(qry_vec, neg_vec, dim=1)

        # 3. Triplet loss
        loss = torch.relu(margin - (sim_pos - sim_neg)).mean()

        # 4. Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Avg Loss: {avg_loss:.4f}")


KeyboardInterrupt: 

In [None]:
test_dataset = TripletDataset(test_sn, token_to_index, embedding_layer, device)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)


In [None]:
qryTower.eval()
docTower.eval()

correct = 0
total = 0

with torch.no_grad():
    for qry, q_len, pos, p_len, neg, n_len in dataloader:
        # Move to device
        qry, q_len = qry.to(device), q_len.to(device)
        pos, p_len = pos.to(device), p_len.to(device)
        neg, n_len = neg.to(device), n_len.to(device)

        # 1. Forward pass using RNN with packed sequences
        qry_vec = qryTower(qry, q_len)  # [batch_size, hidden_dim]
        pos_vec = docTower(pos, p_len)
        neg_vec = docTower(neg, n_len)

        sim_pos = torch.nn.functional.cosine_similarity(qry_vec, pos_vec, dim=1)
        sim_neg = torch.nn.functional.cosine_similarity(qry_vec, neg_vec, dim=1)

        # compte si le modèle préfère le doc positif
        correct += (sim_pos > sim_neg).sum().item()
        total += qry.size(0)

accuracy = correct / total
print(f"✅ Accuracy sur le set de test : {accuracy:.2%}")




✅ Accuracy sur le set de test : 96.01%


## save model 

In [None]:
torch.save({
    "qryTower": qryTower.state_dict(),
    "docTower": docTower.state_dict(),
    "token_to_index": token_to_index
}, "two_tower_model_GRU_padding_128.pt")

In [None]:
my_query = "what is rba"
my_passage = "rba is how do you train  learning deep learning"


In [None]:
qryTower.eval()
docTower.eval()

with torch.no_grad():
    # Tokenise et embed + get lengths
    qry_tensor, qry_len = triplet_dataset.embed_text(my_query, triplet_dataset.query_max_len)
    doc_tensor, doc_len = triplet_dataset.embed_text(my_passage, triplet_dataset.doc_max_len)

    # Add batch dimension
    qry_tensor = qry_tensor.unsqueeze(0)  # [1, seq_len, emb_dim]
    doc_tensor = doc_tensor.unsqueeze(0)
    qry_len = torch.tensor([qry_len])
    doc_len = torch.tensor([doc_len])

    # Move to device
    qry_tensor, qry_len = qry_tensor.to(device), qry_len.to(device)
    doc_tensor, doc_len = doc_tensor.to(device), doc_len.to(device)

    # Encode with towers (using packed sequences)
    qry_vec = qryTower(qry_tensor, qry_len)  # [1, hidden_dim]
    doc_vec = docTower(doc_tensor, doc_len)

    # Compute cosine similarity
    similarity = torch.nn.functional.cosine_similarity(qry_vec, doc_vec, dim=1)
    print(f"\n🔍 Similarité query-passage : {similarity.item():.4f}")




🔍 Similarité query-passage : 0.4412


## same words, same tower, same length

In [None]:
query = "hello world"

# Embed and get sequence length
tensor1, len1 = triplet_dataset.embed_text(query, triplet_dataset.query_max_len)
tensor2, len2 = triplet_dataset.embed_text(query, triplet_dataset.query_max_len)

# Add batch dimension
tensor1 = tensor1.unsqueeze(0).to(device)  # [1, seq_len, emb_dim]
tensor2 = tensor2.unsqueeze(0).to(device)
len1 = torch.tensor([len1], device=device)
len2 = torch.tensor([len2], device=device)

# Encode using the query tower
vec1 = qryTower(tensor1, len1)  # [1, hidden_dim]
vec2 = qryTower(tensor2, len2)

# Compute similarity
sim = torch.nn.functional.cosine_similarity(vec1, vec2, dim=1)  # [1]
print("🔁 Similarité entre deux encodages identiques :", sim.item())



🔁 Similarité entre deux encodages identiques : 1.0


## retrieve most similar documents 

In [None]:
docTower.eval()

all_doc_texts = train_sn["positive_passage"].tolist()[:500]  # Use fewer if needed

doc_vectors = []
with torch.no_grad():
    for text in all_doc_texts:
        doc_tensor, doc_len = triplet_dataset.embed_text(text, triplet_dataset.doc_max_len)
        doc_tensor = doc_tensor.unsqueeze(0).to(device)
        doc_len = torch.tensor([doc_len], device=device)

        doc_vec = docTower(doc_tensor, doc_len)  # [1, hidden_dim]
        doc_vectors.append(doc_vec.squeeze(0))

doc_matrix = torch.stack(doc_vectors).to(device)


In [None]:
from torch.nn.functional import cosine_similarity

query = "how do solar panels work" 

qryTower.eval()
with torch.no_grad():
    qry_tensor, qry_len = triplet_dataset.embed_text(query, triplet_dataset.query_max_len)
    qry_tensor = qry_tensor.unsqueeze(0).to(device)
    qry_len = torch.tensor([qry_len], device=device)

    qry_vec = qryTower(qry_tensor, qry_len)

    similarities = cosine_similarity(qry_vec, doc_matrix)  # [num_docs]
    top_k = 5
    top_indices = similarities.topk(top_k).indices.cpu().numpy()

    print(f"\nQuery: {query}")
    print(f"\nTop {top_k} most relevant documents:")

    for i in top_indices:
        print(f"\n🔸 Similarity: {similarities[i].item():.4f}")
        print(all_doc_texts[i])
