In [7]:
# Make sure this notebook is running on the GPU
import torch
from tqdm import tqdm
from model import DocTower
import chromadb

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [9]:
import torch
import torch.nn as nn
import csv
from huggingface_hub import hf_hub_download

# Device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face dataset repository that hosts the GloVe artefacts
repo_id = "nodozi/MLX_Week2"


def _fetch_from_repo(filename):
    """Download one file from the dataset repo and return the path handed back by the hub client."""
    return hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")


# 1. Embedding matrix, loaded straight onto the selected device
embedding_path = _fetch_from_repo("glove_embeddings.pt")
embeddings = torch.load(embedding_path, map_location=device)  # [vocab_size, embedding_dim]

# 2. Vocabulary CSV (only downloaded here; it is parsed later by load_vocabulary)
vocab_path = _fetch_from_repo("glove_ids_to_words.csv")


In [10]:
def load_vocabulary(csv_path):
    """
    Read a two-column ``id,word`` CSV and return a ``word -> id`` dictionary.

    Args:
        csv_path: Path to a UTF-8 CSV file whose first row is a header.

    Returns:
        dict mapping each word (str) to its integer id. Rows that do not have
        exactly two fields are ignored. An empty file yields an empty dict.

    Raises:
        ValueError: if the id field of a two-column row is not an integer.
    """
    word_to_idx = {}
    # newline='' lets the csv module do its own newline handling, as its docs require.
    with open(csv_path, mode='r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) == 2:
                idx, word = row
                word_to_idx[word] = int(idx)
    return word_to_idx


# Build the token -> integer id lookup from the downloaded vocabulary CSV.
token_to_index = load_vocabulary(vocab_path)


In [11]:

# 4. Embedding config: read the table dimensions off the loaded tensor
vocab_size = embeddings.size(0)
embedding_dim = embeddings.size(1)

# Wrap the pretrained vectors in a frozen lookup layer, then move it to the target device
embedding_layer = nn.Embedding.from_pretrained(embeddings, freeze=True)
embedding_layer = embedding_layer.to(device)

In [12]:
# Build the document corpus:
#   1. fetch the soft-negatives triplet dataset from the Hugging Face hub,
#   2. pool every positive and every negative passage into one flat list,
#   3. wrap that list in a single-column DataFrame named `df`.
from datasets import load_dataset
import pandas as pd

df_sn = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")["train"].to_pandas()

positive_passages = df_sn["positive_passage"].tolist()
negative_passages = df_sn["negative_passage"].tolist()

# Report how many passages each column contributes
print(len(positive_passages))
print(len(negative_passages))

# Positives first, negatives appended after them
all_passages = positive_passages + negative_passages
df = pd.DataFrame({"passages": all_passages})
df

79704
79704


Unnamed: 0,passages
0,Results-Based Accountability® (also known as R...
1,"From Wikipedia, the free encyclopedia. A Reaga..."
2,Sydney is the capital city of the Australian s...
3,1 Install ceramic tile floor to match shower-A...
4,Conversion disorder is a type of somatoform di...
...,...
159403,A minimum of two credits of laboratory science...
159404,1 The mitochondria of eukaryotes evolved from ...
159405,Raynaud's (say ray-NOHZ) phenomenon is a probl...
159406,While kids feel like they’ve been grownups for...


In [None]:
from torch.utils.data import Dataset
import torch


class TripletDataset(Dataset):
    """
    Dataset that turns each text in ``df["passages"]`` into a padded matrix of
    word embeddings.

    Every item is ``(embedded, true_len)`` where ``embedded`` has shape
    ``(doc_max_len, embedding_dim)`` and ``true_len`` is the number of real
    (non-padding) tokens. Texts are lowercased and split on whitespace.

    Args:
        df: DataFrame with a ``"passages"`` column of strings.
        token_to_index: dict mapping a token to its row in the embedding table.
        embedding_layer: ``nn.Embedding`` holding the pretrained vectors.
        device: device on which index tensors and OOV vectors are created.
    """

    def __init__(self, df, token_to_index, embedding_layer, device):
        self.df = df
        self.token_to_index = token_to_index
        self.embedding_layer = embedding_layer
        self.device = device

        self.embedding_dim = embedding_layer.embedding_dim
        self.oov_embeddings = {}  # For storing fixed random vectors for OOV tokens

        # Longest passage in tokens; default=0 keeps an empty DataFrame from raising.
        self.doc_max_len = max(
            (len(text.lower().split()) for text in df["passages"]),
            default=0,
        )

    def embed(self, token):
        """Return embedding for token: from vocab or generate fixed OOV vector."""
        if token in self.token_to_index:
            idx = self.token_to_index[token]
            return self.embedding_layer(torch.tensor(idx, device=self.device))
        else:
            # Draw the OOV vector once and cache it so the token always maps to the same vector.
            if token not in self.oov_embeddings:
                self.oov_embeddings[token] = torch.randn(self.embedding_dim, device=self.device) * 0.1
            return self.oov_embeddings[token]

    def embed_text(self, text, max_len):
        """
        Embed ``text`` as a ``(max_len, embedding_dim)`` tensor.

        The text is lowercased, split on whitespace and truncated to
        ``max_len`` tokens. Shorter texts are padded with row 0 of the
        embedding table.
        NOTE(review): this assumes row 0 is a dedicated padding entry — confirm
        against the vocabulary file.

        Returns:
            (embedded, true_len): the embedded tensor and the number of real
            tokens before padding.
        """
        tokens = text.lower().split()[:max_len]
        true_len = len(tokens)

        # Collect every index first so the embedding table is queried once per
        # text instead of once per token. OOV slots temporarily point at row 0.
        indices = []
        oov_slots = []
        for pos, tok in enumerate(tokens):
            idx = self.token_to_index.get(tok)
            if idx is None:
                indices.append(0)
                oov_slots.append((pos, tok))
            else:
                indices.append(idx)

        # Padding with vector at index 0
        indices.extend([0] * (max_len - true_len))

        index_tensor = torch.tensor(indices, dtype=torch.long, device=self.device)
        embedded = self.embedding_layer(index_tensor)

        # Overwrite the placeholders with the cached OOV vectors, in token order
        # so new OOV vectors are drawn in the order the tokens appear.
        for pos, tok in oov_slots:
            embedded[pos] = self.embed(tok)

        return embedded, true_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        doc, d_len = self.embed_text(row["passages"], self.doc_max_len)

        return doc, d_len

def collate_fn(batch):
    """
    Merge a list of ``(doc, d_len)`` samples into one batch.

    Returns:
        A pair ``(docs, lengths)``: the per-sample tensors stacked along a new
        leading dimension, and the per-sample lengths as a tensor.
    """
    # Transpose the list of pairs into two parallel sequences.
    docs, lengths = map(list, zip(*batch))
    stacked_docs = torch.stack(docs)
    length_tensor = torch.tensor(lengths)
    return stacked_docs, length_tensor


## Calling our dataframe embedding

In [None]:

from torch.utils.data import DataLoader

triplet_dataset = TripletDataset(df, token_to_index, embedding_layer, device)

# Batches must come out in DataFrame row order: the indexing code that consumes
# this loader maps a batch position back to a row of `df` to recover the passage
# text. With shuffle=True each embedding would be stored next to the wrong text.
dataloader = DataLoader(
    triplet_dataset,
    batch_size=128,
    shuffle=False,
    collate_fn=collate_fn
)

## Create the embedding loop

In [18]:
# Load the complete checkpoint (a dict holding the doc-tower weights and the vocabulary).
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load("two_tower_model_GRU_padding.pt", map_location=device)

# Extract only the DocTower parameters
doc_tower_state = state_dict['docTower']
word_to_id = state_dict['token_to_index']  # vocabulary saved with the checkpoint; not used in this cell

model = DocTower()
model.load_state_dict(doc_tower_state)
model.to(device)
model.eval()

# Initialize ChromaDB client
client = chromadb.PersistentClient(path="./chroma_db")

# Create or access a collection
collection_name = 'marco_sn_documents'
collection = client.get_or_create_collection(name=collection_name)

# Determine how many documents are already in the collection
existing_docs_count = collection.count()
print(f"Collection already contains {existing_docs_count} documents")

# Iterate in DataFrame row order so that batch position maps back to the right
# passage text. The loader is rebuilt without shuffling from the existing
# loader's dataset, batch size and collate function.
ordered_loader = DataLoader(
    dataloader.dataset,
    batch_size=dataloader.batch_size,
    shuffle=False,
    collate_fn=dataloader.collate_fn,
)

# New ids continue after whatever the collection already held when this cell started.
# NOTE(review): re-running after an interruption appends the corpus again under
# new ids; clear the collection first if duplicates are not wanted.
id_offset = existing_docs_count
row_start = 0  # DataFrame row of the first document in the current batch

for doc, d_len in tqdm(ordered_loader, total=len(ordered_loader)):
    # Move the data to the GPU
    doc = doc.to(device)
    d_len = d_len.to(device)

    # Get the embeddings (named doc_embeddings so the global GloVe `embeddings`
    # tensor is not overwritten)
    with torch.no_grad():
        doc_embeddings = model(doc, d_len)

    # Convert embeddings to a numpy array on the CPU
    doc_embeddings_np = doc_embeddings.cpu().numpy()
    batch_len = doc_embeddings_np.shape[0]
    row_indices = range(row_start, row_start + batch_len)

    # Add the whole batch to the collection in a single call
    collection.add(
        documents=df['passages'].iloc[row_start:row_start + batch_len].tolist(),
        metadatas=[{"index": row} for row in row_indices],
        ids=[str(id_offset + row) for row in row_indices],
        embeddings=doc_embeddings_np.tolist(),
    )

    # Advance the row cursor and the running document count
    row_start += batch_len
    existing_docs_count += batch_len

Collection already contains 0 documents


  2%|▏         | 22/1246 [02:04<1:55:04,  5.64s/it]


KeyboardInterrupt: 

In [None]:
# Embed the documents
def embed_docs(model, token_to_index, documents, batch_size, device, collection=None, start_id=0):
    """
    Embed a list of texts with the doc tower and store them in a ChromaDB collection.

    Texts are lowercased and split on whitespace (the same tokenization
    ``TripletDataset`` uses), mapped to token ids, padded with 0 to a common
    length of at least 201, and passed to ``model`` in batches. Each batch is
    written to ``collection`` as soon as it is embedded.

    Args:
        model: The pre-trained doc tower; called as ``model(input_ids, lengths)``.
        token_to_index: dict mapping a token to its integer id. Unknown tokens
            use the id of ``'<unk>'`` if present, otherwise 0.
        documents: The list of texts to embed.
        batch_size: Number of texts per batch; must be positive.
        device: Device on which the input tensors are created.
        collection: Target collection; must provide
            ``add(documents=..., ids=..., embeddings=...)``. Required, despite
            the ``None`` default.
        start_id: Integer offset for the generated string ids; document ``k``
            gets id ``str(start_id + k)``.

    Returns:
        None. The embeddings are only written to ``collection``.

    Raises:
        ValueError: if ``collection`` is None or ``batch_size`` is not positive.
    """
    # Fail before any model work rather than with an AttributeError after the first batch.
    if collection is None:
        raise ValueError("embed_docs requires a ChromaDB collection to store the embeddings in")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_documents = len(documents)
    total_batches = (total_documents + batch_size - 1) // batch_size

    # Every batch is padded to at least this many tokens.
    # NOTE(review): 201 is presumably the sequence length used in training — confirm.
    min_padded_len = 201
    unk_id = token_to_index.get('<unk>', 0)

    progress_bar = tqdm(total=total_batches, desc="Processing batches")
    try:
        with torch.no_grad():
            for i in range(0, total_documents, batch_size):
                # Get batch of data
                batch_passages = list(documents[i:i + batch_size])

                # Convert text to token ids using the supplied vocab
                batch_token_ids = [
                    [token_to_index.get(word, unk_id) for word in doc.lower().split()]
                    for doc in batch_passages
                ]
                batch_lengths = [len(ids) for ids in batch_token_ids]
                max_len = max(min_padded_len, max(batch_lengths))

                # Pad sequences to max length (assuming 0 is the padding token)
                padded_ids = [ids + [0] * (max_len - len(ids)) for ids in batch_token_ids]

                # Convert to tensors
                # NOTE(review): the model receives raw token ids cast to float, while
                # the DataLoader-based indexing loop in this notebook feeds it embedded
                # token vectors — confirm which input DocTower expects.
                input_ids = torch.tensor(padded_ids, device=device).float()  # GRU requires a float
                lengths = torch.tensor(batch_lengths, device=device)

                doc_embeds = model(input_ids, lengths)
                # Plain nested lists are passed instead of an array.
                batch_embeddings = doc_embeds.cpu().tolist()

                batch_doc_ids = [str(start_id + i + j) for j in range(len(batch_passages))]

                # Add batch directly to ChromaDB collection
                collection.add(
                    documents=batch_passages,
                    ids=batch_doc_ids,
                    embeddings=batch_embeddings
                )

                # Update progress bar; clamp so the last, shorter batch does not overshoot
                processed = min(i + batch_size, total_documents)
                progress_bar.update()
                progress_bar.set_postfix({"Processed": f"{processed}/{total_documents} passages"})
    finally:
        progress_bar.close()

    print('Documents embedded and stored in ChromaDB.')
    return
        
    


In [None]:
def populate_chroma():
    """
    Push the notebook-level ``passages``, ``ids`` and ``embeddings`` into the
    notebook-level ``collection`` with a single ``add`` call.

    NOTE(review): all four names are read from the global namespace rather than
    passed in; ``passages`` and ``ids`` must be defined before this is called.
    """
    add_batch = collection.add
    payload = {
        "documents": passages,    # list
        "ids": ids,               # list
        "embeddings": embeddings  # list
    }
    add_batch(**payload)
    return None