## Install and Import

In [None]:
!pip install -U sentence-transformers

In [None]:
import os
import json
import pickle
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

## Embed Chunks (512)

We load the pre-chunked documents (chunk size 512) and embed them with the MiniLM embedder.

In [None]:
# Load the pre-chunked documents: a JSON object whose keys are used as chunk IDs
# and whose values are the texts passed to the embedder.
chunk_json_path = '/kaggle/input/chunk-doc-512/chunk_doc_512.json'
with open(chunk_json_path, 'r', encoding='utf-8') as f:
    chunk_doc_index = json.load(f)

# Prepare the embedder (MiniLM-L6-v2) and run it on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=device)

# Parallel lists: chunk_ids[k] is the ID of chunk_texts[k]
chunk_ids = list(chunk_doc_index.keys())
chunk_texts = list(chunk_doc_index.values())

batch_size = 32  # chunks per model.encode call; lower it if the device runs out of memory
save_every_chunks = 50000  # approximate number of chunks written to each intermediate .pkl file
# Number of encode batches between two checkpoints. max(1, ...) prevents a
# modulo-by-zero below when batch_size is larger than save_every_chunks.
batches_per_file = max(1, save_every_chunks // batch_size)

embeddings = {}  # {chunk_id: embedding as list of floats} accumulated since the last checkpoint
saved_batches = 0  # index of the next checkpoint file to write
total_chunks = len(chunk_texts)


def _save_embedding_batch(embeddings_by_id, file_index):
    """Pickle one {chunk_id: vector} dict to chunk_embeddings_batch_<file_index>.pkl."""
    with open(f'/kaggle/working/chunk_embeddings_batch_{file_index}.pkl', 'wb') as pkl_file:
        pickle.dump(embeddings_by_id, pkl_file)


# Embed in batches and checkpoint periodically so the full set never sits in memory at once
for i in tqdm(range(0, total_chunks, batch_size), desc="Embedding Chunks"):
    batch_texts = chunk_texts[i:i + batch_size]
    batch_ids = chunk_ids[i:i + batch_size]

    batch_embeddings = model.encode(batch_texts, batch_size=batch_size, show_progress_bar=False)

    # Store each vector as a plain Python list keyed by its chunk ID. Keeping one
    # flat dict per file (not a list of per-batch dicts) lets a reader combine
    # files with a simple dict.update.
    for chunk_id, vector in zip(batch_ids, batch_embeddings):
        embeddings[chunk_id] = vector.tolist()

    # Checkpoint every `batches_per_file` batches (roughly `save_every_chunks` chunks)
    batch_index = i // batch_size
    if batch_index > 0 and batch_index % batches_per_file == 0:
        _save_embedding_batch(embeddings, saved_batches)
        saved_batches += 1
        embeddings = {}  # drop the saved vectors to free memory

# Save whatever was accumulated after the last checkpoint
if embeddings:
    _save_embedding_batch(embeddings, saved_batches)

print("Embedding process completed and saved in batches as .pkl files.")


## Merge Embeddings

Because of memory constraints, the embeddings were saved in several separate files. Next, we merge them into a single file for future use.

In [1]:
# Directory where all the chunk embedding .pkl files are saved
embedding_dir = '/kaggle/working/'

batch_file_prefix = "chunk_embeddings_batch_"
batch_file_suffix = ".pkl"


def _batch_file_index(filename):
    """Return the integer <index> from a chunk_embeddings_batch_<index>.pkl filename."""
    return int(filename[len(batch_file_prefix):-len(batch_file_suffix)])


# Collect the intermediate batch files and order them by their numeric index;
# a plain lexicographic sort would place batch_10 before batch_2.
pkl_files = sorted(
    (
        f for f in os.listdir(embedding_dir)
        if f.startswith(batch_file_prefix)
        and f.endswith(batch_file_suffix)
        and f[len(batch_file_prefix):-len(batch_file_suffix)].isdecimal()
    ),
    key=_batch_file_index,
)

# Fail loudly instead of silently writing an empty merged file
if not pkl_files:
    raise FileNotFoundError(
        f"No {batch_file_prefix}*{batch_file_suffix} files found in {embedding_dir}"
    )

# Single {chunk_id: embedding} dictionary holding every embedding
merged_embeddings = {}

for pkl_file in pkl_files:
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    with open(os.path.join(embedding_dir, pkl_file), 'rb') as f:
        batch_data = pickle.load(f)

    # A file may hold a single {chunk_id: embedding} dict or a list of such dicts
    # (one per encoded batch). Calling dict.update on the list itself would raise
    # ValueError, so each dict is merged individually.
    batch_dicts = [batch_data] if isinstance(batch_data, dict) else batch_data
    for batch_dict in batch_dicts:
        merged_embeddings.update(batch_dict)

# Save the merged embeddings into a single .pkl file
with open('/kaggle/working/merged_chunk_embeddings.pkl', 'wb') as merged_file:
    pickle.dump(merged_embeddings, merged_file)

print("All embeddings have been merged and saved as merged_chunk_embeddings.pkl")


All embeddings have been merged and saved as merged_chunk_embeddings.pkl
