In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
import os
import gc
import numpy as np
import glob

# --- Configuration ---
# Hugging Face checkpoint used for both the tokenizer and the encoder.
MODEL_NAME = "microsoft/codebert-base"

# Directory that is scanned for tokenized `chunk_*.pt` files.
TOKENIZED_INPUT_DIR = "../data/tokenized_data_test/small_diffs"

# Directory that receives one compressed `.npz` archive per processed chunk.
EMBEDDINGS_OUTPUT_DIR = "../data/codebert_embeddings/small_diffs"

In [3]:
# --- Setup ---
# Maximum number of sequences sent through the model per forward pass. Chunks
# larger than this are embedded slice by slice so peak memory is bounded by the
# batch size rather than by the size of the chunk file.
EMBED_BATCH_SIZE = 32

os.makedirs(EMBEDDINGS_OUTPUT_DIR, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the base CodeBERT model (without the classification head)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval() # Set model to evaluation mode (important for inference)

# --- Get the list of tokenized chunk files to process ---
tokenized_files = sorted(glob.glob(os.path.join(TOKENIZED_INPUT_DIR, 'chunk_*.pt')))
print(f"Found {len(tokenized_files)} tokenized chunks to process.")
if not tokenized_files:
    print(f"Warning: no 'chunk_*.pt' files found in '{TOKENIZED_INPUT_DIR}'.")


def embed_chunk(encoder, target_device, input_ids, attention_mask, batch_size=EMBED_BATCH_SIZE):
    """Return the first-token ([CLS]) embedding of every sequence as a NumPy array.

    Args:
        encoder: Model whose output exposes ``last_hidden_state``.
        target_device: Device each mini-batch is moved to before the forward pass.
        input_ids: Token id tensor; assumed to be (num_sequences, seq_len) -- TODO confirm
            against the tokenization notebook.
        attention_mask: Mask tensor with the same leading dimension as ``input_ids``.
        batch_size: Number of sequences per forward pass.

    Returns:
        Array with one row per input sequence. An input with zero sequences yields
        an array of shape ``(0, encoder.config.hidden_size)`` without calling the model.
    """
    cls_vectors = []
    # Gradients are never needed for feature extraction; disabling them saves memory.
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], batch_size):
            stop = start + batch_size
            outputs = encoder(
                input_ids=input_ids[start:stop].to(target_device),
                attention_mask=attention_mask[start:stop].to(target_device),
            )
            # Position 0 of the sequence axis is the [CLS] token; move it to the
            # host immediately so device memory is released between batches.
            cls_vectors.append(outputs.last_hidden_state[:, 0, :].cpu())
    if not cls_vectors:
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)
    return torch.cat(cls_vectors, dim=0).numpy()


def process_chunk(chunk_path):
    """Embed one tokenized chunk file and save embeddings + labels as a compressed ``.npz``.

    The chunk file is read as a mapping with ``input_ids``, ``attention_mask`` and
    ``labels`` entries. All per-chunk tensors are locals of this function, so they
    are released when it returns or raises.

    Returns:
        The path of the archive that was written.
    """
    # SECURITY: torch.load unpickles the file -- only load chunk files you produced
    # yourself. map_location="cpu" keeps the full chunk in host memory (and lets
    # chunks saved from a GPU session load on a CPU-only machine); only the
    # current mini-batch is moved to `device`.
    data = torch.load(chunk_path, map_location="cpu")

    embeddings = embed_chunk(model, device, data['input_ids'], data['attention_mask'])
    labels = data['labels'].cpu().numpy()

    # NOTE: the source extension is kept, so 'chunk_7.pt' becomes 'embeddings_7.pt.npz'.
    # The naming is left unchanged so existing downstream loaders keep working.
    output_filename = os.path.basename(chunk_path).replace('chunk_', 'embeddings_') + '.npz'
    output_path = os.path.join(EMBEDDINGS_OUTPUT_DIR, output_filename)
    np.savez_compressed(output_path, embeddings=embeddings, labels=labels)
    return output_path


# --- Main Loop: Stream Chunks, Generate Embeddings, Save Incrementally ---
failed_chunks = []
for chunk_path in tqdm(tokenized_files, desc="Generating Embeddings from Chunks"):
    try:
        process_chunk(chunk_path)
    except Exception as e:
        # Keep going so one bad chunk does not abort the run, but remember the
        # failure so it is reported in the final summary instead of being lost
        # in the progress output.
        failed_chunks.append(chunk_path)
        print(f"Error processing file {chunk_path}: {type(e).__name__}: {e}")
    finally:
        # Runs on success and on failure, so memory is also reclaimed after an
        # error (e.g. an out-of-memory condition) before the next chunk starts.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


if failed_chunks:
    print("\n--- Embedding Generation Finished With Errors --- ⚠️")
    print(f"{len(failed_chunks)} of {len(tokenized_files)} chunks failed:")
    for failed_path in failed_chunks:
        print(f"  - {failed_path}")
else:
    print("\n--- Embedding Generation Complete --- ✅")
print(f"Final embeddings have been saved incrementally to the '{EMBEDDINGS_OUTPUT_DIR}' directory.")

Using device: cpu


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Found 251 tokenized chunks to process.


Generating Embeddings from Chunks:   0%|          | 0/251 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


--- Embedding Generation Complete --- ✅
Final embeddings have been saved incrementally to the 'data/codebert_embeddings/small_diffs' directory.
