In [None]:
import glob
import os
import re

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# --- Configuration ---
# Rows read per CSV chunk. Has to equal the chunk size that was used when the
# embedding files were generated, otherwise chunks and files will not line up.
CHUNK_SIZE = 100

# Inputs: the original CSV and the directory that is searched for '*.npz' embedding files.
ORIGINAL_DATA_CSV = "../data/small_diffs.csv"
EMBEDDINGS_DIR = "../data/codebert_embeddings/small_diffs"

# Output: destination of the combined CSV (original columns + embedding columns).
FINAL_EMBEDDING_CSV_PATH = "../data/final_embedding_dataset.csv"

In [None]:
print("Starting the process to create the final CSV with embeddings...")


def _natural_sort_key(path):
    """Return a sort key that compares digit runs in the file name as integers.

    Plain lexicographic sorting would place e.g. 'chunk_10.npz' before
    'chunk_2.npz' and pair embedding files with the wrong CSV chunks.
    Equal-width zero-padded names keep the same order as before.
    The full path is used as a tie-breaker so the ordering is deterministic.
    """
    parts = re.split(r'([0-9]+)', os.path.basename(path))
    # With a capturing group, re.split places the digit runs at the odd positions,
    # so every position holds the same type (str or int) across all keys.
    return [int(part) if pos % 2 else part for pos, part in enumerate(parts)], path


# --- Step 1: Get the Embedding File List and Prepare the CSV Reader ---
# The embedding files are checked first so that no CSV handle is left open
# when there is nothing to combine.
embedding_files = sorted(glob.glob(os.path.join(EMBEDDINGS_DIR, '*.npz')), key=_natural_sort_key)

if not embedding_files:
    print(f"ERROR: No embedding files found in '{EMBEDDINGS_DIR}'. Please generate embeddings first.")
    raise FileNotFoundError("No embedding files found. Please generate embeddings first.")

try:
    # This reader yields the original CSV in chunks of CHUNK_SIZE rows.
    # NOTE(review): nrows=10000 caps the run at the first 10,000 rows, so any
    # embedding files beyond that point are not used -- confirm this is intended.
    csv_reader = pd.read_csv(ORIGINAL_DATA_CSV, chunksize=CHUNK_SIZE, nrows=10000)
except FileNotFoundError:
    print(f"ERROR: Could not find '{ORIGINAL_DATA_CSV}' or embedding files. Please check paths.")
    # Stop if files are missing
    raise

# --- Step 2: Process Chunks and Write Them to the Final CSV ---
# Each iteration pairs one embedding file with the next chunk of the original CSV.
chunks_written = 0
try:
    # embedding_files comes first in zip() so that, when the file list runs out,
    # zip stops without pulling (and discarding) another chunk from the reader.
    chunk_pairs = zip(embedding_files, csv_reader)
    for embedding_file_path, original_chunk_df in tqdm(
        chunk_pairs, total=len(embedding_files), desc="Combining Data"
    ):
        # Load the embeddings for the current chunk.
        # The labels are already in original_chunk_df, so they are not read here.
        with np.load(embedding_file_path) as data:
            embeddings = data['embeddings']

        # A row-count mismatch would otherwise be padded with NaN by concat(axis=1)
        # and silently misalign embeddings with their source rows.
        if embeddings.ndim != 2 or embeddings.shape[0] != len(original_chunk_df):
            raise ValueError(
                f"Embeddings in '{embedding_file_path}' have shape {embeddings.shape}, "
                f"but the matching CSV chunk has {len(original_chunk_df)} rows. "
                "Check CHUNK_SIZE and the order of the embedding files."
            )

        # One column per embedding dimension: 'emb_0', 'emb_1', ...
        embedding_df = pd.DataFrame(
            embeddings, columns=[f'emb_{j}' for j in range(embeddings.shape[1])]
        )

        # CSV chunks carry a running index; reset it so both frames align on 0..n-1.
        combined_chunk_df = pd.concat(
            [original_chunk_df.reset_index(drop=True), embedding_df], axis=1
        )

        # The first chunk overwrites any file left over from a previous run and
        # writes the header; later chunks are appended without a header.
        is_first_chunk = chunks_written == 0
        combined_chunk_df.to_csv(
            FINAL_EMBEDDING_CSV_PATH,
            mode='w' if is_first_chunk else 'a',
            header=is_first_chunk,
            index=False,
        )
        chunks_written += 1

    # Only probe the reader when every embedding file was consumed; otherwise the
    # reader is already exhausted and must not be advanced again.
    csv_rows_left = (
        chunks_written == len(embedding_files) and next(csv_reader, None) is not None
    )
finally:
    # Release the file handle held by the chunked reader.
    csv_reader.close()

if chunks_written == 0:
    raise RuntimeError(
        f"No chunks were combined: '{ORIGINAL_DATA_CSV}' yielded no rows. "
        f"'{FINAL_EMBEDDING_CSV_PATH}' was not written."
    )

if chunks_written < len(embedding_files):
    print(
        f"NOTE: only {chunks_written} of {len(embedding_files)} embedding files were used; "
        "the CSV reader ran out of rows first."
    )
if csv_rows_left:
    print(
        "WARNING: the CSV still had rows after the last embedding file; "
        "those rows are missing from the final dataset."
    )

print("\n--- Process Complete --- ✅")
print(f"Final dataset with commit hashes, labels, and embeddings saved to '{FINAL_EMBEDDING_CSV_PATH}'.")

# You can now load this new CSV to train your XGBoost or other models.
# For example, to check the first few rows and columns:
# final_df = pd.read_csv(FINAL_EMBEDDING_CSV_PATH)
# display(final_df.head())

Starting the process to create the final CSV with embeddings...


Combining Data: 0it [00:00, ?it/s]


--- Process Complete --- ✅
Final dataset with commit hashes, labels, and embeddings saved to 'final_embedding_dataset.csv'.
