# Tokenizing small diffs

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
import os
import gc

In [None]:
# --- Configuration ---
# Source CSV containing the small diffs to tokenize.
INPUT_CSV = "../small_diffs.csv"
# Folder that receives one tensor file per processed chunk.
TOKENIZED_DATA_DIR = "../tokenized_data_test/small_diffs"
# Number of CSV rows tokenized and saved together.
CHUNK_SIZE = 500

In [3]:
# --- Setup ---
# Create the output folder (no-op when it already exists) and load the tokenizer.
os.makedirs(TOKENIZED_DATA_DIR, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# --- Checkpointing: Determine where to resume from ---
# The number of chunk_*.pt files already in the output folder is used as the
# index of the next chunk; the matching number of CSV rows is skipped on read.
existing_files = []
for entry in os.listdir(TOKENIZED_DATA_DIR):
    if entry.startswith('chunk_') and entry.endswith('.pt'):
        existing_files.append(entry)
processed_chunks = len(existing_files)

start_chunk = processed_chunks
rows_to_skip = CHUNK_SIZE * start_chunk
print(f"Processing small diffs: Resuming from chunk #{start_chunk}.")

Processing small diffs: Resuming from chunk #0.


In [4]:
# --- Main Resumable Loop for Small Diffs ---
# Streams INPUT_CSV in CHUNK_SIZE-row pieces, tokenizes the 'diff' column of each
# piece and stores the tensors plus the 'is_bug_introducing' labels as
# chunk_<index>.pt inside TOKENIZED_DATA_DIR.
try:
    # Row 0 is the header; rows 1..rows_to_skip are skipped as already processed.
    csv_reader = pd.read_csv(INPUT_CSV, chunksize=CHUNK_SIZE, skiprows=range(1, rows_to_skip + 1))
except FileNotFoundError:
    # Only opening the input CSV is guarded, so an unrelated missing path later on
    # is not misreported as a missing input file.
    print(f"ERROR: Input file not found at '{INPUT_CSV}'")
else:
    total_rows_processed = 0
    for i, chunk_df in enumerate(tqdm(csv_reader, desc="Processing Small Diffs")):
        current_chunk_index = start_chunk + i

        diff_texts = chunk_df['diff'].astype(str).tolist()
        labels = chunk_df['is_bug_introducing'].tolist()

        # Every sequence is truncated/padded to exactly 512 tokens.
        encodings = tokenizer(diff_texts, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

        chunk_path = os.path.join(TOKENIZED_DATA_DIR, f"chunk_{current_chunk_index}.pt")
        # Write to a temporary name first and rename afterwards. The resume logic
        # counts files ending in '.pt', so a save interrupted half-way must never
        # leave a truncated file under the final name.
        tmp_path = chunk_path + ".tmp"
        torch.save({
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': torch.tensor(labels)
        }, tmp_path)
        os.replace(tmp_path, chunk_path)

        total_rows_processed += len(chunk_df)
        # Release the chunk's data before reading the next one.
        del diff_texts, labels, encodings, chunk_df
        gc.collect()

    print(f"\nSmall diff processing complete. ✅")
    # Makes a run that processed nothing distinguishable from a real one.
    print(f"Rows tokenized in this run: {total_rows_processed}")

Processing Small Diffs: 0it [00:00, ?it/s]


Small diff processing complete. ✅


# New tokenizing + embedding
The previous method returns so many null values.

In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
import os
import gc
import numpy as np

In [None]:
# --- Configuration ---
CLEAN_SOURCE_CSV = "../data/final_dataset_with_full_diffs_CLEANED.csv"
FINAL_OUTPUT_DIR = "../data/final_embeddings_sliding_window"
ERROR_LOG_PATH = "../logs/embedding_errors.log"
CHUNK_SIZE = 10 # How many rows from the CSV to process at a time
MODEL_NAME = "microsoft/codebert-base"

# --- Sliding Window Configuration ---
MAX_LENGTH = 512  # The model's max token length
OVERLAP = 50      # How many tokens to overlap between chunks
# Each window carries MAX_LENGTH - 2 content tokens (two slots are taken by the
# special tokens added around every window), so the step must be measured from
# that content size for consecutive windows to share exactly OVERLAP tokens.
STRIDE = (MAX_LENGTH - 2) - OVERLAP # The step size for the window

In [4]:
# --- Setup ---
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference mode

# --- Checkpointing ---
# Resume right after the highest batch_<index>.npz found on disk. Counting every
# directory entry is unreliable: unrelated files in the folder inflate the count,
# and a chunk whose rows all failed writes no file at all, which deflates it.
batch_indices = []
for entry in os.listdir(FINAL_OUTPUT_DIR):
    stem, ext = os.path.splitext(entry)
    index_text = stem[len("batch_"):]
    if ext == ".npz" and stem.startswith("batch_") and index_text.isdigit():
        batch_indices.append(int(index_text))

processed_chunks = max(batch_indices) + 1 if batch_indices else 0
rows_to_skip = processed_chunks * CHUNK_SIZE
print(f"Resuming: Found {processed_chunks} processed embedding batches. Skipping first {rows_to_skip} rows.")

# --- Main Resumable Loop ---
# For every CSV chunk: embed each row's 'diff' with the model, collect the vectors
# together with the 'is_bug_introducing' labels, and save them as
# batch_<index>.npz in FINAL_OUTPUT_DIR. Rows that raise are logged and skipped.

# Cap on the number of rows read in this run (trial setting). Use None to read
# the whole file.
row_limit = 10
# Content tokens per window; two positions are reserved for the special tokens.
window_size = MAX_LENGTH - 2

# The error log is opened from inside an exception handler, so its folder has to
# exist up front; otherwise the logging itself would raise FileNotFoundError.
log_dir = os.path.dirname(ERROR_LOG_PATH)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

try:
    csv_reader = pd.read_csv(CLEAN_SOURCE_CSV, chunksize=CHUNK_SIZE, skiprows=range(1, rows_to_skip + 1), nrows=row_limit)
except FileNotFoundError:
    # Only the opening of the source CSV is guarded by this message.
    print(f"ERROR: Clean file not found at '{CLEAN_SOURCE_CSV}'. Please run the cleaning step first.")
else:
    for i, chunk_df in enumerate(tqdm(csv_reader, desc="Processing CSV Chunks")):
        current_chunk_index = processed_chunks + i

        batch_embeddings = []
        batch_labels = []

        # Iterate through each row in the current pandas chunk
        for _, row in chunk_df.iterrows():
            try:
                diff_text = str(row['diff'])
                label = int(row['is_bug_introducing'])

                # Tokenize the entire diff once, without truncation or special tokens
                all_input_ids = tokenizer.encode(diff_text, add_special_tokens=False)

                # --- Sliding Window Logic ---
                # A diff that fits in one window yields exactly one iteration, so
                # short and long diffs share this path (no second tokenization and
                # no padding to MAX_LENGTH; the position-0 vector should match the
                # padded variant up to float rounding).
                window_embeddings = []
                for start in range(0, max(len(all_input_ids), 1), STRIDE):
                    end = start + window_size
                    window_ids = all_input_ids[start:end]

                    # Add special tokens and convert to a (1, tokens) tensor
                    input_tensor = torch.tensor([tokenizer.cls_token_id] + window_ids + [tokenizer.sep_token_id]).unsqueeze(0).to(device)

                    with torch.no_grad():
                        outputs = model(input_tensor)
                    # Keep the hidden state at position 0 as this window's embedding
                    window_embeddings.append(outputs.last_hidden_state[:, 0, :])

                    # Stop once the window reaches the end of the diff. Another
                    # step would only re-embed tokens this window already covers
                    # and would bias the average below.
                    if end >= len(all_input_ids):
                        break

                # --- Pooling Step ---
                # Average the embeddings of all windows to get a single vector
                final_embedding = torch.mean(torch.stack(window_embeddings), dim=0).cpu().numpy()

                batch_embeddings.append(final_embedding.flatten())
                batch_labels.append(label)

            except Exception as e:
                # Best effort: record the failure and continue with the next row.
                with open(ERROR_LOG_PATH, 'a', encoding='utf-8') as f:
                    f.write(f"Error processing commit {row.get('commit_hash', 'N/A')}: {e}\n")

        # Save the collected embeddings and labels for this chunk
        if batch_embeddings:
            output_path = os.path.join(FINAL_OUTPUT_DIR, f"batch_{current_chunk_index}.npz")
            np.savez_compressed(output_path, embeddings=np.array(batch_embeddings), labels=np.array(batch_labels))

        # Clean up memory
        del batch_embeddings, batch_labels, chunk_df
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("\n--- Embedding Generation Complete --- ✅")

Using device: cpu
Resuming: Found 0 processed embedding batches. Skipping first 0 rows.


Processing CSV Chunks: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (145865 > 512). Running this sequence through the model will result in indexing errors



--- Embedding Generation Complete --- ✅


In [None]:
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm

# --- Configuration ---
EMBEDDINGS_DIR = FINAL_OUTPUT_DIR # Use the directory from the previous cell
OUTPUT_CSV_PATH = "../data/final_embeddings_sliding_window.csv"


def _npz_sort_key(file_name):
    """Sort key that orders '<prefix>_<number>.npz' files by their number.

    Files whose name does not end in an integer are placed after the numbered
    ones, ordered by name. Numeric ordering matters because plain string
    ordering would put 'batch_10.npz' before 'batch_2.npz'.
    """
    stem = file_name[:-len('.npz')]
    tail = stem.rsplit('_', 1)[-1]
    if tail.isdigit():
        return (0, int(tail), file_name)
    return (1, 0, file_name)


# --- Logic ---
# Get a list of all the .npz files generated in the previous step.
# os.listdir returns entries in arbitrary order, so they are sorted by batch
# number to keep the CSV rows in a deterministic order.
try:
    npz_files = sorted(
        (f for f in os.listdir(EMBEDDINGS_DIR) if f.endswith('.npz')),
        key=_npz_sort_key,
    )
except FileNotFoundError:
    # Only the directory listing is guarded by this message.
    print(f"ERROR: The directory '{EMBEDDINGS_DIR}' was not found.")
else:
    if not npz_files:
        print(f"No .npz files found in '{EMBEDDINGS_DIR}'. Nothing to convert.")
    else:
        print(f"Found {len(npz_files)} .npz files to process.")

        list_of_dfs = []

        # Loop through each file, load it, and convert to a DataFrame
        for file_name in tqdm(npz_files, desc="Converting .npz to DataFrame chunks"):
            file_path = os.path.join(EMBEDDINGS_DIR, file_name)

            with np.load(file_path) as data:
                embeddings = data['embeddings']
                labels = data['labels']

            # One column per embedding dimension, with the label as first column
            chunk_df = pd.DataFrame(embeddings, columns=[f'embedding_{i}' for i in range(embeddings.shape[1])])
            chunk_df.insert(0, 'label', labels)
            list_of_dfs.append(chunk_df)

        # Concatenate all the small DataFrames into one large DataFrame
        print("Concatenating all chunks into the final DataFrame...")
        final_df = pd.concat(list_of_dfs, ignore_index=True)

        # Save the final DataFrame to a CSV file
        final_df.to_csv(OUTPUT_CSV_PATH, index=False)

        print(f"\nSuccessfully converted and saved data to '{OUTPUT_CSV_PATH}' ✅")
        print(f"Final DataFrame shape: {final_df.shape}")


Found 1 .npz files to process.


Converting .npz to DataFrame chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Concatenating all chunks into the final DataFrame...

Successfully converted and saved data to 'data/final_embeddings_sliding_window.csv' ✅
Final DataFrame shape: (10, 769)


In [14]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
import os
import gc
import numpy as np

In [None]:
# --- Configuration ---
CLEAN_SOURCE_CSV = "../data/final_dataset_with_full_diffs_CLEANED.csv"
FINAL_OUTPUT_CSV = "../data/final_embeddings_sliding_window/final_dataset_with_embeddings.csv" # The single, final output file
ERROR_LOG_PATH = "../logs/embedding_errors.log"
CHUNK_SIZE = 32  # CSV rows read and written per iteration (64 was the earlier setting)
MODEL_NAME = "microsoft/codebert-base"

# --- Sliding Window Configuration ---
MAX_LENGTH = 512  # Longest token sequence fed to the model, special tokens included
OVERLAP = 50      # Tokens shared by two consecutive windows
# A window holds MAX_LENGTH - 2 content tokens (two slots go to the special
# tokens), so the step is derived from that size to get an overlap of exactly
# OVERLAP tokens.
STRIDE = (MAX_LENGTH - 2) - OVERLAP

In [16]:
# --- Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference mode

# --- Checkpointing: Determine where to resume from by counting rows in the output CSV ---
# Rows are counted with the CSV parser rather than by physical lines: a quoted
# field that contains a line break spans several lines but is still one record,
# and a line count would then skip too many source rows on resume.
rows_to_skip = 0
if os.path.exists(FINAL_OUTPUT_CSV):
    # A zero-byte file has no header to parse; treat it as "nothing processed".
    if os.path.getsize(FINAL_OUTPUT_CSV) > 0:
        rows_to_skip = sum(
            len(part)
            # Only the first column is materialised; the header is excluded by the parser.
            for part in pd.read_csv(FINAL_OUTPUT_CSV, usecols=[0], chunksize=100_000)
        )
    print(f"Resuming: Found {rows_to_skip} rows already processed in '{FINAL_OUTPUT_CSV}'.")


Using device: cpu


In [17]:
# Manual resume point: replaces whatever value rows_to_skip held before this cell ran.
rows_to_skip = 50_000

In [None]:
# --- Main Resumable Loop ---
# For every CSV chunk: embed each row's 'diff' with the model, replace the 'diff'
# column by the embedding columns emb_0..emb_<hidden_size-1>, and append the
# result to FINAL_OUTPUT_CSV. A row that raises gets an all-zero vector so the
# output keeps one line per input row; the error is written to ERROR_LOG_PATH.

# Cap on the number of rows read in this run (trial setting). Use None to read
# the whole file.
row_limit = 100
# Content tokens per window; two positions are reserved for the special tokens.
window_size = MAX_LENGTH - 2
hidden_size = model.config.hidden_size

# Both files are opened in append mode later on, so their folders must exist.
# The log in particular is opened inside an exception handler, where a missing
# folder would raise a second error.
for target_path in (ERROR_LOG_PATH, FINAL_OUTPUT_CSV):
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

try:
    csv_reader = pd.read_csv(CLEAN_SOURCE_CSV, chunksize=CHUNK_SIZE, skiprows=range(1, rows_to_skip + 1), nrows=row_limit)
except FileNotFoundError:
    # Only the opening of the source CSV is guarded by this message.
    print(f"ERROR: Clean file not found at '{CLEAN_SOURCE_CSV}'. Please run the cleaning step first.")
else:
    # We write the header only if the file is new or empty. This is decided from
    # the file itself, not from rows_to_skip, so a manually chosen resume point
    # still produces a header and a header-only file does not get a second one.
    write_header = (not os.path.exists(FINAL_OUTPUT_CSV)) or os.path.getsize(FINAL_OUTPUT_CSV) == 0

    for chunk_df in tqdm(csv_reader, desc="Generating Embeddings"):
        batch_embeddings = []

        for _, row in chunk_df.iterrows():
            try:
                diff_text = str(row['diff'])
                all_input_ids = tokenizer.encode(diff_text, add_special_tokens=False)

                # Sliding Window Logic
                # A diff that fits in one window yields exactly one iteration, so
                # short and long diffs share this path (no second tokenization and
                # no padding to MAX_LENGTH; the position-0 vector should match the
                # padded variant up to float rounding).
                window_embeddings = []
                for start in range(0, max(len(all_input_ids), 1), STRIDE):
                    end = start + window_size
                    window_ids = all_input_ids[start:end]
                    input_tensor = torch.tensor([tokenizer.cls_token_id] + window_ids + [tokenizer.sep_token_id]).unsqueeze(0).to(device)
                    with torch.no_grad():
                        outputs = model(input_tensor)
                    # Keep the hidden state at position 0 as this window's embedding
                    window_embeddings.append(outputs.last_hidden_state[:, 0, :])

                    # Stop once the window reaches the end of the diff. Another
                    # step would only re-embed tokens this window already covers
                    # and would bias the average below.
                    if end >= len(all_input_ids):
                        break

                # Average the window embeddings into one vector per row
                final_embedding_vector = torch.mean(torch.stack(window_embeddings), dim=0).cpu().numpy().flatten()
                batch_embeddings.append(final_embedding_vector)

            except Exception as e:
                batch_embeddings.append(np.zeros(hidden_size)) # Append zero vector on error
                with open(ERROR_LOG_PATH, 'a', encoding='utf-8') as f:
                    f.write(f"Error on commit {row.get('commit_hash', 'N/A')}: {e}\n")

        # --- Combine original data with new embeddings ---
        embedding_df = pd.DataFrame(batch_embeddings, columns=[f'emb_{j}' for j in range(hidden_size)])

        # Fresh 0..n-1 index on the metadata so it lines up with embedding_df for
        # the side-by-side concatenation; built as a new frame instead of mutating
        # the chunk in place.
        metadata_df = chunk_df.drop(columns=['diff']).reset_index(drop=True)
        combined_chunk_df = pd.concat([metadata_df, embedding_df], axis=1)

        # --- Append the combined chunk to the final CSV ---
        combined_chunk_df.to_csv(FINAL_OUTPUT_CSV, mode='a', header=write_header, index=False)
        write_header = False # Ensure header is only written once

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("\n--- Final Dataset Generation Complete --- ✅")
    print(f"Dataset with all original metadata and embeddings saved to '{FINAL_OUTPUT_CSV}'.")

Generating Embeddings: 0it [00:00, ?it/s]