# Tokenizing small diffs

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
import os
import json
import gc

In [2]:
# --- Configuration ---
# Settings read by the setup and processing cells below.
CHUNK_SIZE = 500  # number of CSV rows read (and saved to one output file) per iteration
TOKENIZED_DATA_DIR = "tokenized_data_test/small_diffs"  # directory that receives the chunk_<n>.pt files
INPUT_CSV = "small_diffs.csv"  # CSV file whose rows are tokenized

In [3]:
# --- Setup ---
# Make sure the output directory exists and load the tokenizer used for every chunk.
os.makedirs(TOKENIZED_DATA_DIR, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# --- Checkpointing: Determine where to resume from ---
# Collect the numeric indices of the chunk_<index>.pt files already on disk.
# Files whose middle part is not a plain number are ignored.
saved_chunk_indices = set()
for file_name in os.listdir(TOKENIZED_DATA_DIR):
    if file_name.startswith('chunk_') and file_name.endswith('.pt'):
        index_text = file_name[len('chunk_'):-len('.pt')]
        if index_text.isdecimal():
            saved_chunk_indices.add(int(index_text))

# Resume at the first index that is missing, i.e. after the unbroken run
# chunk_0, chunk_1, ... Counting files instead would pick the wrong starting
# row whenever there is a gap or a stray file in the directory.
processed_chunks = 0
while processed_chunks in saved_chunk_indices:
    processed_chunks += 1

start_chunk = processed_chunks
# NOTE: this row offset is only valid if CHUNK_SIZE is the same as in the run
# that produced the existing chunk files.
rows_to_skip = start_chunk * CHUNK_SIZE
print(f"Processing small diffs: Resuming from chunk #{start_chunk}.")

Processing small diffs: Resuming from chunk #0.


In [4]:
# --- Main Resumable Loop for Small Diffs ---
# Streams INPUT_CSV in CHUNK_SIZE-row pieces, tokenizes the `diff` column of each
# piece and saves one chunk_<index>.pt file per piece containing input_ids,
# attention_mask and labels. Rows of chunks that are already on disk are skipped
# through `rows_to_skip`.
try:
    # skiprows starts at 1 so the header line (row 0) is kept while the data rows
    # that were tokenized in an earlier run are skipped. The file is opened by this
    # call, so only this call is guarded: a missing-path error raised later (for
    # example while saving) must not be reported as a missing input file.
    csv_reader = pd.read_csv(INPUT_CSV, chunksize=CHUNK_SIZE, skiprows=range(1, rows_to_skip + 1))
except FileNotFoundError:
    print(f"ERROR: Input file not found at '{INPUT_CSV}'")
else:
    total_rows_processed = 0
    try:
        for i, chunk_df in enumerate(tqdm(csv_reader, desc="Processing Small Diffs")):
            if chunk_df.empty:
                # Nothing to tokenize; avoid calling the tokenizer with an empty batch.
                continue

            current_chunk_index = start_chunk + i

            # fillna first: astype(str) alone would turn a missing diff into the
            # literal text "nan", which would then be tokenized as if it were code.
            diff_texts = chunk_df['diff'].fillna("").astype(str).tolist()
            labels = chunk_df['is_bug_introducing'].tolist()

            # Every row is truncated/padded to exactly 512 tokens so all chunks
            # share the same tensor width.
            encodings = tokenizer(diff_texts, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

            chunk_path = os.path.join(TOKENIZED_DATA_DIR, f"chunk_{current_chunk_index}.pt")
            # Write to a temporary name and rename afterwards, so an interrupted
            # run never leaves a truncated file under the final chunk_<index>.pt
            # name. The ".tmp" suffix keeps it out of any "chunk_*.pt" listing.
            temp_path = chunk_path + ".tmp"
            torch.save({
                'input_ids': encodings['input_ids'],
                'attention_mask': encodings['attention_mask'],
                'labels': torch.tensor(labels)
            }, temp_path)
            os.replace(temp_path, chunk_path)

            total_rows_processed += len(chunk_df)
            # Release the per-chunk objects before the next chunk is read.
            del diff_texts, labels, encodings, chunk_df
            gc.collect()
    finally:
        # Release the CSV file handle even if tokenizing or saving fails.
        csv_reader.close()

    print(f"\nSmall diff processing complete. ✅")
    print(f"Rows tokenized in this run: {total_rows_processed}")

Processing Small Diffs: 0it [00:00, ?it/s]


Small diff processing complete. ✅
