# tokenizing large diffs

In [1]:
import csv
import torch
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
import os
import gc
import sys

In [2]:
# --- Configuration ---
# CSV whose rows are tokenized one at a time by the main loop below.
LARGE_DIFFS_CSV = "data/large_diffs.csv"
# Only the first row of this CSV is read; it supplies the column names used
# to interpret rows of LARGE_DIFFS_CSV.
SMALL_DIFFS_CSV = "data/small_diffs.csv"
# Output directory: one "large_item_<index>.pt" file is written per row, and
# the files already present here determine where a re-run resumes.
TOKENIZED_DATA_DIR = "data/tokenized_data_test4/large_diffs"
LABEL_ERROR_LOG_PATH = "logs/label_parsing_errors.log" # Append-mode log of rows whose label could not be parsed

In [3]:
# --- Setup ---
# Make sure the output directory exists (no error if it is already there)
# and load the tokenizer that the main loop applies to every diff.
os.makedirs(TOKENIZED_DATA_DIR, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# --- FIX 1: Increase the CSV field size limit to prevent crashes ---
# Start from the largest native integer; whenever csv rejects the value with
# OverflowError, shrink it tenfold and try again until one is accepted.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
    except OverflowError:
        max_int = int(max_int / 10)
    else:
        break
print("CSV field size limit increased to handle large fields.")

# --- Get header from the small_diffs file ---
# The column names are taken from the first row of SMALL_DIFFS_CSV. A missing
# file is reported and the exception is re-raised so the run stops here.
try:
    with open(SMALL_DIFFS_CSV, mode='r', encoding='utf-8') as f:
        header_reader = csv.reader(f)
        header = next(header_reader)
except FileNotFoundError:
    print(f"ERROR: '{SMALL_DIFFS_CSV}' not found.")
    raise
else:
    print("Header read successfully.")

# --- Checkpointing ---
# The resume point is the number of finished item files. Only completed
# "large_item_<n>.pt" files are counted, so stray directory entries (hidden
# files, notebook checkpoints, leftover ".tmp" files from an interrupted
# save) cannot cause unprocessed rows to be skipped.
processed_files_count = sum(
    1
    for name in os.listdir(TOKENIZED_DATA_DIR)
    if name.startswith("large_item_") and name.endswith(".pt")
)
print(f"Processing large diffs: Resuming, found {processed_files_count} already processed.")

# Opening a file in append mode does not create missing parent directories,
# so make sure the log directory exists before the log is opened.
log_dir = os.path.dirname(LABEL_ERROR_LOG_PATH)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# --- Main Memory-Safe Loop with Enhanced Logging ---
# newline='' leaves line-ending handling to the csv module, so newlines
# embedded inside quoted diff fields are passed through untranslated.
with open(LARGE_DIFFS_CSV, 'r', encoding='utf-8', newline='') as infile, \
     open(LABEL_ERROR_LOG_PATH, 'a', encoding='utf-8') as error_log:

    # First pass: count rows so the progress bar can display a total.
    # The total is cosmetic, so a parse/decoding problem here only disables it.
    try:
        total_rows = sum(1 for _ in csv.reader(infile))
    except (csv.Error, UnicodeError):
        total_rows = None
    finally:
        # Always rewind, even after a failed count, so processing starts at
        # the first row rather than wherever the counting pass stopped.
        infile.seek(0)

    # Use a fresh reader for the real pass so no parser state is carried
    # over from the counting pass.
    reader = csv.reader(infile)

    # Skip already processed rows
    for _ in range(processed_files_count):
        next(reader, None)

    progress = tqdm(
        reader,
        desc="Processing Large Diffs",
        initial=processed_files_count,
        total=total_rows,
    )
    # file_index continues from the checkpoint so output names stay aligned
    # with the row position in the CSV.
    for file_index, row in enumerate(progress, start=processed_files_count):
        # Pair values with the column names read earlier; zip() silently
        # drops extra values/columns if the lengths differ.
        row_dict = dict(zip(header, row))

        diff_text = row_dict.get('diff', '')
        commit_hash = row_dict.get('commit_hash', 'UNKNOWN_HASH')

        # --- FIX 2: Enhanced Label Parsing and Logging ---
        label_str = row_dict.get('is_bug_introducing', '0')
        try:
            label = int(float(label_str))
        except (ValueError, TypeError, OverflowError):
            # ValueError: non-numeric text or NaN; OverflowError: +/-inf.
            # Log the commit hash and bad label, then fall back to 0.
            error_log.write(f"Commit: {commit_hash}, Unparseable_Label: '{label_str}'\n")
            label = 0  # Default to 0 and continue

        # Tokenize just this one diff
        encoding = tokenizer(
            str(diff_text),
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )

        # Save the single tokenized item to its own file. Write to a temporary
        # name first and rename afterwards, so an interrupted save never
        # leaves a truncated "large_item_<n>.pt" that the checkpoint would
        # count as finished.
        chunk_path = os.path.join(TOKENIZED_DATA_DIR, f"large_item_{file_index}.pt")
        tmp_path = chunk_path + ".tmp"
        torch.save({
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'labels': torch.tensor([label])
        }, tmp_path)
        os.replace(tmp_path, chunk_path)

        # Explicit collection after every item keeps peak memory low.
        gc.collect()

print("\nLarge diff processing complete. ✅")
print(f"Any label parsing errors have been logged to '{LABEL_ERROR_LOG_PATH}'.")

CSV field size limit increased to handle large fields.
Header read successfully.
Processing large diffs: Resuming, found 0 already processed.


Processing Large Diffs:   0%|          | 0/479 [00:00<?, ?it/s]


Large diff processing complete. ✅
Any label parsing errors have been logged to 'logs/label_parsing_errors.log'.
