In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
import os
import json

In [None]:
# --- Configuration ---
# Source CSV that the pre-processing step reads chunk by chunk.
LARGE_CSV_PATH = "final_dataset_with_diffs.csv"
# Output directory that receives one saved file per processed chunk.
TOKENIZED_DATA_DIR = "tokenized_data"
# Number of CSV rows read and tokenized per chunk.
CHUNK_SIZE = 100

: 

In [None]:
# --- Setup ---
# Streams the CSV in CHUNK_SIZE-row pieces, tokenizes the 'diff' column of
# each piece, and saves the resulting tensors (plus labels) to one .pt file
# per chunk inside TOKENIZED_DATA_DIR. A metadata.json summary is written last.
os.makedirs(TOKENIZED_DATA_DIR, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
total_rows = 0
# Counted explicitly: the loop variable `i` stays unbound when the CSV yields
# no chunks, which would make the final summary line raise NameError.
num_chunks = 0

print(f"Starting pre-processing of '{LARGE_CSV_PATH}'...")
# Create a data reader that yields chunks of the CSV. The context manager
# closes the underlying file handle even if tokenization fails part-way.
with pd.read_csv(LARGE_CSV_PATH, chunksize=CHUNK_SIZE) as csv_reader:
    for i, chunk_df in enumerate(tqdm(csv_reader, desc="Processing Chunks")):
        # NOTE(review): astype(str) renders missing diffs as the literal text
        # "nan"; confirm that is acceptable or clean the column upstream.
        diff_texts = chunk_df['diff'].astype(str).tolist()
        labels = chunk_df['is_bug_introducing'].tolist()

        # Tokenize the batch of diffs
        encodings = tokenizer(
            diff_texts,
            truncation=True,
            padding="max_length",  # Pad to a uniform length
            max_length=512
        )

        # Prepare data as PyTorch tensors
        input_ids = torch.tensor(encodings['input_ids'])
        attention_mask = torch.tensor(encodings['attention_mask'])
        labels_tensor = torch.tensor(labels)

        # Save the processed chunk to a file named after its position in the CSV
        chunk_path = os.path.join(TOKENIZED_DATA_DIR, f"chunk_{i}.pt")
        torch.save({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels_tensor
        }, chunk_path)

        total_rows += len(chunk_df)
        num_chunks += 1

# Save metadata about our dataset. 'num_chunks' lets a loader ignore stale
# chunk_*.pt files that an earlier, larger run may have left in the directory.
metadata = {
    'total_samples': total_rows,
    'chunk_size': CHUNK_SIZE,
    'num_chunks': num_chunks,
}
with open(os.path.join(TOKENIZED_DATA_DIR, 'metadata.json'), 'w') as f:
    json.dump(metadata, f)

print(f"\nPre-processing complete. ✅ Saved {num_chunks} chunks to '{TOKENIZED_DATA_DIR}'.")

Starting pre-processing of 'final_dataset_with_diffs.csv'...


Processing Chunks: 0it [00:00, ?it/s]