# Data preparation for CodeBERT embeddings

In [None]:
import git
import numpy as np
import pandas as pd
import torch
from loguru import logger
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer

# Logger config: register a file sink in addition to loguru's default handler.
LOG_FILE = "logs/diff_extraction.log"
logger.add(LOG_FILE)

logger.info("Libraries imported and logger configured for Feature Extraction.")

Load the final labeled training dataset

In [None]:
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"

# Only the commit identifier and its label are kept; rows missing either
# value are dropped.
KEPT_COLUMNS = ['commit_hash', 'is_bug_introducing']
df = pd.read_csv(FINAL_DATASET_PATH)[KEPT_COLUMNS].dropna()

Function to get the diff of a commit against its first parent

In [None]:
# Git repository that commit hashes are resolved against (see
# `repo.commit(...)` in get_commit_diff). The path is relative to the
# directory the notebook is run from.
REPO_PATH = "../../ballerina-lang/" # Make sure this path is correct
repo = git.Repo(REPO_PATH)

def get_commit_diff(commit_hash):
    """Return the textual diff of a commit against its first parent.

    Args:
        commit_hash: Revision identifier, resolved with ``repo.commit``.

    Returns:
        The output of ``git diff`` between the first parent and the commit,
        run with ``--no-color --unified=0`` (no context lines). Returns ``""``
        when the commit has no parent, or when the lookup or the diff fails;
        failures are logged and never raised to the caller.
    """
    try:
        commit = repo.commit(commit_hash)
        if not commit.parents:
            return ""  # No parent, no diff
        # Only the first parent is used, so additional parents of a merge
        # commit are ignored.
        return repo.git.diff(commit.parents[0], commit, '--no-color', '--unified=0')
    except Exception as exc:
        # Best effort: one bad or missing commit must not abort a long run,
        # but the cause is logged so failures can be diagnosed afterwards.
        logger.error(f"Error retrieving diff for commit {commit_hash}: {exc!r}")
        return ""  # Handle errors or missing commits

In [None]:
# Load previous progress
df = pd.read_pickle('temp_backup.pkl')  # or 'commits_with_diffs.pkl'
print(f"Loaded {len(df)} previously processed commits")

Extract the diff of each commit in small batches, saving periodic backups

In [None]:
# Extract the diff of every commit in `df`, chunk by chunk, keeping only the
# commits whose diff is non-empty. Intermediate results are pickled regularly
# so that an interrupted run does not lose the work done so far.

print("Processing commits in small batches to avoid memory issues...")

# Process in small chunks and save frequently
chunk_size = 30  # Very small to be safe
backup_every = 100  # Minimum number of new results between two backups
all_results = []
last_backup_size = 0  # len(all_results) when the last backup was written

for i in tqdm(range(0, len(df), chunk_size), desc="Processing"):
    chunk = df.iloc[i:i+chunk_size]

    for _, row in chunk.iterrows():
        try:
            diff = get_commit_diff(row['commit_hash'])
        except Exception:
            # Best effort: skip problematic commits, but leave a trace in the
            # log. Unlike a bare `except:`, this does not swallow
            # KeyboardInterrupt, so the loop can still be stopped by hand.
            logger.exception(f"Skipping commit {row.get('commit_hash')}")
            continue

        if diff.strip():  # Only keep non-empty diffs
            new_row = row.copy()
            new_row['diff'] = diff
            all_results.append(new_row)

    # Save progress once at least `backup_every` new commits were collected
    # since the last backup. An exact `len(all_results) % 100 == 0` test would
    # almost never be true here, because it only runs once per chunk.
    if len(all_results) - last_backup_size >= backup_every:
        pd.DataFrame(all_results).to_pickle('temp_backup.pkl')
        last_backup_size = len(all_results)
        print(f"💾 Backup saved - {len(all_results)} commits processed")

# Final result
if all_results:
    df = pd.DataFrame(all_results).reset_index(drop=True)
    df.to_pickle('commits_with_diffs.pkl')  # Save final version
    print(f"✅ Success! {len(df)} commits with diffs")
else:
    print("❌ No commits with diffs found")
    df = pd.DataFrame()

In [None]:

# Single-pass variant: compute the diff of every commit with a progress bar.
# Note: This can be slow. You might want to run it once and save the results.
tqdm.pandas()  # Enables progress_apply
print("Extracting diffs for each commit...")

commit_diffs = df['commit_hash'].progress_apply(get_commit_diff)
df['diff'] = commit_diffs

# get_commit_diff returns "" when there is no diff or on error; drop those rows.
has_diff = df['diff'].ne("")
df = df[has_diff]

In [None]:
# Persist the labeled commits together with their diffs (row index not written).
DIFFS_DATASET_PATH = "../data/final/final_labeled_training_dataset_with_diffs.csv"
df.to_csv(DIFFS_DATASET_PATH, index=False)

In [None]:

# Use a chronological split (assuming df is still sorted by date):
# with shuffle=False the first 80% of the rows train, the last 20% test.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.2)

train_texts = train_df['diff'].tolist()
train_labels = train_df['is_bug_introducing'].tolist()
test_texts = test_df['diff'].tolist()
test_labels = test_df['is_bug_introducing'].tolist()

# --- 4. Tokenize the data ---
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Same tokenizer settings for both splits: truncate to at most 512 tokens
# and pad the sequences of each call to a common length.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

# --- 5. Create a PyTorch Dataset object ---
class BugDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per sample.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample.
        labels: Indexable collection of labels; its length defines the size
            of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a BugDataset.
train_dataset, test_dataset = (
    BugDataset(train_encodings, train_labels),
    BugDataset(test_encodings, test_labels),
)

print("Data prepared and tokenized for CodeBERT.")