In [1]:
import pandas as pd
import git
import os
import csv
from tqdm.notebook import tqdm

In [None]:
# --- Configuration ---
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"
REPO_PATH = "../../ballerina-lang/"
FINAL_CSV_WITH_DIFFS_PATH = "final_dataset_with_diffs2.csv"
ERROR_LOG_PATH = "log/diff_errors.log"

os.environ['GIT_TERMINAL_PROMPT'] = '0' 

In [None]:
# --- Checkpointing: Find commits already processed ---
processed_hashes = set()
header = []
if os.path.exists(FINAL_CSV_WITH_DIFFS_PATH):
    print(f"Resuming from checkpoint file: '{FINAL_CSV_WITH_DIFFS_PATH}'")
    # Read only the commit_hash column to save memory
    processed_df = pd.read_csv(FINAL_CSV_WITH_DIFFS_PATH, usecols=['commit_hash'])
    processed_hashes = set(processed_df['commit_hash'])
    print(f"Found {len(processed_hashes)} commits already processed.")

# --- Load the full dataset and filter ---
df = pd.read_csv(FINAL_DATASET_PATH)
df_to_process = df[~df['commit_hash'].isin(processed_hashes)]
print(f"New commits to process: {len(df_to_process)}.")

Resuming from checkpoint file: 'final_dataset_with_diffs2.csv'
Found 57338 commits already processed.
New commits to process: 68487.


In [4]:
# --- Main Sequential Loop ---
repo = git.Repo(REPO_PATH)

# Open the output file once and write to it line by line
with open(FINAL_CSV_WITH_DIFFS_PATH, 'a', newline='', encoding='utf-8') as outfile, \
     open(ERROR_LOG_PATH, 'a', encoding='utf-8') as error_file:
    
    writer = csv.DictWriter(outfile, fieldnames=df.columns.tolist() + ['diff'])
    # Write header only if the file is new
    if not processed_hashes and os.path.getsize(FINAL_CSV_WITH_DIFFS_PATH) == 0:
        writer.writeheader()

    for _, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Processing Commits"):
        commit_hash = row['commit_hash']
        diff_text = ""
        try:
            commit = repo.commit(commit_hash)
            if commit.parents:
                parent = commit.parents[0]
                diff_text = repo.git.diff(parent, commit, '--no-color', '--unified=0')
        except Exception as e :
            error_file.write(f"Commit: {commit_hash}\nError: {str(e)}\n---\n")
        
        
        # Prepare the full record and write it to the CSV
        record = row.to_dict()
        record['diff'] = diff_text
        writer.writerow(record)

print("\n--- Process Complete ---")

Processing Commits:   0%|          | 0/68487 [00:00<?, ?it/s]

UnicodeEncodeError: 'utf-8' codec can't encode characters in position 1374-1376: surrogates not allowed

In [None]:
# --- Main Sequential Loop (Corrected for Unicode Errors) ---
repo = git.Repo(REPO_PATH)

with open(FINAL_CSV_WITH_DIFFS_PATH, 'a', newline='', encoding='utf-8') as outfile, \
     open(ERROR_LOG_PATH, 'a', encoding='utf-8') as error_file:
    
    writer = csv.DictWriter(outfile, fieldnames=df.columns.tolist() + ['diff'])
    if not processed_hashes and os.path.getsize(FINAL_CSV_WITH_DIFFS_PATH) == 0:
        writer.writeheader()

    for _, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Processing Commits"):
        commit_hash = row['commit_hash']
        diff_text = ""
        try :
            commit = repo.commit(commit_hash)
            if commit.parents:
                parent = commit.parents[0]
                diff_text = repo.git.diff(parent, commit, '--no-color', '--unified=0')
        except Exception as e :
            error_file.write(f"Commit: {commit_hash}\nError: {str(e)}\n---\n")
        
        # Clean the diff_text to replace any invalid unicode characters.
        cleaned_diff = diff_text.encode('utf-8', 'replace').decode('utf-8')
        
        # Prepare the full record and write it to the CSV
        record = row.to_dict()
        record['diff'] = cleaned_diff # Use the cleaned version
        writer.writerow(record)

print("\n--- Process Complete ---")

Processing Commits:   0%|          | 0/68487 [00:00<?, ?it/s]


--- Process Complete ---
