In [1]:
import pandas as pd
import git
import os
import numpy as np
import gc
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm.notebook import tqdm

In [None]:
# --- Configuration ---
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"
REPO_PATH = "../../ballerina-lang/"
BATCH_SIZE = 500
MAX_WORKERS = 10
MONGO_URI = "mongodb://localhost:27017/"
MONGO_DB_NAME = "bug_prediction_project_2"
MONGO_COLLECTION_NAME = "commits_with_diffs"
ERROR_LOG_PATH = "diff_errors.log"  # New file to store errors

# --- Environment variable to prevent Git hangs ---
os.environ["GIT_TERMINAL_PROMPT"] = "0"


# --- NEW Worker Function that returns errors ---
def get_commit_diff_worker(commit_hash, repo_path):
    """
    Robust worker that returns a (diff, error) tuple.
    """
    try:
        repo = git.Repo(repo_path)
        commit = repo.commit(commit_hash)
        if commit.parents:
            parent = commit.parents[0]
            diff_text = repo.git.diff(parent, commit, "--no-color", "--unified=0")
            return (diff_text, None)  # Success
        return ("", None)  # Success (no parent)
    except Exception as e:
        return (None, str(e))  # Failure, return the error message


# --- Connect to MongoDB and Checkpoint (Same as before) ---
print(f"Connecting to MongoDB...")
client = MongoClient(MONGO_URI)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
collection.create_index("commit_hash", unique=True)

processed_hashes = {
    doc["commit_hash"] for doc in collection.find({}, {"commit_hash": 1, "_id": 0})
}
print(f"Found {len(processed_hashes)} commits already in the database.")

df = pd.read_csv(FINAL_DATASET_PATH)
df_to_process = df[~df["commit_hash"].isin(processed_hashes)].copy()
print(f"New commits to process: {len(df_to_process)}.")


Connecting to MongoDB...
Found 0 commits already in the database.
New commits to process: 125825.


In [None]:
# --- Main Loop with Error Logging ---
df_batches = np.array_split(df_to_process, max(1, len(df_to_process) // BATCH_SIZE))
worker_with_path = partial(get_commit_diff_worker, repo_path=REPO_PATH)

for i, batch_df in enumerate(tqdm(df_batches, desc="Overall Progress")):
    batch_records = batch_df.to_dict("records")
    commit_hashes_in_batch = [rec["commit_hash"] for rec in batch_records]

    processed_records = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results_iterator = tqdm(
            executor.map(worker_with_path, commit_hashes_in_batch),
            total=len(commit_hashes_in_batch),
            desc=f"Batch {i + 1}/{len(df_batches)}",
        )

        # Re-associate results and handle errors
        with open(ERROR_LOG_PATH, "a", encoding="utf-8") as error_file:
            for record, (diff, error) in zip(batch_records, results_iterator):
                if error:
                    # If an error occurred, write it to the log file
                    error_file.write(
                        f"Commit: {record['commit_hash']}\nError: {error}\n---\n"
                    )

                # The 'diff' field will be empty for both errors and commits with no diff
                record["diff"] = diff if diff is not None else ""
                processed_records.append(record)

    if processed_records:
        try:
            collection.insert_many(processed_records, ordered=False)
            print(
                f"\nBatch {i + 1}: Inserted {len(processed_records)} documents into MongoDB."
            )
        except Exception as e:
            print(f"\nBatch {i + 1}: Error during bulk insert. Error: {e}")

    # Memory cleanup
    del processed_records
    del batch_records
    gc.collect()

print(f"\n--- Process Complete ---")
print(f"Any errors encountered have been saved to '{ERROR_LOG_PATH}'.")

  return bound(*args, **kwds)


Overall Progress:   0%|          | 0/251 [00:00<?, ?it/s]

Batch 1/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 1: Inserted 502 documents into MongoDB.


Batch 2/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 2: Error during bulk insert. Error: BSONObj size: 23535236 (0x1671E84) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687fa49789c8eb871baf732b'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 23535236 (0x1671E84) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687fa49789c8eb871baf732b')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


Batch 3/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 3: Inserted 502 documents into MongoDB.


Batch 4/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 4: Inserted 502 documents into MongoDB.


Batch 5/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 5: Inserted 502 documents into MongoDB.


Batch 6/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 6: Inserted 502 documents into MongoDB.


Batch 7/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 7: Error during bulk insert. Error: BSON document too large (190630005 bytes) - the connected server supports BSON document sizes up to 16777216 bytes.


Batch 8/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 8: Inserted 502 documents into MongoDB.


Batch 9/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 9: Inserted 502 documents into MongoDB.


Batch 10/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 10: Error during bulk insert. Error: BSONObj size: 34280911 (0x20B15CF) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687fa8fa89c8eb871baf82da'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 34280911 (0x20B15CF) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687fa8fa89c8eb871baf82da')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


Batch 11/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 11: Error during bulk insert. Error: BSONObj size: 17305913 (0x1081139) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687fa97e89c8eb871baf842f'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 17305913 (0x1081139) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687fa97e89c8eb871baf842f')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


Batch 12/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 12: Error during bulk insert. Error: BSONObj size: 28338618 (0x1B069BA) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687faa0b89c8eb871baf871f'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 28338618 (0x1B069BA) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687faa0b89c8eb871baf871f')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


Batch 13/251:   0%|          | 0/502 [00:00<?, ?it/s]


Batch 13: Error during bulk insert. Error: BSONObj size: 18824365 (0x11F3CAD) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687faaa189c8eb871baf880b'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 18824365 (0x11F3CAD) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('687faaa189c8eb871baf880b')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


Batch 14/251:   0%|          | 0/502 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
import pandas as pd
import gc

# Read the CSV file in chunks
chunk_size = 100000  # Adjust this value based on your system's memory
chunks = []

try:
    # Create a chunk iterator
    chunk_iterator = pd.read_csv("final_dataset_with_diffs.csv", chunksize=chunk_size)

    # Process each chunk
    for chunk in chunk_iterator:
        # Process the chunk as needed
        chunks.append(chunk)

    # Combine all chunks
    df2 = pd.concat(chunks, ignore_index=True)
    print(f"Successfully loaded {len(df2)} rows")

except Exception as e:
    print(f"Error: {str(e)}")

# Free up memory
del chunks
gc.collect()

Error: 


16

In [15]:
sample = pd.read_csv("final_dataset_with_diffs.csv",nrows=10000)
print(sample.memory_usage(deep=True).sum() / (1024**2), "MB")

6951.470607757568 MB


In [2]:
import pandas as pd
df_test = pd.read_csv("final_dataset_with_diffs.csv", chunksize=1000)