In [None]:
import os
import subprocess
import csv
import pandas as pd
from pydriller import Repository
from tqdm import tqdm

REPO_URL = "https://github.com/3b1b/manim"


def derive_repo_name(url):
    """Return the directory name a plain `git clone <url>` would create.

    Takes the last path component of `url`, ignoring any trailing slashes,
    and drops a single trailing ".git" suffix if present.

    Only a *trailing* ".git" is removed: a blanket str.replace would also
    mangle names that merely contain ".git", e.g. "user.github.io".
    """
    basename = url.rstrip("/").rsplit("/", 1)[-1]
    suffix = ".git"
    if basename.endswith(suffix):
        basename = basename[:-len(suffix)]
    return basename


# Local checkout directory, derived from the URL.
repo_name = derive_repo_name(REPO_URL)
# File name of the bug-fix commit table and the folder it is written to.
output_csv = 'bugfix_commits.csv'
output_folder = 'results'

In [None]:
# Clone the repository unless a local checkout already exists.
if not os.path.exists(repo_name):
    print(f"Cloning {repo_name} from {REPO_URL}...")
    # Pass the target directory explicitly so the checkout is guaranteed to
    # live where the existence check above (and later cells) look for it.
    # check=True raises CalledProcessError if the clone fails, instead of
    # silently continuing without a checkout.
    subprocess.run(["git", "clone", REPO_URL, repo_name], check=True)
else:
    print(f"Repository {repo_name} already exists. Skipping clone.")

📦 Repository manim already exists. Skipping clone.


In [None]:
# Keywords whose presence in a commit message marks it as a bug-fix commit.
bug_keywords = [
    'fix', 'bug', 'patch', 'error', 'issue',
    'defect', 'crash', 'fault', 'flaw', 'glitch',
    'mistake', 'repair', 'resolve', 'solve', 'fail',
    'break', 'broke', 'overflow', 'leak', 'vulnerability',
]


def is_bugfix(msg):
    """Return True if `msg` contains any bug keyword, case-insensitively.

    Matching is by plain substring, so inflected forms ("fixed", "crashes")
    match, but so do unrelated words that embed a keyword ("prefix").
    """
    lowered = msg.lower()
    for keyword in bug_keywords:
        if keyword in lowered:
            return True
    return False

In [None]:
# Columns of the output CSV, in order.
fields = ['commit_hash', 'commit_message', 'parent_hashes', 'is_merge_commit', 'modified_files']

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh run fails with FileNotFoundError.
os.makedirs(output_folder, exist_ok=True)
output_path = f"{output_folder}/{output_csv}"

with open(output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()

    # Materialize the commits up front so tqdm knows the total and can show
    # a percentage / ETA.
    commits_list = list(Repository(repo_name).traverse_commits())
    for commit in tqdm(commits_list, desc="Processing commits"):
        if is_bugfix(commit.msg):
            writer.writerow({
                'commit_hash': commit.hash,
                # Flatten multi-line messages so each commit is one CSV line.
                'commit_message': commit.msg.replace('\n', ' ').replace('\r', ' '),
                'parent_hashes': '; '.join(commit.parents),
                'is_merge_commit': commit.merge,
                # NOTE(review): `filename` appears to be the base name only
                # (entries like "__init__.py; __init__.py" show up in the
                # output) -- confirm whether full paths are wanted here.
                'modified_files': '; '.join([mf.filename for mf in commit.modified_files])
            })

print(f"Bug-fixing commits written to {output_path}")

Processing commits: 100%|██████████| 6344/6344 [02:48<00:00, 37.66it/s] 

Bug-fixing commits written to bugfix_commits.csv





In [8]:
# Reload the exported CSV and report how many bug-fixing commits it holds.
bugfix_csv_path = f"{output_folder}/{output_csv}"
bugfix_commits_df = pd.read_csv(bugfix_csv_path)
print(f"Number of bug-fixing commits found: {len(bugfix_commits_df)}")

# Preview the first 20 rows with the notebook's rich table rendering.
preview_rows = bugfix_commits_df.head(20)
display(preview_rows)

Number of bug-fixing commits found: 1358


Unnamed: 0,commit_hash,commit_message,parent_hashes,is_merge_commit,modified_files
0,014a277a97759bbc0e6ec8fba588bc6e6de65a86,A few fixes to initial point_thickness impleme...,c0994ed0a53f06a555ea8d42903c2038fb54db2a,False,constants.py; displayer.py; mobject.py
1,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,"middle of massive restructure, everything stil...",096c5c1890e326e67ee387c921901f84ca3f3f37,False,__init__.py; __init__.py; animation.py; meta_a...
2,ac930952f151acf284f6e01e98e8f7256f29f9f4,"Beginning transform KA article, many small fix...",d294d0f951f01b307805d18679509c4c9d4b7301,False,animation.py; meta_animations.py; simple_anima...
3,2322018875b218cc156f50d30d865a86d4a68a13,quick rgb-should-be-numpy-array bug fix,c7389e008d3dc79f61e8270ff18758210b7e6a9e,False,mobject.py
4,7ae5a0eccb13713b3439d9c0a0b79bcfc96b05be,"Slightly faster sort_points method, and bug fi...",7f45044bafb27c94ae1b3f900014b3fbdc612e5f,False,mobject.py
5,f21f6619a5150d193d85ed31ef60ed5b986661ca,Fix to Stars Mobject,7ae5a0eccb13713b3439d9c0a0b79bcfc96b05be,False,geometry.py; three_dimensions.py
6,68160140233b8f7ebf00aa2894ea57984c504a90,Bug fix to bug fix on Mobject.fade method,8f8eeea870c2f780cc93950a6de6bd6a168fcfb4,False,mobject.py
7,4f57551344e2655e53c9dcf72ee7d17333605f62,Bug in Arrow buffer,68160140233b8f7ebf00aa2894ea57984c504a90,False,geometry.py
8,5964d5206ac15fc0544325728e232b69aa007e4d,Fixed ShowCreation,84054286824d0ca17219f7d3e11a5f256f098559,False,simple_animations.py
9,9e54a5a6802d12994b007434b98c9f82d7dfd085,"Fixed PiCreature mouth, but more work remains",c0009064d6ee0183f2d36fd15d7442b6efd10f54,False,characters.py
