In [None]:
## Uncomment the next line and run this cell to install the required packages
# !uv pip install pydriller pandas matplotlib tqdm -q

In [None]:
import csv
from pydriller import Repository, Git
import os
import subprocess
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# --- Part (a) & (b): Repository Selection ---
# Remote repositories to analyse. Other cells derive the local checkout
# folder name from the last URL segment.
REPO_URLS = [
    "https://github.com/psf/requests.git",
    "https://github.com/pallets/flask.git",
    "https://github.com/scikit-learn/scikit-learn.git",
    # "https://github.com/ShardulJunagade/miniTorch.git"
]
NUM_REPOS = len(REPO_URLS)

# Where the clones live and where the CSV results are written.
REPO_FOLDER = "repos"
OUTPUT_FOLDER = "results"
OUTPUT_CSV = "diff_analysis.csv"
OUTPUT_CSV_PATH = os.path.join(OUTPUT_FOLDER, OUTPUT_CSV)

# Make sure both working directories exist before any later cell uses them.
os.makedirs(REPO_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [3]:
# Clone each repository into REPO_FOLDER unless a checkout is already present.
for url in REPO_URLS:
    repo_name = url.split("/")[-1].replace(".git", "")
    repo_path = os.path.join(REPO_FOLDER, repo_name)

    if os.path.exists(repo_path):
        print(f"📦 Repository {repo_name} already exists. Skipping clone.")
        continue

    print(f"🔄 Cloning {url}...")
    # check=True makes a failed clone raise CalledProcessError right here,
    # instead of letting the notebook continue and fail later on a missing
    # checkout.
    subprocess.run(["git", "clone", url, repo_path], check=True)


📦 Repository requests already exists. Skipping clone.
📦 Repository flask already exists. Skipping clone.
📦 Repository scikit-learn already exists. Skipping clone.


In [None]:
def analyze_repo(repo_url):
    """Compare Myers and histogram diffs for every modified file of a repository.

    The repository is expected to be already cloned under
    ``REPO_FOLDER/<name>``, where ``<name>`` is the last URL segment with
    ``.git`` removed. For each commit that has a parent, every file that has
    both an old and a new path (i.e. not an addition or a deletion) is diffed
    against the commit's first parent twice: once with the Myers algorithm and
    once with the histogram algorithm. Both runs ignore whitespace changes
    (``-w``) and blank-line-only changes (``--ignore-blank-lines``).

    Args:
        repo_url: Clone URL of the repository; only used to derive the local
            folder name.

    Returns:
        A list of dicts, one per analysed file, with the keys
        ``old_file_path``, ``new_file_path``, ``commit_sha``,
        ``parent_commit_sha``, ``commit_message``, ``diff_myers``,
        ``diff_hist`` and ``Discrepancy`` (``"Yes"`` when the two diff texts
        differ, otherwise ``"No"``).
    """
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    repo_path = os.path.join(REPO_FOLDER, repo_name)

    # Use the Git object for direct command execution
    git_repo = Git(repo_path)
    rows = []

    # Materialised as a list so tqdm can show a total.
    commits_list = list(Repository(repo_path).traverse_commits())
    for commit in tqdm(commits_list, desc=f"Traversing commits in {repo_name}"):
        # A root commit has nothing to diff against.
        if not commit.parents:
            continue
        parent_commit_sha = commit.parents[0]

        for modified_file in commit.modified_files:
            old_path = modified_file.old_path
            new_path = modified_file.new_path

            # Process only file modifications, not new files or deletions
            if not (old_path and new_path):
                continue

            if old_path == new_path:
                path_args = [new_path]
                rename_flags = []
            else:
                # Renamed file: limiting the diff to the new path alone would
                # make git report the whole file as newly added. Pass both
                # paths and request rename detection explicitly so the result
                # does not depend on the user's diff.renames setting.
                path_args = [old_path, new_path]
                rename_flags = ['-M']

            # Flags shared by both runs:
            # -w ignores whitespace
            # --ignore-blank-lines ignores differences in blank lines
            common_flags = [
                '-w',
                '--ignore-blank-lines',
                *rename_flags,
                parent_commit_sha,
                commit.hash,
                '--',
                *path_args,
            ]

            # 1. Myers diff. The algorithm is named explicitly because a
            #    user-level diff.algorithm setting would otherwise silently
            #    replace git's default.
            diff_myers_output = git_repo.repo.git.diff(
                '--diff-algorithm=myers', *common_flags
            )
            # 2. Histogram diff
            diff_hist_output = git_repo.repo.git.diff('--histogram', *common_flags)
            # Compare the two diff texts
            discrepancy = "Yes" if diff_myers_output != diff_hist_output else "No"

            rows.append({
                "old_file_path": old_path,
                "new_file_path": new_path,
                "commit_sha": commit.hash,
                "parent_commit_sha": parent_commit_sha,
                "commit_message": commit.msg.strip(),
                "diff_myers": diff_myers_output,
                "diff_hist": diff_hist_output,
                "Discrepancy": discrepancy
            })
    return rows

In [6]:
# Column order shared by every CSV written below.
fieldnames = [
    "old_file_path", "new_file_path", "commit_sha", "parent_commit_sha",
    "commit_message", "diff_myers", "diff_hist", "Discrepancy"
]


def _write_rows_to_csv(csv_path, rows):
    """Write the row dicts to ``csv_path`` with the shared ``fieldnames`` header."""
    # surrogateescape lets strings that carry surrogate-escaped bytes be
    # written out instead of raising an encoding error.
    with open(csv_path, mode='w', newline='', encoding='utf-8', errors='surrogateescape') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


# Rows of all repositories, accumulated for the combined CSV.
all_data = []

for i, repo_url in enumerate(REPO_URLS, start=1):
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    print(f"\n--- Repository {i}/{NUM_REPOS} : {repo_name} ---")
    rows = analyze_repo(repo_url)
    print(f"📊 Collected {len(rows)} records from this repository")
    all_data.extend(rows)

    # Write a per-repository file right away so a later failure does not
    # lose the work already done for this repository.
    repo_csv_path = os.path.join(OUTPUT_FOLDER, f"{repo_name}_{OUTPUT_CSV}")
    _write_rows_to_csv(repo_csv_path, rows)
    # Report the path that was actually opened.
    print(f"💾 Data from {repo_name} written to {repo_csv_path}")

_write_rows_to_csv(OUTPUT_CSV_PATH, all_data)

print(f"\n✅ Analysis complete. Data saved to {OUTPUT_CSV_PATH}.")


--- Repository 1/3 : requests ---


Traversing commits in requests: 100%|██████████| 6373/6373 [33:48<00:00,  3.14it/s]  


📊 Collected 7019 records from this repository
💾 Data from requests written to results/requests_diff_analysis.csv

--- Repository 2/3 : flask ---


Traversing commits in flask: 100%|██████████| 5463/5463 [33:14<00:00,  2.74it/s]   


📊 Collected 8380 records from this repository
💾 Data from flask written to results/flask_diff_analysis.csv

--- Repository 3/3 : scikit-learn ---


Traversing commits in scikit-learn: 100%|██████████| 32801/32801 [7:16:53<00:00,  1.25it/s]       


📊 Collected 80107 records from this repository
💾 Data from scikit-learn written to results/scikit-learn_diff_analysis.csv

🎉 Analysis complete. Data saved to results\diff_analysis.csv.


In [None]:
# Load the per-repository results for flask. The path is built from the same
# OUTPUT_FOLDER / OUTPUT_CSV constants the writer cell uses, so the two stay
# in sync if the configuration changes.
df = pd.read_csv(
    os.path.join(OUTPUT_FOLDER, f"flask_{OUTPUT_CSV}"),
    encoding='utf-8',
    # Mirror the error handler used when the file was written.
    encoding_errors='surrogateescape',
)
display(df)

Unnamed: 0,old_file_path,new_file_path,commit_sha,parent_commit_sha,commit_message,diff_myers,diff_hist,Discrepancy
0,.gitignore,.gitignore,4ec7d2a0d8eac4f915dc0d38a886cd57045bb0c4,b15ad394279fc3b7f998fa56857f334a7c0156f6,Started working on documentation.\n\nSo far ju...,diff --git a/.gitignore b/.gitignore\nindex 21...,diff --git a/.gitignore b/.gitignore\nindex 21...,No
1,examples\minitwit\minitwit.py,examples\minitwit\minitwit.py,4ec7d2a0d8eac4f915dc0d38a886cd57045bb0c4,b15ad394279fc3b7f998fa56857f334a7c0156f6,Started working on documentation.\n\nSo far ju...,diff --git a/examples/minitwit/minitwit.py b/e...,diff --git a/examples/minitwit/minitwit.py b/e...,No
2,flask.py,flask.py,4ec7d2a0d8eac4f915dc0d38a886cd57045bb0c4,b15ad394279fc3b7f998fa56857f334a7c0156f6,Started working on documentation.\n\nSo far ju...,diff --git a/flask.py b/flask.py\nindex 83d8a8...,diff --git a/flask.py b/flask.py\nindex 83d8a8...,No
3,docs\api.rst,docs\api.rst,3b36bef2e6165bb4dad73d17f23ee1879e99f497,44b42e0fbd93d86e0f4e929bda8e5fb63e81035d,"Improved documentation, added a contextmanager...",diff --git a/docs/api.rst b/docs/api.rst\ninde...,diff --git a/docs/api.rst b/docs/api.rst\ninde...,No
4,docs\conf.py,docs\conf.py,3b36bef2e6165bb4dad73d17f23ee1879e99f497,44b42e0fbd93d86e0f4e929bda8e5fb63e81035d,"Improved documentation, added a contextmanager...",diff --git a/docs/conf.py b/docs/conf.py\ninde...,diff --git a/docs/conf.py b/docs/conf.py\ninde...,No
...,...,...,...,...,...,...,...,...
8375,uv.lock,uv.lock,55c62556571ee46a94da174643b50ece06edead4,d8259eb11900285af9b80b0fa47f841174c054e3,update dev dependencies,diff --git a/uv.lock b/uv.lock\nindex 206c2d53...,diff --git a/uv.lock b/uv.lock\nindex 206c2d53...,Yes
8376,.github\workflows\publish.yaml,.github\workflows\publish.yaml,4dd52ca9c768c9b6d04180f0547d6f4b6e34f211,55c62556571ee46a94da174643b50ece06edead4,Update GitHub Actions workflow for artifact ha...,diff --git a/.github/workflows/publish.yaml b/...,diff --git a/.github/workflows/publish.yaml b/...,No
8377,CHANGES.rst,CHANGES.rst,2c1b30d0503cfb064f1cb252e6614a06915a362a,1292419ddfc6a14fc7f85b5ed7efcc2d215f1ad3,release version 3.1.2,diff --git a/CHANGES.rst b/CHANGES.rst\nindex ...,diff --git a/CHANGES.rst b/CHANGES.rst\nindex ...,No
8378,pyproject.toml,pyproject.toml,2c1b30d0503cfb064f1cb252e6614a06915a362a,1292419ddfc6a14fc7f85b5ed7efcc2d215f1ad3,release version 3.1.2,diff --git a/pyproject.toml b/pyproject.toml\n...,diff --git a/pyproject.toml b/pyproject.toml\n...,No
