In [12]:
import glob, json
from datetime import datetime
import pandas as pd

## Data load

In [13]:
# Collect the "Sources" records from every snapshot PR-sharing JSON file
# into one flat list, then build a single DataFrame from them.
all_pr_rows = []
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of df_all (and anything derived from it) reproducible.
pr_files = sorted(glob.glob("snapshot_*/**/*_pr_sharing*.json", recursive=True))
print(f"Found {len(pr_files)} snapshot PR files")

for file in pr_files:
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (JSON interchange files are UTF-8 per RFC 8259).
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Each file has a "Sources" list
    if "Sources" in data:
        all_pr_rows.extend(data["Sources"])

print(f"Total PR records collected: {len(all_pr_rows)}")

df_all = pd.DataFrame(all_pr_rows)
print(df_all.shape)
df_all.head(3)




Found 9 snapshot PR files
Total PR records collected: 1693
(1693, 19)


Unnamed: 0,Type,URL,Author,RepoName,RepoLanguage,Number,Title,Body,CreatedAt,ClosedAt,MergedAt,UpdatedAt,State,Additions,Deletions,ChangedFiles,CommitsTotalCount,ChatgptSharing,CommitSha
0,pull request,https://github.com/labdao/plex/pull/469,AdamGoyer,labdao/plex,Go,469,add readme for openbabel to PLEX,The Chatgpt Thread used to create this pull re...,2023-07-05T03:30:54Z,,,2023-07-05T03:30:59Z,OPEN,42,0,1,1,[{'URL': 'https://chat.openai.com/share/8bd338...,[a41124a8d7f8d54d68b777bd3781734188d35873]
1,pull request,https://github.com/labdao/plex/pull/468,AdamGoyer,labdao/plex,Go,468,add readme for Gnina #462,Link to the ChatGPT conversation used to creat...,2023-07-05T02:53:12Z,,,2023-07-05T04:20:31Z,OPEN,45,0,1,1,[{'URL': 'https://chat.openai.com/share/2c4b0d...,[8199fe4135efd1500e3dc5c868048aa3cf118cb4]
2,pull request,https://github.com/ActivityWatch/aw-webui/pull...,ErikBjare,ActivityWatch/aw-webui,Vue,455,feat: improved categorization helper to includ...,Came up with this while thinking about the bug...,2023-06-22T15:47:50Z,,,2023-06-22T15:52:32Z,OPEN,78,19,1,1,[{'URL': 'https://chat.openai.com/share/0c7588...,[c93f546f2b9d2b97a6f340d4a1859f2aa80fd0a7]


## Data Exploration
The JSON files are loaded as dictionaries.

In [14]:
# Keep only the pull requests whose repository language is Python.
is_python_repo = df_all["RepoLanguage"].eq("Python")
df_python = df_all.loc[is_python_repo]
print("Total Python PRs:", df_python.shape)

Total Python PRs: (232, 19)


In [15]:
# Keyword heuristic: a PR counts as a bug fix when its title contains any of
# these terms, case-insensitively. Rows with a missing title are excluded.
# NOTE(review): the terms match as substrings (e.g. "fix" also matches
# "prefix") — confirm that this is acceptable for the analysis.
bugfix_patterns = r"fix|bug|error|fail|typo|broken|patch|regression|resolve|correct"
title_mentions_fix = df_python["Title"].str.contains(bugfix_patterns, case=False, na=False)
df_bugfix = df_python.loc[title_mentions_fix]
print("Total Python bug-fix PRs across snapshots:", df_bugfix.shape)

Total Python bug-fix PRs across snapshots: (32, 19)


In [16]:
from datetime import datetime

def parse_time(ts):
    """
    Parse an ISO-8601 timestamp string such as '2023-07-05T03:30:54Z'.

    A trailing 'Z' is rewritten to '+00:00' so that datetime.fromisoformat
    accepts it on Python versions that do not understand the 'Z' suffix.

    Returns a timezone-aware datetime, or None when the value is missing.
    Missing means None as well as NaN/NaT, which is what pandas stores when
    a record lacks the key entirely.
    """
    # `ts is None` is checked first so the common case never reaches pandas.
    if ts is None or pd.isna(ts):
        return None
    return datetime.fromisoformat(ts.replace("Z", "+00:00"))

# assign() returns a new DataFrame, so the parsed timestamp columns are added
# to an independent copy rather than to the filtered frame itself.
df_bugfix = df_bugfix.assign(
    CreatedAt_dt=df_bugfix["CreatedAt"].apply(parse_time),
    MergedAt_dt=df_bugfix["MergedAt"].apply(parse_time),
)

# Hours elapsed between PR creation and merge; rows without a merge
# timestamp end up with a missing value.
df_bugfix["TimeToIntegration_hours"] = (
    df_bugfix["MergedAt_dt"]
    .sub(df_bugfix["CreatedAt_dt"])
    .dt.total_seconds()
    .div(3600)
)

df_bugfix[["RepoName", "Title", "TimeToIntegration_hours", "Additions", "Deletions"]].head(32)


Unnamed: 0,RepoName,Title,TimeToIntegration_hours,Additions,Deletions
24,chitalian/gptask,Fix: recursive/glob support,1.715556,78,35
170,chitalian/gptask,Fix: recursive/glob support,1.715556,78,35
302,comfyanonymous/ComfyUI,make ComfyUI installable via pip install with ...,,3151,444
326,chitalian/gptask,Fix: recursive/glob support,1.715556,78,35
476,comfyanonymous/ComfyUI,make ComfyUI installable via pip install with ...,,3157,450
491,chitalian/gptask,Fix: recursive/glob support,1.715556,78,35
555,gradio-app/gradio,Fixes audio streaming issues,18.679167,33,14
651,comfyanonymous/ComfyUI,make ComfyUI installable via pip install with ...,,3157,450
664,chitalian/gptask,Fix: recursive/glob support,1.715556,78,35
733,gradio-app/gradio,Fixes audio streaming issues,18.679167,33,14


In [17]:
# Columns exported to the bug-fix dataset, in output order.
columns_to_keep = [
    "RepoName",
    "Title",
    "URL",
    "CreatedAt",
    "MergedAt",
    "Additions",
    "Deletions",
    "ChangedFiles",
    "CommitsTotalCount",
    "TimeToIntegration_hours",
    "CommitSha",
    "ChatgptSharing",
]

df_bugfix_temp = df_bugfix.loc[:, columns_to_keep].copy()

# Any column holding list values is converted to its string form before
# de-duplication (presumably because drop_duplicates cannot hash lists).
for column in columns_to_keep:
    holds_lists = df_bugfix_temp[column].map(lambda value: isinstance(value, list))
    if holds_lists.any():
        df_bugfix_temp[column] = df_bugfix_temp[column].astype(str)

# Only rows identical in every kept column are collapsed.
# NOTE(review): the same PR captured in different snapshots with different
# field values (e.g. Additions/Deletions) is kept more than once — confirm
# whether de-duplicating on "URL" is what the analysis needs.
df_bugfix_clean = df_bugfix_temp.drop_duplicates()


In [18]:
# Write the de-duplicated bug-fix PR table to disk, leaving out the
# DataFrame index column, then confirm where it went.
df_bugfix_clean.to_csv(
    path_or_buf="bugfix_pr_dataset_all_snapshots.csv",
    index=False,
)
print("Saved dataset to bugfix_pr_dataset_all_snapshots.csv ")


Saved dataset to bugfix_pr_dataset_all_snapshots.csv 


# Compute three traditional key metrics 

In [19]:
# Extract Code Diff from GitHub (human fix)
import requests

def get_patch(repo_name, pr_number, timeout=30):
    """
    Download the raw patch (diff) of a PR from GitHub.
    e.g. repo_name = 'metaphorsystems/metaphor-python', pr_number = 14

    Parameters:
        repo_name: repository in 'owner/name' form.
        pr_number: pull request number.
        timeout:   seconds to wait for GitHub before giving up. Without a
                   timeout, requests can block indefinitely on a stalled
                   connection.

    Returns the patch text, or None when the request fails, times out, or
    answers with a status other than 200. A message is printed on failure.
    """
    url = f"https://github.com/{repo_name}/pull/{pr_number}.patch"
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status: print and return None, so a batch loop keeps going.
        print(f"Failed to get patch for {repo_name} PR#{pr_number}: {exc}")
        return None
    if r.status_code == 200:
        return r.text
    print(f"Failed to get patch for {repo_name} PR#{pr_number}")
    return None


In [20]:
# Parse Patch → Extract Before & After Snippets
import re

def extract_code_snippets_from_patch(patch_text):
    """
    Extract the removed ("before") and added ("after") lines of a patch.

    For unified diffs the hunk headers ('@@ -a,b +c,d @@') are followed and
    their line counts tell where each hunk ends. Only lines inside a hunk are
    classified, which matters in three cases:
    - a removed line whose content starts with '--' (shown as '---...') and an
      added line starting with '++' are kept instead of being mistaken for
      '---' / '+++' file headers;
    - the mail-style parts of a .patch file (commit message, '---' separator,
      diffstat, trailing '-- ' signature) are never read as code;
    - '\\ No newline at end of file' markers are ignored.

    Text without any hunk header falls back to a plain prefix scan: lines
    starting with '-' are "before", lines starting with '+' are "after", and
    lines starting with '---' or '+++' are skipped.

    Returns: (before_code_str, after_code_str); ("", "") for None or empty
    input, e.g. when the patch download failed.
    """
    if not patch_text:
        return "", ""

    # Split on newlines only: other separators (e.g. form feeds) may occur
    # inside a source line and would throw off the hunk line counts.
    lines = patch_text.replace("\r\n", "\n").split("\n")
    hunk_header = re.compile(r"^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@")
    before_lines, after_lines = [], []

    if not any(hunk_header.match(line) for line in lines):
        # No hunk structure to follow: classify by prefix alone.
        for line in lines:
            if line.startswith(('---', '+++')):
                continue
            if line.startswith('-'):
                before_lines.append(line[1:])
            elif line.startswith('+'):
                after_lines.append(line[1:])
        return "\n".join(before_lines), "\n".join(after_lines)

    # Old-side / new-side lines still expected in the current hunk.
    old_left = new_left = 0
    for line in lines:
        if old_left <= 0 and new_left <= 0:
            # Outside a hunk: everything except the next hunk header is skipped.
            header = hunk_header.match(line)
            if header:
                old_count, new_count = header.groups()
                # An omitted count in a hunk header means exactly one line.
                old_left = 1 if old_count is None else int(old_count)
                new_left = 1 if new_count is None else int(new_count)
            continue
        if line.startswith('-'):
            before_lines.append(line[1:])
            old_left -= 1
        elif line.startswith('+'):
            after_lines.append(line[1:])
            new_left -= 1
        elif line.startswith('\\'):
            # Marker line, not part of either side of the file.
            continue
        else:
            # A context line exists on both sides of the diff.
            old_left -= 1
            new_left -= 1
    return "\n".join(before_lines), "\n".join(after_lines)


In [None]:
# Compute CodeBLEU between Before & After

