In [1]:
# Install dependencies
!pip install pandas numpy matplotlib seaborn scipy wordcloud pyarrow datasets PyGithub python-dotenv --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import re
from collections import Counter
from datetime import datetime
from urllib.parse import urlparse
import time
from github import Github

# Display settings: show all columns, cap cell text at 100 characters,
# and render floats with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# Plot settings: matplotlib style sheet plus the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Environment ready!")

Environment ready!


In [2]:
# Compatibility shim: some versions of fsspec don't expose url_to_fs at top-level.
# This ensures code that expects fsspec.url_to_fs (used by some IO backends) continues to work.
try:
    import fsspec

    if not hasattr(fsspec, "url_to_fs"):
        try:
            from fsspec.core import url_to_fs as _url_to_fs
        except ImportError:
            # Keyword arguments that are meant for opening files rather than for
            # constructing the filesystem; they must not reach fsspec.filesystem().
            _NON_FS_KWARGS = {
                "compression", "encoding", "errors", "expand",
                "mode", "name_function", "newline", "num",
            }

            def _url_to_fs(url, **kwargs):
                """Minimal stand-in for fsspec.core.url_to_fs.

                Returns ``(filesystem, path)`` where ``path`` has the protocol
                prefix removed. Chained URLs (``a::b://...``) are not supported.
                """
                url = str(url)
                protocol = url.split("://", 1)[0] if "://" in url else "file"
                fs_kwargs = {k: v for k, v in kwargs.items() if k not in _NON_FS_KWARGS}
                fs = fsspec.filesystem(protocol, **fs_kwargs)
                # Callers expect the path without the "protocol://" prefix.
                return fs, fs._strip_protocol(url)

        fsspec.url_to_fs = _url_to_fs
except Exception as shim_error:
    # Best effort only: never fail here; subsequent IO calls will raise their own errors.
    # Report the problem instead of hiding it so a broken install is easy to diagnose.
    print(f"fsspec compatibility shim not applied: {shim_error!r}")

In [3]:
import os
from dotenv import load_dotenv
from github import Auth

# Read GITHUB_TOKEN from a local .env file (if present) or from the process environment.
load_dotenv()
GITHUB_API_TOKEN = os.environ.get("GITHUB_TOKEN")

if GITHUB_API_TOKEN:
    # Passing the token positionally is deprecated in PyGithub; use an Auth object instead.
    gh = Github(auth=Auth.Token(GITHUB_API_TOKEN))
else:
    # Unauthenticated clients still work but are subject to a much lower rate limit.
    print("WARNING: GITHUB_TOKEN is not set; using an unauthenticated GitHub client.")
    gh = Github()

  gh = Github(GITHUB_API_TOKEN)


In [4]:
def extract_owner_repo(repo_url: str, html_url: str):
    """Derive ``(owner, repo)`` from a GitHub API URL or web URL.

    ``repo_url`` is tried first, then ``html_url``; the first URL that yields
    both parts wins. Two layouts are recognised:

    * API:  ``.../repos/OWNER/REPO[/...]``
    * Web:  ``/OWNER/REPO[/pull/123...]``

    A URL is treated as an API URL when its host starts with ``api.`` or its
    path starts with ``repos`` or ``api``. An API URL that is too short to
    contain both parts is skipped instead of being misread as a web URL.

    Args:
        repo_url: API-style repository URL; non-string values are ignored.
        html_url: Web URL of the repository or pull request; non-string
            values are ignored.

    Returns:
        ``(owner, repo)`` as strings, or ``(None, None)`` when neither URL
        contains both parts.
    """
    for url in (repo_url, html_url):
        if not isinstance(url, str):
            continue
        try:
            parsed = urlparse(url)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): try the next candidate.
            continue

        # Drop empty segments so leading/trailing/double slashes never become a name.
        segments = [segment for segment in parsed.path.split("/") if segment]
        if not segments:
            continue

        is_api_url = host.startswith("api.") or segments[0] in ("repos", "api")

        if is_api_url:
            # API: /repos/OWNER/REPO
            if "repos" in segments:
                idx = segments.index("repos")
                if len(segments) >= idx + 3:
                    return segments[idx + 1], segments[idx + 2]
            # Truncated API URL: do not fall through to the web layout.
            continue

        # Web: /OWNER/REPO/pull/123
        if len(segments) >= 2:
            return segments[0], segments[1]

    return None, None

In [5]:
# Load datasets
print("Loading AIDev datasets...")

# AIDev pull-request table and the per-PR task-type labels
ai_pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
ai_pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_task_type.parquet")

# Attach the task type to every PR, then keep only the ones labelled 'perf'.
task_type_labels = ai_pr_task_type_df[["id", "type"]]
perf_ai = ai_pr_df.merge(task_type_labels, on="id").query("type == 'perf'").copy()


def _owner_repo_columns(pr_row):
    """Return the (owner, repo) pair of one PR row as a two-element Series."""
    owner_and_repo = extract_owner_repo(pr_row["repo_url"], pr_row["html_url"])
    return pd.Series(owner_and_repo)


perf_ai[["repo_owner", "repo_name"]] = perf_ai.apply(_owner_repo_columns, axis=1)

print(f"len = {len(perf_ai)}")

# Rows for which no owner or no repository name could be derived.
owner_missing = perf_ai["repo_owner"].isna()
name_missing = perf_ai["repo_name"].isna()
missing_repos = perf_ai[owner_missing | name_missing]
print(f"Missing repos: {len(missing_repos)}")


Loading AIDev datasets...
len = 340
Missing repos: 0


In [6]:
# Preview the first rows of the perf-only PR table, including the derived
# repo_owner / repo_name columns.
perf_ai.head()

Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,merged_at,repo_id,repo_url,html_url,type,repo_owner,repo_name
102,3200679276,4304,Implement lazy loading for RegistryInstance to improve latency in operations where the registry ...,ðŸ‘¨ \r\n\r\nBefore:\r\n\r\n```\r\njulia> @time Pkg.instantiate()\r\n 0.390297 seconds (1.95 M all...,Claude_Code,1282691,KristofferC,closed,2025-07-03T21:18:03Z,2025-07-04T08:34:04Z,2025-07-04T08:34:04Z,82341193,https://api.github.com/repos/JuliaLang/Pkg.jl,https://github.com/JuliaLang/Pkg.jl/pull/4304,perf,JuliaLang,Pkg.jl
114,3250477735,397,Optimize nancorrmatrix and nancovmatrix for cache locality,Refactor `nancorrmatrix` and `nancovmatrix` to process observations sequentially. This improves ...,Claude_Code,5635139,max-sixty,closed,2025-07-22T00:19:55Z,2025-07-22T00:28:17Z,2025-07-22T00:28:17Z,25501620,https://api.github.com/repos/numbagg/numbagg,https://github.com/numbagg/numbagg/pull/397,perf,numbagg,numbagg
249,3254647682,59071,skip unnecessary alias-check in collect(::AbstractArray) from copyto\!,"As discussed on Slack with @MasonProtter & @jakobnissen, `collect` currently does a usually chea...",Claude_Code,1814174,ChrisRackauckas,closed,2025-07-23T02:52:46Z,2025-07-23T23:55:54Z,,1644196,https://api.github.com/repos/JuliaLang/julia,https://github.com/JuliaLang/julia/pull/59071,perf,JuliaLang,julia
526,3151002300,6671,Use async file system APIs instead of sync APIs in install.ts,Replaces synchronous file system API calls with asynchronous equivalents in `vscode/npm-package/...,Copilot,198982749,Copilot,closed,2025-06-16T19:04:49Z,2025-06-17T20:07:53Z,2025-06-17T20:07:52Z,323665366,https://api.github.com/repos/microsoft/kiota,https://github.com/microsoft/kiota/pull/6671,perf,microsoft,kiota
534,3151370964,12025,Add fast-paths for ToolLocationHelper property functions,This PR adds fast-path implementations for two commonly used ToolLocationHelper property functio...,Copilot,198982749,Copilot,open,2025-06-16T21:24:28Z,,,32051890,https://api.github.com/repos/dotnet/msbuild,https://github.com/dotnet/msbuild/pull/12025,perf,dotnet,msbuild


In [7]:
# Accumulators for the five output tables; one dict per row.
rows_commits = []
rows_details = []
rows_runs = []
rows_issue_comments = []
rows_review_comments = []
skipped_count = 0

# Pause after each processed PR to stay well within the GitHub API rate limits.
PR_FETCH_PAUSE_SECONDS = 0.7

for _row_label, row in perf_ai.iterrows():
    pr_id = int(row["id"])
    owner = row["repo_owner"]
    repo_name = row["repo_name"]
    number = int(row["number"])

    full_repo = f"{owner}/{repo_name}"
    print(f"\nâž¡ Fetching commits, pipelines & comments for {full_repo} PR #{number} (dataset id={pr_id})")

    if pd.isna(owner) or pd.isna(repo_name):
        print("   Skipping due to missing owner/repo")
        skipped_count += 1
        continue

    # Rows for this PR are buffered locally and only published once the whole
    # PR has been fetched, so a PR counted as skipped never leaves partial
    # commit/detail rows behind.
    pr_issue_comment_rows = []
    pr_review_comment_rows = []
    pr_commit_rows = []
    pr_detail_rows = []
    pr_run_rows = []

    try:
        repo = gh.get_repo(full_repo)
        pr = repo.get_pull(number)

        pr_title = pr.title
        pr_description = pr.body
        pr_comments_count = pr.comments

        # ===================== ISSUE COMMENTS  =====================
        # Best effort: a failure here does not discard the rest of the PR.
        try:
            for issue_comment in pr.get_issue_comments():
                commenter = issue_comment.user
                pr_issue_comment_rows.append({
                    "pr_id": pr_id,
                    "pr_number": number,
                    "comment_id": issue_comment.id,
                    "user_login": commenter.login if commenter else None,
                    "user_type": commenter.type if commenter else None,
                    "body": issue_comment.body,
                    "created_at": issue_comment.created_at,
                    "updated_at": issue_comment.updated_at,
                    "url": issue_comment.html_url,
                })
        except Exception as e_ic:
            print(f"   Error fetching issue comments for PR #{number}: {e_ic}")

        # ===================== REVIEW COMMENTS =====================
        # Best effort, same as issue comments.
        try:
            for rc in pr.get_review_comments():
                reviewer = rc.user
                pr_review_comment_rows.append({
                    "pr_id": pr_id,
                    "pr_number": number,
                    "comment_id": rc.id,
                    "user_login": reviewer.login if reviewer else None,
                    "user_type": reviewer.type if reviewer else None,
                    "body": rc.body,
                    "created_at": rc.created_at,
                    "updated_at": rc.updated_at,
                    "path": rc.path,
                    "position": rc.position,
                    "original_position": rc.original_position,
                    "commit_id": rc.commit_id,
                    "original_commit_id": rc.original_commit_id,
                    "in_reply_to_id": getattr(rc, "in_reply_to_id", None),
                    "diff_hunk": rc.diff_hunk,
                    "url": rc.html_url,
                })
        except Exception as e_rc:
            print(f"   Error fetching review comments for PR #{number}: {e_rc}")

        # ===================== COMMITS =====================
        for pr_commit in pr.get_commits():
            sha = pr_commit.sha
            commit_obj = pr_commit.commit

            author_name = None
            committer_name = None
            commit_message = None

            if commit_obj is not None:
                if commit_obj.author is not None:
                    author_name = commit_obj.author.name
                if commit_obj.committer is not None:
                    committer_name = commit_obj.committer.name
                commit_message = commit_obj.message

            # Named `commit_stats` (not `stats`) so the `scipy.stats` module
            # imported at the top of the notebook is not overwritten.
            commit_stats = pr_commit.stats
            commit_stats_additions = getattr(commit_stats, "additions", None)
            commit_stats_deletions = getattr(commit_stats, "deletions", None)
            commit_stats_total = getattr(commit_stats, "total", None)

            # ---- table commits ----
            pr_commit_rows.append({
                "sha": sha,
                "pr_id": pr_id,
                "pr_number": number,
                "repo_owner": owner,
                "repo_name": repo_name,
                "author": author_name,
                "committer": committer_name,
                "commit_message": commit_message,
                "pr_title": pr_title,
                "pr_description": pr_description,
                "pr_comments_count": pr_comments_count,
            })

            # ---- table pr_commit_details ----
            for changed_file in pr_commit.files:
                pr_detail_rows.append({
                    "sha": sha,
                    "pr_id": pr_id,
                    "pr_number": number,
                    "commit_stats_total": commit_stats_total,
                    "commit_stats_additions": commit_stats_additions,
                    "commit_stats_deletions": commit_stats_deletions,
                    "filename": changed_file.filename,
                    "status": changed_file.status,
                    "additions": changed_file.additions,
                    "deletions": changed_file.deletions,
                    "changes": changed_file.changes,
                    "patch": changed_file.patch,
                })

        # ===================== PIPELINES / WORKFLOW RUNS =====================
        head_sha = pr.head.sha
        head_branch = pr.head.ref

        for run in repo.get_workflow_runs(branch=head_branch, event="pull_request"):
            # Only keep runs that were triggered for the PR's current head commit.
            if run.head_sha != head_sha:
                continue

            pr_run_rows.append({
                "run_id": run.id,
                "pr_id": pr_id,
                "pr_number": number,
                "workflow_id": run.workflow_id,
                "workflow_name": getattr(run, "name", None),
                "head_branch": run.head_branch,
                "head_sha": run.head_sha,
                "event": run.event,
                "status": run.status,
                "conclusion": run.conclusion,
                "created_at": run.created_at,
                "updated_at": run.updated_at,
                "run_attempt": getattr(run, "run_attempt", None),
                "url": run.html_url,
            })

        # Everything for this PR was fetched: publish the buffered rows.
        rows_issue_comments.extend(pr_issue_comment_rows)
        rows_review_comments.extend(pr_review_comment_rows)
        rows_commits.extend(pr_commit_rows)
        rows_details.extend(pr_detail_rows)
        rows_runs.extend(pr_run_rows)

    except Exception as e:
        print(f"   Error fetching PR #{number} from {full_repo}: {e}")
        skipped_count += 1

    finally:
        # Pause after failures too, so a run of errors does not hammer the API.
        time.sleep(PR_FETCH_PAUSE_SECONDS)

# ===================== DATAFRAMES =====================
# Turn each list of row dicts into its own table.
df_commits, df_details, df_runs, df_issue_comments, df_review_comments = (
    pd.DataFrame(records)
    for records in (
        rows_commits,
        rows_details,
        rows_runs,
        rows_issue_comments,
        rows_review_comments,
    )
)

# Report the raw (unfiltered) size of every table.
for count_label, table in (
    ("\nTotal commit rows (ai_pr_commits):", df_commits),
    ("Total detail rows (ai_pr_commit_details):", df_details),
    ("Total run rows (ai_pr_workflow_runs):", df_runs),
    ("Total issue comment rows (ai_pr_issue_comments):", df_issue_comments),
    ("Total review comment rows (ai_pr_review_comments):", df_review_comments),
):
    print(count_label, len(table))

# ===================== FILTERS =====================
print("\nApplying filters to commit data...")


def _restrict_to_prs(frame, pr_ids):
    """Return a copy of ``frame`` limited to rows whose ``pr_id`` is in ``pr_ids``.

    A frame built from an empty row list has no columns at all; it is returned
    unchanged (as a copy) instead of raising ``KeyError`` on ``frame['pr_id']``.
    """
    if "pr_id" not in frame.columns:
        return frame.copy()
    return frame[frame["pr_id"].isin(pr_ids)].copy()


if len(df_details) == 0:
    print("  No commit detail rows; skipping filters.")
    filtered_pr_ids = set(df_commits.get('pr_id', pd.Series(dtype=int)).unique())
else:
    print(f"  Starting detail rows: {len(df_details):,}")

    # 1) Remove null filenames
    before_filename = len(df_details)
    df_details = df_details[df_details['filename'].notna()].copy()
    print(f"  Removed null filenames: {before_filename - len(df_details):,}")

    # 2) Remove config/metadata files. Patterns are matched case-insensitively
    #    against the file path; '^'-anchored ones only match at the start of it.
    config_patterns = [
        r'^\.mvn/',
        r'^\.gradle/',
        r'^\.idea/',
        r'^\.vscode/',
        r'^\.github/workflows/',
        r'\.properties$',
        r'\.xml$',
        r'\.yml$',
        r'\.yaml$',
        r'\.json$',
        r'\.txt$',
        r'\.gitignore$',
        r'\.dockerignore$',
        r'/Dockerfile$',
        r'^Dockerfile$',
        r'/docker-compose',
        r'^docker-compose',
        r'\.lock$',
        r'^LICENSE',
        r'^README',
    ]
    config_pattern = '|'.join(config_patterns)
    before_config = len(df_details)
    # Use a standalone mask instead of writing a temporary column into df_details.
    is_config_file = df_details['filename'].str.contains(
        config_pattern, case=False, na=False, regex=True
    )
    df_details = df_details[~is_config_file].copy()
    print(f"  Removed config/metadata files: {before_config - len(df_details):,}")

    # 3) Remove merge commits, identified by the start of the commit message.
    merge_patterns = [
        r'^Merge\s+branch',
        r'^Merge\s+pull\s+request',
        r'^Merge\s+remote-tracking\s+branch',
        r'^Merge\s+.*\s+into\s+',
        r"^Merged\s+in\s+",
    ]
    merge_pattern = '|'.join(merge_patterns)
    merge_shas = set(
        df_commits[
            df_commits['commit_message'].str.match(merge_pattern, case=False, na=False)
        ]['sha'].tolist()
    )
    if merge_shas:
        df_commits = df_commits[~df_commits['sha'].isin(merge_shas)].copy()
        df_details = df_details[~df_details['sha'].isin(merge_shas)].copy()
    print(f"  Removed merge commits: {len(merge_shas)} commit(s)")

    filtered_pr_ids = set(df_details['pr_id'].unique())
    print(f"  PRs remaining after filters: {len(filtered_pr_ids):,}")

# Keep only PRs that still have code files after filtering
df_commits = _restrict_to_prs(df_commits, filtered_pr_ids)
df_runs = _restrict_to_prs(df_runs, filtered_pr_ids)
df_issue_comments = _restrict_to_prs(df_issue_comments, filtered_pr_ids)
df_review_comments = _restrict_to_prs(df_review_comments, filtered_pr_ids)

print("\nTotals after filtering:")
print("  Commit rows (ai_pr_commits):", len(df_commits))
print("  Detail rows (ai_pr_commit_details):", len(df_details))
print("  Run rows (ai_pr_workflow_runs):", len(df_runs))
print("  Issue comment rows (ai_pr_issue_comments):", len(df_issue_comments))
print("  Review comment rows (ai_pr_review_comments):", len(df_review_comments))
print(f"  Unique PRs retained: {len(filtered_pr_ids):,}")


âž¡ Fetching commits, pipelines & comments for JuliaLang/Pkg.jl PR #4304 (dataset id=3200679276)

âž¡ Fetching commits, pipelines & comments for numbagg/numbagg PR #397 (dataset id=3250477735)

âž¡ Fetching commits, pipelines & comments for JuliaLang/julia PR #59071 (dataset id=3254647682)

âž¡ Fetching commits, pipelines & comments for microsoft/kiota PR #6671 (dataset id=3151002300)

âž¡ Fetching commits, pipelines & comments for dotnet/msbuild PR #12025 (dataset id=3151370964)

âž¡ Fetching commits, pipelines & comments for dotnet/aspnetcore PR #62056 (dataset id=3081695764)

âž¡ Fetching commits, pipelines & comments for ant-design/ant-design PR #54325 (dataset id=3210885983)

âž¡ Fetching commits, pipelines & comments for nearai/nearai PR #1179 (dataset id=3122455352)

âž¡ Fetching commits, pipelines & comments for microsoft/testfx PR #6060 (dataset id=3235179464)

âž¡ Fetching commits, pipelines & comments for robertpenner/as3-signals PR #74 (dataset id=3074606452)

âž¡ Fetching

Following Github server redirection from /repos/unibeck/solstatus to /repositories/969089225



âž¡ Fetching commits, pipelines & comments for unibeck/solstatus PR #55 (dataset id=3075349977)

âž¡ Fetching commits, pipelines & comments for microsoft/testfx PR #6162 (dataset id=3262412016)

âž¡ Fetching commits, pipelines & comments for cschleiden/go-workflows PR #388 (dataset id=3220396620)

âž¡ Fetching commits, pipelines & comments for primer/react PR #6197 (dataset id=3137892942)

âž¡ Fetching commits, pipelines & comments for mlflow/mlflow PR #16039 (dataset id=3113006799)

âž¡ Fetching commits, pipelines & comments for celestiaorg/rsmt2d PR #361 (dataset id=3113051088)

âž¡ Fetching commits, pipelines & comments for tomhrr/cosh PR #181 (dataset id=3158727370)

âž¡ Fetching commits, pipelines & comments for microsoft/HydraLab PR #694 (dataset id=3096236895)

âž¡ Fetching commits, pipelines & comments for microsoft/HydraLab PR #695 (dataset id=3096249565)

âž¡ Fetching commits, pipelines & comments for Krande/adapy PR #146 (dataset id=3189634328)

âž¡ Fetching commits, pipeli

Following Github server redirection from /repos/stanford-crfm/levanter to /repositories/496005961



âž¡ Fetching commits, pipelines & comments for stanford-crfm/levanter PR #1066 (dataset id=3234031765)

âž¡ Fetching commits, pipelines & comments for Doriandarko/make-it-heavy PR #9 (dataset id=3240593081)

âž¡ Fetching commits, pipelines & comments for moonbitlang/core PR #2253 (dataset id=3137138306)

âž¡ Fetching commits, pipelines & comments for MihaiCristianCondrea/Smart-Cleaner-for-Android PR #125 (dataset id=3213528854)

âž¡ Fetching commits, pipelines & comments for ryokun6/ryos PR #50 (dataset id=3073532077)

âž¡ Fetching commits, pipelines & comments for ryokun6/ryos PR #57 (dataset id=3074924091)

âž¡ Fetching commits, pipelines & comments for prebid/Prebid.js PR #13460 (dataset id=3189294728)

âž¡ Fetching commits, pipelines & comments for moonbitlang/core PR #2267 (dataset id=3142771614)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #7897 (dataset id=3224827777)

âž¡ Fetching commits, pipelines & comments for gacela-project/gacela PR #326 (dataset id

Following Github server redirection from /repos/stanford-crfm/levanter to /repositories/496005961



âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #484 (dataset id=3142986664)

âž¡ Fetching commits, pipelines & comments for jscarle/LightResults PR #74 (dataset id=3213850102)

âž¡ Fetching commits, pipelines & comments for jscarle/LightResults PR #75 (dataset id=3213857892)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #12900 (dataset id=3261822593)

âž¡ Fetching commits, pipelines & comments for phellipeandrade/rbac PR #43 (dataset id=3128644658)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #7004 (dataset id=3216964251)

âž¡ Fetching commits, pipelines & comments for wieslawsoltes/Xaml.Behaviors PR #82 (dataset id=3071083444)

âž¡ Fetching commits, pipelines & comments for prebid/Prebid.js PR #13334 (dataset id=3130957636)

âž¡ Fetching commits, pipelines & comments for marin-community/marin PR #1429 (dataset id=3200979351)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #11961 (dataset id=3252596861)



Following Github server redirection from /repos/stanford-crfm/levanter to /repositories/496005961



âž¡ Fetching commits, pipelines & comments for stanford-crfm/levanter PR #1102 (dataset id=3264767865)

âž¡ Fetching commits, pipelines & comments for Wtrwx/DYYY PR #250 (dataset id=3196281528)

âž¡ Fetching commits, pipelines & comments for moonbitlang/core PR #2266 (dataset id=3142406085)

âž¡ Fetching commits, pipelines & comments for parse-community/parse-dashboard PR #2920 (dataset id=3239561220)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #3893 (dataset id=3184544966)

âž¡ Fetching commits, pipelines & comments for OpenHFT/Chronicle-Core PR #813 (dataset id=3106780046)

âž¡ Fetching commits, pipelines & comments for OpenHFT/Chronicle-Core PR #814 (dataset id=3106804055)

âž¡ Fetching commits, pipelines & comments for MontrealAI/AGI-Alpha-Agent-v0 PR #1377 (dataset id=3107735616)

âž¡ Fetching commits, pipelines & comments for static-frame/static-frame PR #1069 (dataset id=3115186500)

âž¡ Fetching commits, pipelines & comments for MontrealAI/AGI-Alpha-Agen

Following Github server redirection from /repos/stanford-crfm/levanter to /repositories/496005961



âž¡ Fetching commits, pipelines & comments for stanford-crfm/levanter PR #1065 (dataset id=3233988388)

âž¡ Fetching commits, pipelines & comments for jdereg/java-util PR #194 (dataset id=3147149820)

âž¡ Fetching commits, pipelines & comments for Rello/audioplayer PR #611 (dataset id=3225788754)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #3266 (dataset id=3176300978)

âž¡ Fetching commits, pipelines & comments for swhan0329/vehicle_speed_estimation PR #7 (dataset id=3117019425)

âž¡ Fetching commits, pipelines & comments for ltwlf/json-diff-ts PR #301 (dataset id=3206861578)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #9165 (dataset id=3238396793)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #9213 (dataset id=3238582253)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #9244 (dataset id=3238674493)

âž¡ Fetching commits, pipelines & comments for mochilang/mochi PR #9252 (dataset id=3238720815)

âž¡

Following Github server redirection from /repos/buger/probe to /repositories/943383028



âž¡ Fetching commits, pipelines & comments for buger/probe PR #56 (dataset id=3235100943)

âž¡ Fetching commits, pipelines & comments for buffdb/buffdb PR #26 (dataset id=3250089415)

âž¡ Fetching commits, pipelines & comments for Yamashou/gqlgenc PR #289 (dataset id=3148602658)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #21220 (dataset id=3053649404)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #20676 (dataset id=2991070962)

âž¡ Fetching commits, pipelines & comments for web-infra-dev/rspack PR #10677 (dataset id=3147340923)

âž¡ Fetching commits, pipelines & comments for sikanhe/gqtx PR #79 (dataset id=3148127134)

âž¡ Fetching commits, pipelines & comments for liam-hq/liam PR #1985 (dataset id=3136694740)

âž¡ Fetching commits, pipelines & comments for jodendaal/OpenAI.Net PR #89 (dataset id=3137786825)


Following Github server redirection from /repos/appdotbuild/agent to /repositories/913914262



âž¡ Fetching commits, pipelines & comments for appdotbuild/agent PR #185 (dataset id=3052357500)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #21217 (dataset id=3053325093)

âž¡ Fetching commits, pipelines & comments for liam-hq/liam PR #2102 (dataset id=3161908700)

âž¡ Fetching commits, pipelines & comments for ateliee/jquery.schedule PR #58 (dataset id=3161909204)

âž¡ Fetching commits, pipelines & comments for sourcebot-dev/sourcebot PR #357 (dataset id=3155001680)

âž¡ Fetching commits, pipelines & comments for vercel/vercel PR #13284 (dataset id=3034997303)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #21162 (dataset id=3046430027)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #21418 (dataset id=3077061912)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #21370 (dataset id=3070150168)

âž¡ Fetching commits, pipelines & comments for calcom/cal.com PR #21371 (dataset id=3070165463)

âž¡ Fetching commit

In [9]:
# Drop every PR whose remaining changed files are all markdown (no code changes).
print("\nChecking for PRs with markdown-only changes...")


def _all_markdown(flags):
    """True when the group is non-empty and every flag in it is set."""
    return bool(len(flags)) and flags.all()


if len(df_details) != 0:
    # Per-row flag: does the file name end in a markdown extension (case-insensitive)?
    is_markdown = df_details["filename"].str.lower().str.endswith((".md", ".markdown"))

    # Per-PR flag: were all of the PR's files markdown?
    markdown_only_by_pr = is_markdown.groupby(df_details["pr_id"]).agg(_all_markdown)
    md_only_pr_ids = set(markdown_only_by_pr[markdown_only_by_pr].index)

    print(f"  PRs with only markdown files: {len(md_only_pr_ids):,}")
    if md_only_pr_ids:
        def _without_markdown_only(table):
            """Copy of ``table`` without the rows of markdown-only PRs."""
            return table[~table["pr_id"].isin(md_only_pr_ids)].copy()

        df_details = _without_markdown_only(df_details)
        df_commits = _without_markdown_only(df_commits)
        df_runs = _without_markdown_only(df_runs)
        df_issue_comments = _without_markdown_only(df_issue_comments)
        df_review_comments = _without_markdown_only(df_review_comments)

    filtered_pr_ids = set(df_details["pr_id"].unique())

    print(f"  Unique PRs retained after markdown-only filter: {len(filtered_pr_ids):,}")
    print("\nTotals after markdown-only filter:")
    for count_label, table in (
        ("  Commit rows (ai_pr_commits):", df_commits),
        ("  Detail rows (ai_pr_commit_details):", df_details),
        ("  Run rows (ai_pr_workflow_runs):", df_runs),
        ("  Issue comment rows (ai_pr_issue_comments):", df_issue_comments),
        ("  Review comment rows (ai_pr_review_comments):", df_review_comments),
    ):
        print(count_label, len(table))
else:
    print("  No commit detail rows; skipping markdown-only filter.")
    print(f"  Unique PRs retained (unchanged): {len(filtered_pr_ids):,}")



Checking for PRs with markdown-only changes...
  PRs with only markdown files: 3
  Unique PRs retained after markdown-only filter: 324

Totals after markdown-only filter:
  Commit rows (ai_pr_commits): 758
  Detail rows (ai_pr_commit_details): 4155
  Run rows (ai_pr_workflow_runs): 457
  Issue comment rows (ai_pr_issue_comments): 427
  Review comment rows (ai_pr_review_comments): 265


In [11]:
# ===================== SAVE TO PARQUET =====================
# (table, destination path, confirmation message) for each output file.
parquet_outputs = (
    (df_commits, "./ai_pr_commits.parquet", "\nSaved: ai_pr_commits.parquet"),
    (df_details, "./ai_pr_commit_details.parquet", "Saved: ai_pr_commit_details.parquet"),
    (df_runs, "./ai_pr_workflow_runs.parquet", "Saved: ai_pr_workflow_runs.parquet"),
    (df_issue_comments, "./ai_pr_issue_comments.parquet", "Saved: ai_pr_issue_comments.parquet"),
    (df_review_comments, "./ai_pr_review_comments.parquet", "Saved: ai_pr_review_comments.parquet"),
)

# Write every table first; confirmations are printed only after all writes succeeded.
for table, destination, _message in parquet_outputs:
    table.to_parquet(destination, index=False)

for _table, _destination, message in parquet_outputs:
    print(message)

print(f"\nSkipped {skipped_count} PRs due to errors or missing data.")


Saved: ai_pr_commits.parquet
Saved: ai_pr_commit_details.parquet
Saved: ai_pr_workflow_runs.parquet
Saved: ai_pr_issue_comments.parquet
Saved: ai_pr_review_comments.parquet

Skipped 2 PRs due to errors or missing data.
