In [38]:
!pip install openai google-genai pandas python-dotenv
import json
from pathlib import Path
from typing import Dict, Iterable, List, Optional
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from openai import OpenAI
from google import genai
from google.genai import types
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv()) 




True

In [39]:

# Allowed enum values shared by both provider schemas.
_EVIDENCE_SOURCE_VALUES = ["pipeline", "description", "comments"]
_VALIDATION_TYPE_VALUES = [
    "benchmark",
    "profiling",
    "static-analysis",
    "anecdotal",
    "load/canary",
    "none",
]


def _build_validation_schema(strict: bool = False) -> dict:
    """Build the JSON schema describing one validation-evidence record.

    Both provider schemas are generated from this single definition so that
    they cannot drift apart. Every call returns freshly built dicts/lists, so
    the two module-level schemas never share nested objects.

    Args:
        strict: When True, add ``"additionalProperties": False`` (the variant
            that ``run_llm`` sends to OpenAI together with ``"strict": True``).

    Returns:
        A JSON-schema dict in which every property is listed as required.
    """
    properties = {
        "validation_present": {"type": "boolean"},
        "evidence_sources": {
            "type": "array",
            "items": {"type": "string", "enum": list(_EVIDENCE_SOURCE_VALUES)},
        },
        "validation_type": {"type": "string", "enum": list(_VALIDATION_TYPE_VALUES)},
        "validation_description": {"type": "string"},
        "pipeline_signal": {"type": "string"},
        "description_signal": {"type": "string"},
        "comment_signal": {"type": "string"},
    }
    built = {
        "type": "object",
        "properties": properties,
        # Every property is mandatory; keep the declaration order.
        "required": list(properties),
    }
    if strict:
        built["additionalProperties"] = False
    return built


# Schema passed to Gemini as ``response_schema``.
schema = _build_validation_schema()

# Same schema plus ``additionalProperties: False`` for OpenAI structured output.
schema_openai = _build_validation_schema(strict=True)

In [48]:

def run_llm(prompt: str, user_prompt: str, type: str = "openai", model: str = "gpt-5.1-2025-11-13") -> str:
    """Send one classification request to OpenAI or Gemini and return the raw reply text.

    Args:
        prompt: Instruction text. Sent as the "developer" message to OpenAI and
            as the first content item to Gemini.
        user_prompt: The PR-specific text to classify.
        type: Provider selector, either "openai" or "gemini". (The name shadows
            the builtin but is kept because callers pass it by keyword.)
        model: Kept for interface compatibility. Each provider branch pins its
            own model name, so the value passed here is overridden.

    Returns:
        The provider's text response (expected to be JSON matching the module
        schemas), or "" when the provider returned no text.

    Raises:
        RuntimeError: If ``type`` is "gemini" and no Gemini API key is set in
            the environment.
        ValueError: If ``type`` is neither "openai" nor "gemini".
    """
    if type == "openai":
        model = "gpt-5.1-2025-11-13"
        print("Running OPENAI, model:", model)

        # Build the client only on this branch so Gemini-only runs do not
        # require OPENAI_API_KEY to be configured.
        llm_client = OpenAI(base_url=None, api_key=os.environ.get("OPENAI_API_KEY"))

        messages = [
            {"role": "developer", "content": prompt},
            {"role": "user", "content": user_prompt},
        ]

        r = llm_client.chat.completions.create(
            model=model,
            temperature=0.0,
            messages=messages,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "validation_schema",
                    "schema": schema_openai,
                    "strict": True,
                },
            },
        )

        # ``content`` can be None (e.g. on a refusal); normalise to "".
        content = r.choices[0].message.content
        return (content or "").strip()

    if type == "gemini":
        model = "models/gemini-pro-latest"
        print("Running GEMINI, model:", model)

        # Never hardcode credentials in the notebook: read the key from the
        # environment (load_dotenv() at the top also picks it up from .env).
        gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        if not gemini_api_key:
            raise RuntimeError(
                "Gemini API key not configured: set GEMINI_API_KEY (or GOOGLE_API_KEY) "
                "in the environment or in the .env file."
            )
        client = genai.Client(api_key=gemini_api_key)

        config = types.GenerateContentConfig(
            temperature=0.0,
            response_mime_type="application/json",
            response_schema=schema,
            # thinking_config=types.ThinkingConfig(
            #     thinking_level=types.ThinkingLevel.HIGH
            # )
        )

        messages = [
            prompt,
            user_prompt,
        ]

        response = client.models.generate_content(
            model=model,
            contents=messages,
            config=config,
        )
        # ``response.text`` may be None when no text part was produced.
        return response.text or ""

    # Previously an unknown provider silently returned None.
    raise ValueError(f"Unsupported LLM type: {type!r} (expected 'openai' or 'gemini')")
    

In [49]:
def find_datasets_dir(start: Optional[Path] = None) -> Path:
    """Locate the nearest ``datasets`` directory at or above ``start``.

    Args:
        start: Directory to begin the upward search from (a ``Path`` or a path
            string). Defaults to the current working directory.

    Returns:
        The first ``<ancestor>/datasets`` directory found, checking ``start``
        itself first and then each parent in turn.

    Raises:
        FileNotFoundError: If no ancestor contains a ``datasets`` directory.
    """
    origin = Path.cwd() if start is None else Path(start)
    for folder in (origin, *origin.parents):
        candidate = folder / "datasets"
        # is_dir() rather than exists(): a regular file that happens to be
        # named "datasets" must not be mistaken for the data directory.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'datasets' directory from {origin}")


# Resolved once when this cell runs, searching upward from the current working
# directory; raises FileNotFoundError if no "datasets" directory is found.
DATASETS_DIR = find_datasets_dir()
# Parent of the datasets directory; later cells write their outputs
# (e.g. the "RQ3" folder) underneath it.
PROJECT_ROOT = DATASETS_DIR.parent

In [50]:
def extract_json(text: str) -> Dict:
    """Best-effort extraction of a JSON object from model output.

    The span from the first ``{`` to the last ``}`` is parsed, which tolerates
    leading/trailing chatter around the object.

    Args:
        text: Raw model output. ``None``, non-string values and empty strings
            are accepted and treated as "nothing to parse".

    Returns:
        The decoded object as a dict, or ``{}`` when no parseable JSON object
        is present.
    """
    # A failed or empty provider response can arrive as None; do not crash.
    if not isinstance(text, str) or not text:
        return {}

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        return {}

    try:
        parsed = json.loads(text[start : end + 1])
    except json.JSONDecodeError:
        return {}
    return parsed if isinstance(parsed, dict) else {}

def truncate(text: str, limit: int = 10000) -> str:
    """Cap ``text`` at ``limit`` characters, appending a marker when it was shortened."""
    if len(text) > limit:
        return text[:limit] + "...[truncated]"
    return text

In [51]:
# Parquet tables already read in this kernel session, keyed by file path.
# collect_comments / collect_pipeline_names are called once per PR; without
# this cache every call re-read whole parquet files from disk.
_PR_TABLE_CACHE: Dict[str, pd.DataFrame] = {}


def _read_pr_table(prefix: str, table: str) -> pd.DataFrame:
    """Read ``DATASETS_DIR/{prefix}_pr/{prefix}_pr_{table}.parquet`` once and reuse it.

    The returned frame is shared between callers and must be treated as
    read-only.
    """
    path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_{table}.parquet"
    cache_key = str(path)
    if cache_key not in _PR_TABLE_CACHE:
        _PR_TABLE_CACHE[cache_key] = pd.read_parquet(path)
    return _PR_TABLE_CACHE[cache_key]


def load_pr_core(prefix: str) -> pd.DataFrame:
    """Return one row per PR from the ``{prefix}`` commits table, indexed by ``pr_id``.

    When a PR has several rows, only the first one is kept.
    """
    commits = _read_pr_table(prefix, "commits")
    # drop_duplicates/set_index return new frames, so the cached table is untouched.
    return commits.drop_duplicates("pr_id").set_index("pr_id")


def collect_comments(prefix: str, pr_id: int) -> List[str]:
    """Return the non-null comment bodies of a PR: issue comments first, then review comments."""
    texts: List[str] = []
    for table in ("issue_comments", "review_comments"):
        comments = _read_pr_table(prefix, table)
        bodies = comments.loc[comments["pr_id"] == pr_id, "body"]
        texts.extend(bodies.dropna().tolist())
    return texts


def collect_pipeline_names(prefix: str, pr_id: int) -> List[str]:
    """Return the sorted, de-duplicated workflow names recorded for a PR."""
    workflows = _read_pr_table(prefix, "workflow_runs")
    names = workflows.loc[workflows["pr_id"] == pr_id, "workflow_name"]
    return sorted(names.dropna().unique().tolist())


In [52]:
from typing import Dict, Tuple, List
from concurrent.futures import ThreadPoolExecutor

def _as_text(value) -> str:
    """Return ``value`` stripped if it is a string, else "" (covers None and NaN cells)."""
    return value.strip() if isinstance(value, str) else ""


def _bullet_list(items) -> str:
    """Render ``items`` as one "- item" line per entry."""
    return "\n".join(f"- {item}" for item in items)


def analyze_pr(
    prefix: str,
    pr_id: int,
    author_type: str,
    pr_core: pd.DataFrame,
    model: Iterable[str] = ("openai", "gemini"),
) -> Tuple[Dict, Dict]:
    """Classify the performance-validation evidence of one PR with the requested LLMs.

    Args:
        prefix: Dataset prefix ("ai" or "human") used to locate the parquet files.
        pr_id: Identifier of the PR; must be present in the index of ``pr_core``.
        author_type: Label copied verbatim into the output records.
        pr_core: Frame indexed by ``pr_id`` (see ``load_pr_core``).
        model: Providers to query, any of "openai" / "gemini" (case-insensitive).
            A single string is accepted too. ``None`` or an empty iterable
            queries nobody.

    Returns:
        ``(openai_record, gemini_record)``. A provider that was not requested
        gets a default "no validation evidence" record. A requested provider
        whose call failed or whose reply could not be parsed gets a record
        whose validation fields are ``None`` (and ``evidence_sources == []``).

    Raises:
        KeyError: If ``pr_id`` is not in ``pr_core``.
    """
    row = pr_core.loc[pr_id]
    pipeline_names = collect_pipeline_names(prefix, pr_id)
    comments = collect_comments(prefix, pr_id)
    # Missing cells may be None or NaN; NaN is truthy, so `value or ""` is not enough.
    description = _as_text(row.get("pr_description"))
    code_diff = _as_text(row.get("patch"))

    DEVELOPER_PROMPT_TEMPLATE = """
    You are a classifier for performance validation evidence in GitHub Pull Requests (PRs).

Your job is to decide whether there is explicit performance validation, and if so,
to classify it into EXACTLY ONE validation_type category based on the definitions below.

First, carefully read and internalize these validation_type categories:

1. Benchmark-Based Validation (validation_type="benchmark")
Definition:
The PR validates performance by running benchmark tests—either existing unit tests or newly
added benchmark/microbenchmark tests. It includes explicit before-and-after comparisons such as
runtime, throughput, memory, CPU usage, or any quantitative metric collected from tests.

2. Profiling-Based Validation (validation_type="profiling")
Definition:
The PR uses profiling tools to validate performance, typically capturing stack samples,
CPU hotspots, flamegraphs, or function-level timings. Evidence includes profiling outputs
before and after the change.

3. Static-Analysis-Based Validation (validation_type="static-analysis")
Definition:
The PR argues for performance improvement through static reasoning about the code—algorithmic
complexity, data-structure changes, loop bounds, allocation count reduction, etc.—without
providing runtime or profiling data.

4. Anecdotal or Informal Local Testing (validation_type="anecdotal")
Definition:
The PR claims that performance is improved based on local testing, intuition, or manual
observation (for example, “this feels faster on my machine”, “latency looks better now”),
but provides no quantitative metrics, no profiling output, and no detailed static-analysis
justification.

5. Load/Canary-Based Validation (validation_type="load/canary")
Definition:
The PR validates performance by running load tests, stress tests, or canary/phased rollouts
under real or synthetic traffic. Evidence includes references to load-testing tools,
canary deployments, A/B tests, gradual rollout with monitoring, or production metrics under load.

6. No Validation Evidence (validation_type="none")
Definition:
The PR provides no explicit evidence of performance validation. It may mention optimization intent,
but there are no benchmarks, no profiling, no static reasoning, no local-testing claims, and no
load/canary validation described.

Decision Rules:

1) First decide if performance validation is explicitly present (validation_present).
   - Set validation_present = TRUE only when the PR explicitly shows some form of validation evidence:
     benchmarks/microbenchmarks, profiling traces, static performance reasoning, load/canary rollout,
     or explicit local testing statements.
   - Do NOT infer validation from performance intent alone.

2) If validation_present = FALSE:
   - Set validation_type = "none".
   - Set evidence_sources = [].
   - Set validation_description to explain the absence of validation.
   - Never assign any other validation_type.

3) If validation_present = TRUE:
   - Choose exactly ONE non-"none" validation_type with this priority:
     benchmark > profiling > load/canary > static-analysis > anecdotal

4) evidence_sources must list where the validation is explicitly mentioned:
   - "pipeline", "description", "comments"

5) When validation_type is "benchmark", "profiling", or "load/canary",
   mention the metrics used (latency, throughput, memory, CPU, etc.).

You must ALWAYS return STRICT JSON with exactly these keys:

validation_present, evidence_sources, validation_type,
validation_description, pipeline_signal,
description_signal, comment_signal.

No extra commentary. No markdown.
No explanations.
    """

    USER_PROMPT_TEMPLATE = """
    Classify the following PR strictly using the rules and definitions from the system.

    PIPELINES:
    {pipeline_names}

    DESCRIPTION:
    {description}

    COMMENTS:
    {comments}
    
    CODE DIFF:
    {code_diff}
    """

    # Identification fields shared by every record produced for this PR.
    base_record = {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": f"{row.get('repo_owner')}/{row.get('repo_name')}",
        "pr_number": row.get("pr_number"),
        "pr_title": row.get("pr_title"),
        "pipeline_names": pipeline_names,
    }

    def empty_record() -> Dict:
        """Fresh "no validation evidence" record (never share one dict between result lists)."""
        return {
            **base_record,
            "validation_present": False,
            "evidence_sources": [],
            "validation_type": "none",
            "validation_description": "No validation evidence",
            "pipeline_signal": "",
            "description_signal": "",
            "comment_signal": "",
        }

    def build_result(parsed: Dict) -> Dict:
        """Merge a parsed model reply into the PR's identification fields."""
        evidence_sources = parsed.get("evidence_sources") or []
        if isinstance(evidence_sources, (tuple, list)):
            evidence_sources = list(evidence_sources)
        return {
            **base_record,
            "validation_present": parsed.get("validation_present"),
            "evidence_sources": evidence_sources,
            "validation_type": parsed.get("validation_type"),
            "validation_description": parsed.get("validation_description"),
            "pipeline_signal": parsed.get("pipeline_signal"),
            "description_signal": parsed.get("description_signal"),
            "comment_signal": parsed.get("comment_signal"),
        }

    # If absolutely no textual signals, short-circuit without calling any model.
    # (The code diff alone is deliberately not enough to trigger a model call.)
    if not pipeline_names and not description and not comments:
        return empty_record(), empty_record()

    user_prompt = USER_PROMPT_TEMPLATE.format(
        # One bullet per line; previously the bullets were concatenated
        # without any separator ("- a- b").
        pipeline_names=_bullet_list(pipeline_names) if pipeline_names else "None",
        description=truncate(description) if description else "None",
        # Truncate the rendered list as a whole. The old join/split on " | "
        # broke up any comment that itself contained " | " (markdown tables).
        comments=truncate(_bullet_list(comments)) if comments else "None",
        code_diff=truncate(code_diff) if code_diff else "None",
    )
    developer_prompt = DEVELOPER_PROMPT_TEMPLATE

    # Accept a bare string as a single provider name.
    requested_names = [model] if isinstance(model, str) else list(model or [])
    requested = {name.lower().strip() for name in requested_names}
    providers = [name for name in ("openai", "gemini") if name in requested]

    # Query the requested providers concurrently; a failure of one provider is
    # reported and leaves its raw output empty instead of aborting the PR.
    raw_outputs: Dict[str, str] = {}
    if providers:
        with ThreadPoolExecutor(max_workers=len(providers)) as executor:
            pending = {
                name: executor.submit(
                    run_llm,
                    prompt=developer_prompt,
                    user_prompt=user_prompt,
                    type=name,
                )
                for name in providers
            }
            for name, future in pending.items():
                try:
                    raw_outputs[name] = future.result()
                except Exception as exc:
                    raw_outputs[name] = ""
                    print(f"{name} model call failed for PR {pr_id}: {exc}")

    def record_for(name: str) -> Dict:
        # If a model wasn't requested, return a safe empty record for that slot.
        if name not in requested:
            return empty_record()
        return build_result(extract_json(raw_outputs.get(name) or "") or {})

    return record_for("openai"), record_for("gemini")

In [53]:
def pr_ids_from_commits(prefix: str, limit: Optional[int] = None) -> Iterable[int]:
    """List the distinct PR ids of the ``{prefix}`` commits table in ascending order.

    Args:
        prefix: Dataset prefix used to build the parquet path under ``DATASETS_DIR``.
        limit: When given, only the first ``limit`` ids are returned.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commits = pd.read_parquet(commits_path)
    unique_ids = commits["pr_id"].dropna().astype(int).unique().tolist()
    unique_ids.sort()
    if limit is None:
        return unique_ids
    return unique_ids[:limit]


In [9]:
# Output locations: everything for this run goes under <project root>/RQ3.
out_dir = PROJECT_ROOT / "RQ3"
out_dir.mkdir(exist_ok=True, parents=True)
out_path_openai = out_dir / "rq3_validation_evidence_openai.parquet"
out_path_gemini = out_dir / "rq3_validation_evidence_gemini.parquet"
# Failures are appended here by log_error(), one row per failed PR.
error_csv_path = out_dir / "rq3_validation_errors.csv"

# Result rows accumulated across BOTH the AI and the human loops below;
# the two lists are appended to in lockstep (one record each per PR).
records_openai = []
records_gemini = []
ai_core = load_pr_core("ai")
human_core = load_pr_core("human")

# Maximum number of PRs to process per group; None processes every PR.
limit = None

def save_partial(records, out_path):
    """Write the records gathered so far to ``out_path`` as a parquet checkpoint.

    Nothing is written (and nothing is printed) when ``records`` is empty.
    """
    if records:
        checkpoint = pd.DataFrame(records)
        checkpoint.to_parquet(out_path, index=False)
        print(f"[partial save] Saved {len(checkpoint)} rows to {out_path}")

def log_error(pr_id, prefix, author_type, exc):
    """Append one failure row to the CSV at ``error_csv_path`` and echo it to stdout.

    The header row is written only when the CSV does not exist yet.
    """
    write_header = not error_csv_path.exists()
    failure = pd.DataFrame(
        [
            {
                "prefix": prefix,  
                "pr_id": pr_id,
                "author_type": author_type,
                "error": str(exc),
            }
        ]
    )
    failure.to_csv(error_csv_path, mode="a", header=write_header, index=False)
    print(f"[error] Logged PR {pr_id} ({prefix}/{author_type}) to {error_csv_path}: {exc}")

ai_ids = list(pr_ids_from_commits("ai", limit=limit))
human_ids = list(pr_ids_from_commits("human", limit=limit))
# Avoid the misleading "first None each" when no limit is set.
scope = "all PRs" if limit is None else f"first {limit} each"
print(f"Processing {len(ai_ids)} AI PRs and {len(human_ids)} human PRs ({scope}).")


def process_pr_batch(prefix, author_type, label, pr_ids, pr_core, save_every=10):
    """Classify every PR in ``pr_ids`` and append the results to the shared record lists.

    Args:
        prefix: Dataset prefix passed to ``analyze_pr`` ("ai" or "human").
        author_type: Author label stored in each record and in the error log.
        label: Human-readable group name used in progress messages.
        pr_ids: PR identifiers to process, in order.
        pr_core: Core frame for this group (see ``load_pr_core``).
        save_every: Write a checkpoint each time the total number of collected
            records reaches a multiple of this value.

    A PR that raises is logged through ``log_error`` and skipped; it does not
    abort the batch and does not trigger a checkpoint write.
    """
    total = len(pr_ids)
    for idx, pr_id in enumerate(pr_ids, 1):
        print(f"Processing {label} PR {idx}/{total}: {pr_id}")
        try:
            openai_res, gemini_res = analyze_pr(prefix, pr_id, author_type, pr_core)
        except Exception as exc:
            log_error(pr_id, prefix=prefix, author_type=author_type, exc=exc)
            continue

        records_openai.append(openai_res)
        records_gemini.append(gemini_res)

        # Checkpoint only right after new records were added.
        if len(records_openai) % save_every == 0:
            save_partial(records_openai, out_path_openai)
            save_partial(records_gemini, out_path_gemini)


# ============================
# Process AI PRs, then human PRs
# ============================
process_pr_batch("ai", "ai_agent", "AI", ai_ids, ai_core)
process_pr_batch("human", "human", "human", human_ids, human_core)

# ============================
# Final save
# ============================
df_open_ai = pd.DataFrame(records_openai)
df_gemini = pd.DataFrame(records_gemini)
df_open_ai.to_parquet(out_path_openai, index=False)
df_gemini.to_parquet(out_path_gemini, index=False)
print(f"Saved FINAL OPENAI {len(df_open_ai)} rows to {out_path_openai}")
print(f"Saved FINAL GEMINI {len(df_gemini)} rows to {out_path_gemini}")
print(f"Errored PRs (if any) logged to {error_csv_path}")

Processing 324 AI PRs and 83 human PRs (first None each).
Processing AI PR 1/324: 2766896431
Running GEMINI, model: models/gemini-pro-latest
Running OPENAI, model: gpt-5.1-2025-11-13
Processing AI PR 2/324: 2843312341
Running GEMINI, model: models/gemini-pro-latest
Running OPENAI, model: gpt-5.1-2025-11-13
Processing AI PR 3/324: 2843334531
Running OPENAI, model: gpt-5.1-2025-11-13
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 4/324: 2855302194
Running GEMINI, model: models/gemini-pro-latest
Running OPENAI, model: gpt-5.1-2025-11-13
Processing AI PR 5/324: 2859989652
Running OPENAI, model: gpt-5.1-2025-11-13
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 6/324: 2887787232
Running GEMINI, model: models/gemini-pro-latest
Running OPENAI, model: gpt-5.1-2025-11-13
Processing AI PR 7/324: 2920983723
Running GEMINI, model: models/gemini-pro-latest
Running OPENAI, model: gpt-5.1-2025-11-13
Processing AI PR 8/324: 2926188053
Running OPENAI, model: gpt-5.1-2

In [46]:
# Show full cell contents and every row/column when displaying frames.
# NOTE: pd.set_option changes these settings for the rest of the kernel
# session, not just for this cell.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.expand_frame_repr", True)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Spot-check the most recent OpenAI results written by the processing cell.
data_temp = pd.read_parquet(out_path_openai)
data_temp.tail(20)

Unnamed: 0,pr_id,author_type,repo,pr_number,pr_title,pipeline_names,validation_present,evidence_sources,validation_type,validation_description,pipeline_signal,description_signal,comment_signal
387,2555753483,human,dotnet/msbuild,11934,update to stop closures from lazy functions and linq,[],True,[description],static-analysis,"The PR argues for performance improvement by reducing allocations from closures, verified via ILSpy inspection showing that compiler-generated closure classes (DisplayClass) disappeared. This is static reasoning about allocation behavior, not runtime benchmarks, profiling, or load tests.",,Mentions looking at a trace of allocations to find closures and then using ILSpy to verify that the compiler-generated closure classes (DisplayClass) are gone after the change.,
388,2558083620,human,bionic-gpt/bionic-gpt,776,Cache busting,[],False,[],none,"No performance validation is mentioned; the comment only shows deployment status and preview URLs without any benchmarks, profiling, load tests, static reasoning, or anecdotal performance claims.",No pipelines provided.,No description provided.,"Comments only reference deployment and logs, with no explicit performance validation."
389,2560305820,human,antiwork/gumroad,289,Added Typhoeus client for HTTP connection pooling + re-use,[autofix.ci],False,[],none,"The PR claims performance should improve by reusing HTTP connections but provides no benchmarks, profiling data, static performance reasoning beyond a brief intuition, local testing results, or load/canary evidence.",No performance-related signals in the pipeline; autofix.ci is just an automated fix tool.,"Description only states that performance ""should improve"" by using persistent HTTP connections, without any explicit validation or testing evidence.","Comments contain only auto-generated summaries and a poem, with no mention of performance validation or testing."
390,2564432253,human,tokens-studio/figma-plugin,3402,Github Sync Optimization,"[ESLint, Node.js CI]",False,[],none,"The PR discusses an optimization to avoid unnecessary GitHub writes and mentions that users could observe time differences when pushing changes, but it does not report any actual measurements, tests performed, or other explicit performance validation evidence.","CI pipelines and coverage reports are mentioned only for correctness/coverage, not for performance metrics.","Description explains the intended performance improvement and suggests how a user might notice faster syncs, but provides no concrete tests, timings, or metrics.","Comments focus on code review, coverage, and implementation details, without any explicit performance testing, benchmarks, profiling, or load/canary results."
391,2573225924,human,microsoft/TypeScript,61822,"optimization, reduce memory usage","[CI, Code Scanning - Action]",False,[],none,"The description claims an 11% speedup but provides no details on how this was measured, no benchmarks, profiling data, static reasoning, or mention of local testing methodology. A comment asks how the stats were determined, but no validation evidence is shown.",No performance-related validation in the pipeline; only generic code scanning is mentioned.,"Mentions an 11% speedup but gives no explanation of measurement method, metrics, or tests used, so it does not qualify as explicit validation evidence under the defined categories.","A reviewer asks how the stats were determined, indicating that validation details are missing rather than provided."
392,2577421996,human,antiwork/gumroad,307,Fixed duplicate context lookups across app,[autofix.ci],True,[description],static-analysis,"The description explains that the custom_context function was called twice, causing duplicate DB queries, and that the change avoids ~4 DB queries per page load, implying reduced work based on reasoning about query count rather than reported measurements.",,"States that calling custom_context twice caused all DB queries to run twice and that this saves ~4 DB queries per page load with an estimated ~2% win, but provides no benchmark or profiling data.",
393,2590261382,human,microsoft/vscode,251382,Optimized concat with reduce,[],True,[description],static-analysis,"The PR argues performance improvement via static reasoning about reduced allocations by using push instead of concat, without any runtime measurements or tests.",,The concat method creates a new collection each time based on the existing one. Using push in this case saves resources.,
394,2596620305,human,microsoft/qsharp,2530,Improve JupyterLab extension build time,"[Benchmark Reports, CI Build and Test, DevSkim]",True,[description],benchmark,"The description provides explicit before-and-after build time measurements on Windows/Ubuntu/Mac (from ~70s/30s/30s to ~15s/10s/13s), which are quantitative benchmark-style metrics of build performance.",Pipeline list mentions a generic benchmark report job but provides no concrete metrics or results in the text provided.,Explicit build-time comparisons across OSes (~70s/30s/30s to ~15s/10s/13s) demonstrating reduced build duration.,"No comments were provided, so there is no validation evidence from comments."
395,2597070258,human,calcom/cal.com,21855,perf: use repository for me query & caching in /settings/my-account/general/ RSC,[],False,[],none,"The PR mentions performance improvements and shows before/after screencasts, but there is no explicit statement of performance validation such as benchmarks, profiling data, load tests, static performance reasoning, or even anecdotal claims about observed speed. The screencasts are presented as a visual demo, not described as performance measurement.",No pipelines or automated performance tests are referenced.,"Description notes caching and faster queries but does not describe any benchmarks, profiling, load tests, or explicit local performance testing results.","Comments discuss code structure, caching behavior, and security concerns but do not mention any form of performance validation or measurement."
396,2604024784,human,calcom/cal.com,21923,fix: Improve performance of settings/admin/organizations page,[],True,[comments],anecdotal,"Performance was evaluated informally via Loom videos and by manually checking out the branch locally with around 5000 organizations to see if the page still lagged or became unresponsive; no quantitative metrics, benchmarks, or profiling data were provided.",,,"Comments mention Loom videos demonstrating responsiveness before and after, and a reviewer states they checked out the branch locally with ~5000 organizations and observed little difference, but no numeric measurements or formal tests are reported."


In [54]:
import pandas as pd
from pathlib import Path

# ============================================================
# Paths
# ============================================================
# Same output folder as the main processing cell; re-declared here so this
# cell only needs PROJECT_ROOT from earlier cells.
out_dir = PROJECT_ROOT / "RQ3"
out_dir.mkdir(exist_ok=True, parents=True)

# Existing Gemini results (read below) and the shared error log.
out_path_gemini = out_dir / "rq3_validation_evidence_gemini.parquet"
error_csv_path = out_dir / "rq3_validation_errors.csv"

# ============================================================
# Config
# ============================================================
# Number of trailing rows of the Gemini parquet to re-run.
# NOTE(review): the reason for 115 is not recorded in this notebook - confirm.
LAST_N = 115
# Presumably the checkpoint interval for the re-run loop further down;
# it is not used in the visible part of this cell - TODO confirm.
SAVE_EVERY = 10

# ============================================================
# Load PR cores
# ============================================================
ai_core = load_pr_core("ai")
human_core = load_pr_core("human")

# ============================================================
# Helpers
# ============================================================
def save_partial(records, out_path):
    """Checkpoint the current in-memory records to ``out_path`` as parquet.

    An empty ``records`` writes nothing. This redefines the identical helper
    from the main processing cell.
    """
    if records:
        snapshot = pd.DataFrame(records)
        snapshot.to_parquet(out_path, index=False)
        print(f"[partial save] Saved {len(snapshot)} rows to {out_path}")

def log_error(pr_id, prefix, author_type, exc):
    """Append error info for one PR to the CSV log at ``error_csv_path``.

    The header is emitted only when the file does not exist yet. This
    redefines the identical helper from the main processing cell.
    """
    needs_header = not error_csv_path.exists()
    entry = {
        "prefix": prefix,
        "pr_id": pr_id,
        "author_type": author_type,
        "error": str(exc),
    }
    pd.DataFrame([entry]).to_csv(error_csv_path, mode="a", header=needs_header, index=False)
    print(f"[error] Logged PR {pr_id} ({prefix}/{author_type}) to {error_csv_path}: {exc}")

def get_pr_id_col(df):
    """Return the name of the PR identifier column, preferring 'pr_id' over 'id'.

    Raises:
        ValueError: If the frame has neither column.
    """
    found = next((name for name in ("pr_id", "id") if name in df.columns), None)
    if found is None:
        raise ValueError("PR id column not found. Expected 'pr_id' or 'id'.")
    return found

def infer_prefix(row):
    """Classify a row as ``"ai"`` or ``"human"``.

    Resolution order:
      1. A non-missing ``prefix`` entry whose normalized (lower-cased,
         stripped) value is ``"ai"`` or ``"human"`` wins.
      2. Otherwise ``author_type`` is normalized the same way;
         ``"ai_agent"`` and ``"ai"`` map to ``"ai"``.
      3. Everything else falls back to ``"human"``.

    ``row`` only needs ``in``, ``[]`` and ``.get`` (a dict or a pandas
    Series both work).
    """
    explicit = row["prefix"] if "prefix" in row else None
    if pd.notna(explicit):
        label = str(explicit).lower().strip()
        if label == "ai" or label == "human":
            return label

    # No usable explicit prefix: derive it from the author type, with
    # "human" as the conservative default for unknown values.
    author = str(row.get("author_type", "")).lower().strip()
    return "ai" if author in ("ai_agent", "ai") else "human"

# ============================================================
# 1) Load existing Gemini parquet
# ============================================================
if not out_path_gemini.exists():
    raise FileNotFoundError(f"File not found: {out_path_gemini}")

data_temp = pd.read_parquet(out_path_gemini)
if not len(data_temp):
    raise ValueError("Gemini parquet is empty.")

# ============================================================
# 2) Pick the trailing LAST_N rows and turn them into re-run tasks
# ============================================================
tail_df = data_temp.tail(LAST_N).copy()
pr_id_col = get_pr_id_col(tail_df)

# Each task is a (prefix, pr_id, author_type) tuple. Tuples already seen are
# skipped, so the first occurrence decides the position in the list.
tasks = []
seen_tasks = set()
for _, record in tail_df.iterrows():
    pr_id = int(record[pr_id_col])

    # Missing author types (absent column, None, NaN) default to "human".
    raw_author = record.get("author_type", "human")
    author_type = "human" if (raw_author is None or pd.isna(raw_author)) else str(raw_author)

    task = (infer_prefix(record), pr_id, author_type)
    if task not in seen_tasks:
        seen_tasks.add(task)
        tasks.append(task)

print(f"Re-running GEMINI for {len(tasks)} PRs (last {LAST_N} rows).")

# ============================================================
# 3) Re-run ONLY Gemini using analyze_pr(model=['gemini'])
# ============================================================
records_gemini_new = []
partial_rerun_path = out_dir / "rq3_validation_evidence_gemini_rerun_partial.parquet"

for idx, (prefix, pr_id, author_type) in enumerate(tasks, 1):
    print(f"Processing GEMINI {idx}/{len(tasks)}: {prefix} PR {pr_id}")
    try:
        core = ai_core if prefix == "ai" else human_core

        # Only Gemini is requested (model=["gemini"]); the first element of
        # the returned pair (the OpenAI slot) is ignored here.
        _empty_openai, gemini_res = analyze_pr(
            prefix=prefix,
            pr_id=pr_id,
            author_type=author_type,
            pr_core=core,
            model=["gemini"],
        )

        # A non-dict result (e.g. None) cannot be turned into a DataFrame row
        # and would break the checkpoint/merge steps for every other record,
        # so treat it as a failure of this PR and log it instead.
        if not isinstance(gemini_res, dict):
            raise TypeError(
                f"analyze_pr returned {type(gemini_res).__name__} instead of a dict"
            )

        # Ensure key fields exist for safe merging (existing values are kept).
        gemini_res.setdefault("pr_id", pr_id)
        gemini_res.setdefault("author_type", author_type)
        gemini_res.setdefault("prefix", prefix)

        records_gemini_new.append(gemini_res)

    except Exception as exc:
        log_error(pr_id, prefix=prefix, author_type=author_type, exc=exc)
        # Nothing new was collected, so there is nothing new to checkpoint.
        continue

    # Checkpoint exactly once each time the number of successful records
    # reaches a multiple of SAVE_EVERY.
    if len(records_gemini_new) % SAVE_EVERY == 0:
        save_partial(records_gemini_new, partial_rerun_path)

# Flush the trailing records that did not land on a SAVE_EVERY boundary so
# the checkpoint file holds every successful result of this run.
if records_gemini_new and len(records_gemini_new) % SAVE_EVERY != 0:
    save_partial(records_gemini_new, partial_rerun_path)

# ============================================================
# 4) Update the existing Gemini parquet with new results
#     Key: pr_id (+ author_type if present)
# ============================================================
# Keep only dict records: anything else (e.g. None from a failed call) cannot
# form a DataFrame row.
df_new = pd.DataFrame([rec for rec in records_gemini_new if isinstance(rec, dict)])

if len(df_new) == 0:
    print("No new Gemini results were produced. Check the error CSV.")
else:
    # Normalize PR id column name in both dataframes
    if "pr_id" not in df_new.columns and pr_id_col in df_new.columns:
        df_new = df_new.rename(columns={pr_id_col: "pr_id"})
    if "pr_id" not in data_temp.columns and pr_id_col in data_temp.columns:
        data_temp = data_temp.rename(columns={pr_id_col: "pr_id"})

    # Build merge/update keys
    key_cols = ["pr_id"]
    if "author_type" in data_temp.columns and "author_type" in df_new.columns:
        key_cols.append("author_type")

    df_existing = data_temp.copy()

    # Ensure both sides have the key columns
    for kc in key_cols:
        if kc not in df_existing.columns:
            df_existing[kc] = None
        if kc not in df_new.columns:
            df_new[kc] = None

    # DataFrame.update reindexes the new frame onto the existing index, which
    # fails when the new frame carries duplicate keys; keep the latest result.
    df_new = df_new.drop_duplicates(subset=key_cols, keep="last")

    # Index for aligned update
    df_existing = df_existing.set_index(key_cols)
    df_new = df_new.set_index(key_cols)

    # update() only touches keys already present in df_existing; report the
    # rest instead of dropping them silently.
    unmatched = df_new.index[~df_new.index.isin(df_existing.index)]
    if len(unmatched) > 0:
        print(
            f"[warn] {len(unmatched)} re-run rows have no matching key in the existing "
            f"parquet and were not merged (first 10): {unmatched.tolist()[:10]}"
        )

    # Add any new columns from the rerun output
    for col in df_new.columns:
        if col not in df_existing.columns:
            df_existing[col] = pd.NA

    # Update matching keys in-place. Note: update() skips NA values in df_new,
    # so a missing value in the re-run never erases an existing one.
    df_existing.update(df_new)

    # Write to a sibling temp file first and then swap it in, so an
    # interrupted write cannot corrupt the only copy of the results.
    df_updated = df_existing.reset_index()
    tmp_path = out_path_gemini.with_name(out_path_gemini.name + ".tmp")
    df_updated.to_parquet(tmp_path, index=False)
    tmp_path.replace(out_path_gemini)

    print(f"✅ Updated GEMINI parquet: {out_path_gemini}")

print(f"Errors (if any) were logged to: {error_csv_path}")

Re-running GEMINI for 115 PRs (last 115 rows).
Processing GEMINI 1/115: ai PR 3245892725
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 2/115: ai PR 3245899488
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 3/115: ai PR 3245927515
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 4/115: ai PR 3245957050
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 5/115: ai PR 3245970844
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 6/115: ai PR 3246099511
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 7/115: ai PR 3246105987
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 8/115: ai PR 3246117305
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 9/115: ai PR 3246122368
Running GEMINI, model: models/gemini-pro-latest
Processing GEMINI 10/115: ai PR 3246158661
Running GEMINI, model: models/gemini-pro-latest
[partial save] Saved 10 rows to /Users/antoniozhong/

In [56]:

# Reload the parquet from disk and show the rows covered by the re-run.
# LAST_N is reused so this view stays in sync with the re-run window instead
# of repeating the literal 115.
data_temp = pd.read_parquet(out_path_gemini)
data_temp.tail(LAST_N)

Unnamed: 0,pr_id,author_type,repo,pr_number,pr_title,pipeline_names,validation_present,evidence_sources,validation_type,validation_description,pipeline_signal,description_signal,comment_signal,prefix
292,3245892725,ai_agent,mochilang/mochi,10497,Improve Dart transpiler progress,[Test],False,[],none,"The PR does not contain any explicit performance validation. The description outlines refactoring and feature changes without mentioning performance. The testing section refers to a standard test compilation command, not a performance benchmark.","The pipeline shows a `go test -c` command, which compiles tests but does not execute them or provide performance metrics.","The PR description focuses on refactoring type inference and task management. It does not mention performance improvements, optimizations, or any form of validation.",No comments are present.,ai
293,3245899488,ai_agent,mochilang/mochi,10505,Improve Lua transpiler,[Test],False,[],none,"The PR description mentions 'inline builtin operations', which suggests a performance optimization intent. However, there is no evidence provided to validate this change. The testing section only shows commands for running correctness tests, with no mention of performance benchmarks or metrics.",No performance validation signals found in the pipeline.,"The description suggests a performance optimization by mentioning 'inline builtin operations', but the testing section only describes running correctness tests without providing any performance metrics or benchmark results.",No comments were provided.,ai
294,3245927515,ai_agent,mochilang/mochi,10515,Improve C transpiler output,[Test],False,[],none,"The PR description mentions changes like 'fine tune C transpiler printing logic' and 'tighten generated for loop syntax', which imply performance intent. However, there is no explicit validation provided, such as benchmark results, profiling data, or static analysis, to demonstrate a performance improvement. The testing command is for a standard functional test, not a performance test.",The pipeline contains a generic 'Test' which is not specific to performance validation.,"The description indicates performance intent with phrases like 'fine tune' and 'tighten syntax', but lacks any validation evidence. The provided test command is for a functional golden file test, not a benchmark.",No comments were found.,ai
295,3245957050,ai_agent,mochilang/mochi,10525,Improve TS transpiler output,[Test],False,[],none,"The PR description states an intent to 'improve' the transpiler's performance. However, the testing section only provides a command to run a test suite, without presenting any benchmark results, performance metrics, or before-and-after comparisons. There is no explicit evidence of performance validation.","The pipeline contains a generic 'Test' stage, which does not specifically indicate performance validation.",The description mentions an intent to 'improve' the transpiler. The testing section shows a command to run tests but lacks any performance metrics or results.,No comments were provided.,ai
296,3245970844,ai_agent,mochilang/mochi,10557,Improve rkt transpiler header and tasks,[Test],False,[],none,"The PR describes functional changes related to using git timestamps and provides a standard 'go test' command for functional testing. There is no mention of performance optimization, benchmarks, profiling, or any other form of performance validation.","The pipeline contains a generic 'Test' stage, which does not specifically indicate performance validation.","The description provides a 'go test' command, but it is for a standard functional test, not a benchmark. The summary of the PR does not mention performance improvements.",No comments were found.,ai
297,3246099511,ai_agent,Rello/audioplayer,634,Optimize album art lookup,[],True,[description],static-analysis,"The PR provides a static-analysis based justification for the performance improvement, stating that the change will 'remove repeated album art checks to reduce filesystem lookups'. This is a qualitative argument about reducing I/O operations without providing any runtime metrics.",No performance signal.,"The description states the change will 'remove repeated album art checks to reduce filesystem lookups', which is a form of static reasoning for performance improvement.",No performance signal.,ai
298,3246105987,ai_agent,Rello/analytics,522,Improve indexing and sharing performance,[REUSE Compliance Check],True,[description],static-analysis,"The PR author justifies performance improvements through static reasoning about the code. The description explicitly mentions reducing algorithmic complexity ('reduce O(n^2) scans'), batching DOM updates, and adding a cache, all of which are forms of static analysis of performance.",No performance validation signals found in the pipeline.,"The PR description provides static analysis as evidence of performance improvement by mentioning a reduction in algorithmic complexity from O(n^2), batching of DOM updates, and caching.",No comments were provided.,ai
299,3246117305,ai_agent,mochilang/mochi,10727,Improve ts transpiler,[Test],False,[],none,"The PR description mentions a 'better numeric boolean conversion', which suggests a potential optimization. However, there is no explicit evidence provided to validate any performance improvement. The testing section only refers to a generic test command (`go test ./...`) without any performance metrics or comparisons.",The pipeline information is generic ('Test') and does not indicate any performance-related validation.,"The description suggests an improvement with the phrase 'better numeric boolean conversion' but lacks any form of validation evidence such as benchmarks, profiling, or even anecdotal claims.",No comments were provided.,ai
300,3246122368,ai_agent,Stirling-Tools/Stirling-PDF,3992,Improve temp cleanup performance,"[Build and Test Workflow, Dependency Review]",False,[],none,"The PR describes changes that are likely intended to improve performance and resource management, such as replacing `Files.list` with `DirectoryStream` and adding an async executor. However, it provides no explicit evidence of performance validation through benchmarks, profiling, or even anecdotal claims.","The pipeline shows standard build and test workflows, with no indication of performance-specific validation.","The description details the implementation of performance-related features (async executor, batching, streaming directory listing) but does not include any data or analysis to validate their performance impact.","The comments focus on code logic, correctness, and configuration, with no discussion of performance validation.",ai
301,3246158661,ai_agent,MihaiCristianCondrea/Smart-Cleaner-for-Android,212,Improve WhatsAppCleanerViewModel thread usage,[Android CI],True,[description],static-analysis,"The PR author provides a static analysis of the performance improvements by describing how they offloaded heavy sorting to a different thread dispatcher and avoided collecting flows on IO threads. This reasoning about thread management serves as the validation, without providing any quantitative metrics.",The pipeline is a generic CI run and does not contain performance validation signals.,"The description explains performance improvements by reasoning about thread management, such as 'offload heavy sorting... to Dispatchers.Default' and 'avoid collecting DataStore flows on IO threads'. This constitutes static-analysis based validation.",No comments were provided.,ai
