In [1]:
!pip install openai pandas python-dotenv
import json
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv()) 



True

In [2]:
from pydantic import BaseModel
from typing import List, Literal


class ValidationSchema(BaseModel):
    # Structured-output schema for one PR classification. It is handed to the
    # OpenAI client as `text_format` in `run_llm`, so the field names and the
    # Literal values below are what the model is constrained to emit.
    # NOTE(review): documented with `#` comments on purpose; a class docstring
    # may be exported into the generated JSON schema as its description and
    # thereby change what is sent to the API - confirm before adding one.

    # True only when the PR explicitly shows some form of performance
    # validation evidence (see the decision rules in the classifier prompt).
    validation_present: bool

    # Where the validation evidence was found; the prompt asks for an empty
    # list when validation_present is false.
    evidence_sources: List[
        Literal["pipeline", "description", "comments", "code_diff"]
    ]

    # Exactly one category; "none" is reserved for validation_present == False.
    validation_type: Literal[
        "benchmark",
        "profiling",
        "static-analysis",
        "anecdotal",
        "none"
    ]

    # Free-text explanation of the validation (or of its absence).
    validation_description: str
    # Per-source free-text fields. The prompt does not define their content;
    # presumably each holds the evidence seen in that source - TODO confirm.
    pipeline_signal: str
    description_signal: str
    comment_signal: str

In [3]:
def run_llm(
    prompt: str,
    user_prompt: str,
    model: str = "gpt-5.1-2025-11-13",
) -> ValidationSchema:
    """Classify one PR with the OpenAI Responses API using structured output.

    Args:
        prompt: Developer/system instructions (sent as ``instructions``).
        user_prompt: The PR-specific content (sent as the single user message).
        model: Model identifier. Defaults to the model previously hard-coded
            in this function, so existing callers are unaffected.

    Returns:
        The response parsed into a ``ValidationSchema`` instance.

    Raises:
        ValueError: If the API returned no parsed output (for example a
            refusal or an incomplete response). Previously ``None`` was
            returned in that case and the caller failed later with an
            unrelated ``AttributeError``.
        Exception: Any error raised by the OpenAI client is propagated.
    """
    # base_url=None lets the SDK use its default endpoint.
    LLM_BASE_URL = None
    LLM_API_KEY = os.environ.get("OPENAI_API_KEY")
    llm_client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)

    print("Running OPENAI, model:", model)

    response = llm_client.responses.parse(
        model=model,
        reasoning={"effort": "high"},
        instructions=prompt,
        input=[
            {
                "role": "user",
                "content": user_prompt,
            }
        ],
        text_format=ValidationSchema,
    )

    parsed = response.output_parsed
    if parsed is None:
        # Fail here with a clear message instead of handing None to callers
        # that immediately call .model_dump() on the result.
        raise ValueError(
            f"Model {model} returned no parsed output "
            "(refusal or incomplete response)"
        )
    return parsed

In [4]:
def find_datasets_dir(start: Optional[Path] = None) -> Path:
    """Locate the nearest ``datasets`` directory at or above ``start``.

    The search checks ``start`` itself and then each of its parents, closest
    first, and returns the first ``<dir>/datasets`` that is a directory.

    Args:
        start: Directory to start from (``Path`` or ``str``). Defaults to the
            current working directory.

    Returns:
        Path to the ``datasets`` directory that was found.

    Raises:
        FileNotFoundError: If no ancestor contains a ``datasets`` directory.
    """
    start = Path.cwd() if start is None else Path(start)
    for path in (start, *start.parents):
        candidate = path / "datasets"
        # is_dir() rather than exists(): a regular file that happens to be
        # called "datasets" must not be mistaken for the data directory.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'datasets' directory from {start}")


# Resolved once when this cell runs, searching upward from the current
# working directory; raises FileNotFoundError if no "datasets" entry is found.
DATASETS_DIR = find_datasets_dir()
# The project root is the directory that contains "datasets".
PROJECT_ROOT = DATASETS_DIR.parent

def truncate(text: str, limit: int = 15000) -> str:
    """Cap ``text`` at ``limit`` characters.

    Text that fits is returned unchanged; longer text is cut to its first
    ``limit`` characters and suffixed with ``"...[truncated]"``.
    """
    if len(text) > limit:
        return text[:limit] + "...[truncated]"
    return text

def load_pr_core(prefix: str) -> pd.DataFrame:
    """Load the commits table for ``prefix`` with one row per PR.

    Reads ``<prefix>_pr/<prefix>_pr_commits.parquet`` under ``DATASETS_DIR``,
    keeps the first row for each ``pr_id`` and indexes the result by
    ``pr_id``.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commits_df = pd.read_parquet(commits_path)
    first_row_per_pr = commits_df.drop_duplicates("pr_id")
    return first_row_per_pr.set_index("pr_id")


def collect_comments(prefix: str, pr_id: int) -> List[str]:
    """Return every non-null comment body attached to ``pr_id``.

    Issue comments come first, followed by review comments, each in the order
    they appear in their parquet file.
    """
    pr_dir = DATASETS_DIR / f"{prefix}_pr"
    issue_df = pd.read_parquet(pr_dir / f"{prefix}_pr_issue_comments.parquet")
    review_df = pd.read_parquet(pr_dir / f"{prefix}_pr_review_comments.parquet")
    return [
        body
        for frame in (issue_df, review_df)
        for body in frame.loc[frame["pr_id"] == pr_id, "body"].dropna().tolist()
    ]


def collect_pipeline_names(prefix: str, pr_id: int) -> List[str]:
    """Return the sorted, de-duplicated workflow names recorded for ``pr_id``.

    Null workflow names are ignored.
    """
    runs_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_workflow_runs.parquet"
    runs_df = pd.read_parquet(runs_path)
    belongs_to_pr = runs_df["pr_id"] == pr_id
    names = runs_df.loc[belongs_to_pr, "workflow_name"].dropna().unique().tolist()
    names.sort()
    return names

def pr_ids_from_commits(prefix: str, limit: Optional[int] = None) -> Iterable[int]:
    """List the distinct PR ids found in the commits table, ascending.

    Args:
        prefix: Dataset prefix used to build the parquet path.
        limit: When given, only the first ``limit`` ids are returned.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commits_df = pd.read_parquet(commits_path)
    unique_ids = commits_df["pr_id"].dropna().astype(int).unique().tolist()
    unique_ids.sort()
    if limit is None:
        return unique_ids
    return unique_ids[:limit]


In [15]:
def _text_field(row, column: str) -> str:
    """Return ``row[column]`` stripped, or ``""`` when it is not a string.

    Covers missing columns, ``None`` and NaN, none of which have ``.strip()``.
    """
    value = row.get(column)
    return value.strip() if isinstance(value, str) else ""


def _bulleted(items) -> str:
    """Render ``items`` as a markdown-style list, one ``- item`` per line."""
    return "\n".join(f"- {item}" for item in items)


def analyze_pr(
    prefix: str,
    pr_id: int,
    author_type: str,
    pr_core: pd.DataFrame,
) -> Dict:
    """Classify the performance-validation evidence of a single PR.

    Gathers the PR's workflow names, comments, description and patch, sends
    them to the LLM through ``run_llm`` and returns one flat record that
    combines PR metadata with the classifier output.

    Args:
        prefix: Dataset prefix used to locate the parquet files.
        pr_id: Id of the PR; must be present in the index of ``pr_core``.
        author_type: Label copied verbatim into the result record.
        pr_core: Frame indexed by ``pr_id`` (see ``load_pr_core``).

    Returns:
        A dict with the keys ``pr_id``, ``author_type``, ``repo``,
        ``pr_number``, ``pr_title``, ``pipeline_names``,
        ``validation_present``, ``evidence_sources``, ``validation_type``,
        ``validation_description``, ``pipeline_signal``,
        ``description_signal`` and ``comment_signal``.

    Raises:
        KeyError: If ``pr_id`` is not in ``pr_core``.
        Exception: If the PR has no pipelines, description, comments or diff;
            errors from ``run_llm`` are propagated unchanged.
    """
    row = pr_core.loc[pr_id]
    pipeline_names = collect_pipeline_names(prefix, pr_id)
    comments = collect_comments(prefix, pr_id)
    description = _text_field(row, "pr_description")
    code_diff = _text_field(row, "patch")

    # The key list at the end must match ValidationSchema exactly: the
    # response is parsed into that schema, so a key the schema does not define
    # (previously "metrics") can never be returned.
    DEVELOPER_PROMPT_TEMPLATE = """
    You are a classifier for performance validation evidence in GitHub Pull Requests (PRs).

    Your job is to decide whether there is explicit performance validation, and if so,
    to classify it into EXACTLY ONE validation_type category based on the definitions below.

    First, carefully read and internalize these validation_type categories:

    1. Benchmark-Based Validation (Unit Tests or Microbenchmarks)
    Definition:
    The PR validates performance by running benchmark tests—either existing unit tests or newly added benchmark/microbenchmark tests. It includes explicit before-and-after comparisons such as runtime, throughput, memory, CPU usage, or any quantitative metric collected from tests.

    2. Profiling-Based Validation (Application- or Function-Level Profiling)
    Definition:
    The PR uses profiling tools to validate performance, typically capturing stack samples, CPU hotspots, flamegraphs, or function-level timings. Evidence includes profiling outputs before and after the change.

    3. Static-Analysis-Based Validation (Reasoning Without Runtime Evidence)
    Definition:
    The PR argues for performance improvement through static reasoning about the code—algorithmic complexity, data-structure changes, loop bounds, allocation count reduction, etc., without providing runtime/profiling data.

    4. Anecdotal or Informal Local Testing (No Evidence Provided)
    Definition:
    The PR claims that performance is improved based on local testing, intuition, or manual observation, but provides no quantitative metrics, no profiling output, and no static-analysis justification.

    Decision Rules:

    1) First decide if performance validation is explicitly present (validation_present).
    - Set validation_present = TRUE only when the PR explicitly shows some form of validation evidence: benchmarks/microbenchmarks, profiling traces, static performance reasoning, or explicit local testing statements.
    - Do NOT infer validation from performance intent alone.

    2) If validation_present is "FALSE", please set validation_type = "none", also set evidence_sources = [] also set validation_description to explain the absence of validation. Never assign any other validation_type.
    
    3) If validation_present is "TRUE", please choose exactly ONE non-"none" validation_type from the following: Benchmark-Based Validation; Profiling-Based Validation; Static-Analysis-Based Validation; Anecdotal or Informal Local Testing

    4) evidence_sources must list where the validation is explicitly mentioned: "pipeline", "description", "comments", "code_diff"

    5) When validation_type is "benchmark", "profiling": mention the metrics used (latency, throughput, memory, CPU, etc.).

    You must ALWAYS return STRICT JSON with exactly these keys: validation_present, evidence_sources, validation_type, validation_description, pipeline_signal, description_signal, comment_signal.

    No extra commentary. No markdown. No explanations.
    """

    USER_PROMPT_TEMPLATE = """
    Classify the following PR strictly using the rules and definitions from the system.

    PIPELINES:
    {pipeline_names}

    DESCRIPTION:
    {description}

    COMMENTS:
    {comments}
    
    CODE DIFF:
    {code_diff}
    """

    if not pipeline_names and not description and not comments and not code_diff:
        print(f"Short-circuiting PR {pr_id} with no signals")
        print(f"pipeline_names: {pipeline_names}")
        print(f"description: {description}")
        print(f"comments: {comments}")
        print(f"code_diff: {code_diff}")
        raise Exception("No signals present")

    # One bullet per line. The comment list is rendered first and truncated as
    # a whole, so a body containing " | " (e.g. a markdown table of benchmark
    # results) is no longer split into several bogus bullets.
    prompt = USER_PROMPT_TEMPLATE.format(
        pipeline_names=_bulleted(pipeline_names) if pipeline_names else "None",
        description=truncate(description) if description else "None",
        comments=truncate(_bulleted(comments)) if comments else "None",
        code_diff=truncate(code_diff) if code_diff else "None",
    )

    parsed_openai = run_llm(
        prompt=DEVELOPER_PROMPT_TEMPLATE,
        user_prompt=prompt,
    ).model_dump()

    # Normalise to a plain list; anything that is not a list/tuple becomes [].
    evidence_sources = parsed_openai.get("evidence_sources") or []
    if isinstance(evidence_sources, (tuple, list)):
        evidence_sources = list(evidence_sources)
    else:
        evidence_sources = []

    result_openai = {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": f"{row.get('repo_owner')}/{row.get('repo_name')}",
        "pr_number": row.get("pr_number"),
        "pr_title": row.get("pr_title"),
        "pipeline_names": pipeline_names,
        "validation_present": parsed_openai.get("validation_present", False),
        "evidence_sources": evidence_sources,
        "validation_type": parsed_openai.get("validation_type", "none"),
        "validation_description": parsed_openai.get("validation_description", "No validation evidence"),
        "pipeline_signal": parsed_openai.get("pipeline_signal", ""),
        "description_signal": parsed_openai.get("description_signal", ""),
        "comment_signal": parsed_openai.get("comment_signal", ""),
    }

    return result_openai

In [17]:
# Output locations for the RQ3 results and the error log.
out_dir = PROJECT_ROOT / "RQ3"
out_dir.mkdir(parents=True, exist_ok=True)

out_path_openai = out_dir / "rq3_validation_evidence_openai.parquet"
error_csv_path = out_dir / "rq3_validation_errors.csv"

# One row of PR metadata per pr_id for each cohort.
ai_core = load_pr_core("ai")
human_core = load_pr_core("human")

# Maximum number of PRs per cohort; None means no cap.
limit = None

# Resume support: start from whatever an earlier run already saved.
records_openai = []
processed_ids = set()
if not out_path_openai.exists():
    print("[resume] No previous file found, starting fresh")
else:
    df_existing = pd.read_parquet(out_path_openai)
    records_openai = df_existing.to_dict("records")
    processed_ids = set(df_existing["pr_id"].astype(int).tolist())
    print(f"[resume] Loaded {len(records_openai)} existing records")


def save_partial(records, out_path):
    """Checkpoint ``records`` to ``out_path`` as parquet without risking it.

    The data is written to a sibling ``<name>.tmp`` file first and then moved
    over ``out_path`` with ``os.replace``. If the write is interrupted or
    fails, the previous checkpoint is left intact, so a later resume still
    has a readable file.

    Args:
        records: List of result dicts; nothing is written when it is empty.
        out_path: Destination parquet path (``str`` or ``Path``).
    """
    if not records:
        return
    destination = Path(out_path)
    staging = destination.with_name(destination.name + ".tmp")
    try:
        pd.DataFrame(records).to_parquet(staging, index=False)
        os.replace(staging, destination)
    finally:
        # After a successful replace the staging file is already gone; this
        # only removes the leftover of a failed write.
        if staging.exists():
            staging.unlink()


def log_error(pr_id, prefix, author_type, exc):
    """Append one failure row to the error CSV and echo it to stdout.

    The header row is written only when the CSV does not exist yet.
    """
    write_header = not error_csv_path.exists()
    error_frame = pd.DataFrame(
        [
            {
                "prefix": prefix,
                "pr_id": pr_id,
                "author_type": author_type,
                "error": str(exc),
            }
        ]
    )
    error_frame.to_csv(error_csv_path, mode="a", header=write_header, index=False)
    print(f"[error] Logged PR {pr_id} ({prefix}/{author_type}) to {error_csv_path}: {exc}")


ai_ids = list(pr_ids_from_commits("ai", limit=limit))
human_ids = list(pr_ids_from_commits("human", limit=limit))
# Avoid the misleading "first None each" when no limit is set.
limit_note = "all PRs" if limit is None else f"first {limit} each"
print(f"Processing {len(ai_ids)} AI PRs and {len(human_ids)} human PRs ({limit_note}).")


def _process_pr_batch(label, prefix, author_type, pr_ids, pr_core):
    """Classify every not-yet-processed PR in ``pr_ids`` and checkpoint.

    Replaces the two copy-pasted AI/human loops. PRs whose id is already in
    ``processed_ids`` are skipped. A failure is logged through ``log_error``
    and the loop moves on. The checkpoint is rewritten only after a new
    record was actually appended.

    Args:
        label: Cohort name used in progress messages ("AI" / "human").
        prefix: Dataset prefix passed to ``analyze_pr``.
        author_type: Author label stored in each result record.
        pr_ids: PR ids to process, in order.
        pr_core: Frame indexed by ``pr_id`` for this cohort.
    """
    total = len(pr_ids)
    for idx, pr_id in enumerate(pr_ids, 1):
        if pr_id in processed_ids:
            print(f"[skip] {label} PR {idx}/{total} already processed: {pr_id}")
            continue

        print(f"Processing {label} PR {idx}/{total}: {pr_id}")

        try:
            openai_res = analyze_pr(prefix, pr_id, author_type, pr_core)
        except Exception as exc:
            log_error(pr_id, prefix=prefix, author_type=author_type, exc=exc)
            continue

        records_openai.append(openai_res)
        # Save after every successful PR so an interrupted run can resume.
        save_partial(records_openai, out_path_openai)


_process_pr_batch("AI", "ai", "ai_agent", ai_ids, ai_core)
_process_pr_batch("human", "human", "human", human_ids, human_core)


# ============================
# Final save
# ============================
df_open_ai = pd.DataFrame(records_openai)
if records_openai:
    df_open_ai.to_parquet(out_path_openai, index=False)
    print(f"Saved FINAL OPENAI {len(df_open_ai)} rows to {out_path_openai}")
else:
    # Writing an empty, column-less frame would break the next resume, which
    # reads the "pr_id" column from this file.
    print(f"No records to save; {out_path_openai} left untouched")
print(f"Errored PRs (if any) logged to {error_csv_path}")

[resume] Loaded 405 existing records
Processing 324 AI PRs and 83 human PRs (first None each).
[skip] AI PR 1/324 already processed: 2766896431
[skip] AI PR 2/324 already processed: 2843312341
[skip] AI PR 3/324 already processed: 2843334531
[skip] AI PR 4/324 already processed: 2855302194
[skip] AI PR 5/324 already processed: 2859989652
[skip] AI PR 6/324 already processed: 2887787232
[skip] AI PR 7/324 already processed: 2920983723
[skip] AI PR 8/324 already processed: 2926188053
[skip] AI PR 9/324 already processed: 2927184629
[skip] AI PR 10/324 already processed: 2973653748
[skip] AI PR 11/324 already processed: 2991070962
[skip] AI PR 12/324 already processed: 3006445782
[skip] AI PR 13/324 already processed: 3006507938
[skip] AI PR 14/324 already processed: 3006534682
[skip] AI PR 15/324 already processed: 3006544045
[skip] AI PR 16/324 already processed: 3006562482
[skip] AI PR 17/324 already processed: 3033566586
[skip] AI PR 18/324 already processed: 3033886992
[skip] AI PR 1