In [1]:
import json
from pathlib import Path
from typing import Dict, Iterable, List, Optional
import pandas as pd
from openai import OpenAI
from google import genai
from google.genai import types
import os
from dotenv import load_dotenv
load_dotenv() 


True

In [2]:
def run_llm(system_pront: str, prompt: str, type: str = "ollama", model: str = "gemma3:27b") -> str:
    """Send a system + user prompt to the selected LLM backend and return its text.

    Args:
        system_pront: System-level instructions for the model.
        prompt: User prompt to answer.
        type: Backend selector. ``"ollama"`` talks to the OpenAI-compatible
            endpoint at ``http://localhost:11434/v1``; ``"gemini"`` uses the
            google-genai client with a JSON response schema.
        model: Model name for the ``"ollama"`` backend. It is ignored for
            ``"gemini"``, which always uses ``gemini-3-pro-preview`` (the
            request config below sets a thinking level for that model).

    Returns:
        The model's reply text. For ``"ollama"`` the text is stripped of
        surrounding whitespace; for ``"gemini"`` it is returned as received.

    Raises:
        ValueError: If ``type`` is not ``"ollama"`` or ``"gemini"``.
        RuntimeError: If the backend answered without any text content.
    """
    if type == "ollama":
        print("Running OLLAMA, model:", model)

        # Build the client only when this backend is actually used.
        llm_client = OpenAI(base_url="http://localhost:11434/v1", api_key="")

        messages = [
            {"role": "system", "content": system_pront},
            {"role": "user", "content": prompt},
        ]

        r = llm_client.chat.completions.create(
            model=model,
            messages=messages,
        )
        content = r.choices[0].message.content
        if content is None:
            # Fail with a clear message instead of an AttributeError on None.strip().
            raise RuntimeError(f"OLLAMA model {model!r} returned no text content")
        result = content.strip()
        print("OLLAMA response:", result)
        return result

    if type == "gemini":
        # The caller-supplied model is intentionally overridden for this backend.
        model = "gemini-3-pro-preview"
        print("Running GEMINI, model:", model)

        GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
        client = genai.Client(api_key=GEMINI_API_KEY)

        # Structured-output schema: the reply must be a JSON object with all of these keys.
        schema = {
            "type": "object",
            "properties": {
                "validation_present": {
                    "type": "boolean"
                },
                "evidence_sources": {
                    "type": "array",
                    "items": {
                        "type": "string",
                        "enum": ["pipeline", "description", "comments"]
                    }
                },
                "validation_type": {
                    "type": "string",
                    "enum": [
                        "benchmark",
                        "profiling",
                        "load/canary",
                        "unit-only",
                        "unspecified",
                        "none"
                    ]
                },
                "validation_description": {
                    "type": "string"
                },
                "pipeline_signal": {
                    "type": "string"
                },
                "description_signal": {
                    "type": "string"
                },
                "comment_signal": {
                    "type": "string"
                }
            },
            "required": [
                "validation_present",
                "evidence_sources",
                "validation_type",
                "validation_description",
                "pipeline_signal",
                "description_signal",
                "comment_signal"
            ]
        }

        config = types.GenerateContentConfig(
            # Pass the system prompt as a system instruction rather than as
            # ordinary content, so it is not treated as part of the user input.
            system_instruction=system_pront,
            temperature=0.0,
            response_mime_type="application/json",
            response_schema=schema,
            thinking_config=types.ThinkingConfig(
                thinking_level=types.ThinkingLevel.HIGH
            )
        )

        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config=config,
        )
        if response.text is None:
            # Keep the declared ``str`` return type honest and surface the failure.
            raise RuntimeError(f"GEMINI model {model!r} returned no text content")
        return response.text

    # Previously an unknown backend fell through and returned None.
    raise ValueError(f"Unsupported LLM type {type!r}; expected 'ollama' or 'gemini'")
    

In [3]:
def find_datasets_dir(start: Optional[Path] = None) -> Path:
    """Locate the nearest ``datasets`` directory at or above ``start``.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.

    Returns:
        Path to the first ``datasets`` directory found while walking from
        ``start`` up through its parents.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents
            contains a ``datasets`` directory.
    """
    start = start or Path.cwd()
    for ancestor in (start, *start.parents):
        candidate = ancestor / "datasets"
        # Require a directory: a stray regular file named "datasets" must not match.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'datasets' directory from {start}")


# Resolve the dataset location once; the project root is the directory that contains it.
DATASETS_DIR = find_datasets_dir()
PROJECT_ROOT = DATASETS_DIR.parents[0]

In [4]:
def extract_json(text: str) -> Dict:
    """Best-effort extraction of the first JSON object embedded in model output.

    Each ``{`` in ``text`` is tried in order as the start of a JSON object and
    the first one that decodes is returned. Text before or after the object
    (code fences, explanations, stray braces) is ignored.

    Args:
        text: Raw model output. Non-string values (e.g. ``None``) are tolerated.

    Returns:
        The decoded JSON object, or ``{}`` when no object can be decoded.
    """
    if not isinstance(text, str):
        return {}

    decoder = json.JSONDecoder()
    pos = text.find("{")
    while pos != -1:
        try:
            # raw_decode stops at the end of the object, so trailing text is harmless.
            value, _ = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            pos = text.find("{", pos + 1)
            continue
        return value
    return {}

def truncate(text: str, limit: int = 10000) -> str:
    """Cap ``text`` at ``limit`` characters, appending a marker when it was cut."""
    if len(text) > limit:
        return text[:limit] + "...[truncated]"
    return text

In [5]:
def load_pr_core(prefix: str) -> pd.DataFrame:
    """Load the ``{prefix}_pr_commits`` table with one row per PR, indexed by ``pr_id``.

    When a ``pr_id`` appears on several rows, only its first row is kept.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commits = pd.read_parquet(commits_path)
    one_row_per_pr = commits.drop_duplicates("pr_id")
    return one_row_per_pr.set_index("pr_id")


def collect_comments(prefix: str, pr_id: int) -> List[str]:
    """Return the non-null comment bodies for ``pr_id``.

    Issue comments come first, followed by review comments, each in the
    order they appear in their parquet table.
    """
    table_dir = DATASETS_DIR / f"{prefix}_pr"
    tables = (
        pd.read_parquet(table_dir / f"{prefix}_pr_issue_comments.parquet"),
        pd.read_parquet(table_dir / f"{prefix}_pr_review_comments.parquet"),
    )
    return [
        body
        for table in tables
        for body in table.loc[table["pr_id"] == pr_id, "body"].dropna().tolist()
    ]


def collect_pipeline_names(prefix: str, pr_id: int) -> List[str]:
    """Return the sorted, de-duplicated workflow names recorded for ``pr_id``."""
    runs_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_workflow_runs.parquet"
    runs = pd.read_parquet(runs_path)
    names = runs.loc[runs["pr_id"] == pr_id, "workflow_name"].dropna().unique().tolist()
    names.sort()
    return names


In [6]:
def analyze_pr(
    prefix: str, pr_id: int, author_type: str, pr_core: pd.DataFrame
) -> Dict:
    """Classify the performance-validation evidence of a single PR.

    Gathers the PR's workflow names, description and comments, asks the LLM
    (Gemini backend) to classify them, and returns a flat record.

    Args:
        prefix: Dataset prefix used to locate the parquet tables.
        pr_id: Identifier of the PR; must be present in ``pr_core``'s index.
        author_type: Label copied verbatim into the returned record.
        pr_core: Frame indexed by ``pr_id`` holding the PR metadata columns
            read below (``pr_description``, ``repo_owner``, ``repo_name``,
            ``pr_number``, ``pr_title``).

    Returns:
        Dict with the PR metadata plus ``validation_present``,
        ``evidence_sources``, ``validation_type``, ``validation_description``,
        ``pipeline_signal``, ``description_signal`` and ``comment_signal``.
        When the PR has no pipelines, description or comments, a fixed
        "No validation evidence" record is returned without calling the LLM.
        Fields the model did not provide are ``None``.
    """
    row = pr_core.loc[pr_id]
    pipeline_names = collect_pipeline_names(prefix, pr_id)
    comments = collect_comments(prefix, pr_id)

    # A missing description may be None or NaN; NaN is truthy, so `x or ""`
    # would not catch it and `.strip()` would fail.
    raw_description = row.get("pr_description")
    description = raw_description.strip() if isinstance(raw_description, str) else ""

    # Metadata shared by the short-circuit record and the LLM-backed record.
    record = {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": f"{row.get('repo_owner')}/{row.get('repo_name')}",
        "pr_number": row.get("pr_number"),
        "pr_title": row.get("pr_title"),
        "pipeline_names": pipeline_names,
    }

    # Nothing to classify: skip the LLM call entirely.
    if not pipeline_names and not description and not comments:
        return {
            **record,
            "validation_present": False,
            "evidence_sources": [],
            "validation_type": "none",
            "validation_description": "No validation evidence",
            "pipeline_signal": "",
            "description_signal": "",
            "comment_signal": "",
        }

    SYSTEM_PROMPT_TEMPLATE = """You classify evidence of performance validation for a PR.
    Return compact JSON only with keys:
    validation_present (bool), evidence_sources (list of "pipeline","description","comments"),
    validation_type (benchmark,profiling,load/canary,unit-only,unspecified,none),
    validation_description (short text),
    pipeline_signal (short), description_signal (short), comment_signal (short).

    Rules:
    - Pipelines count only if workflow names imply perf/benchmark/load/canary; note when they are unit/lint-only.
    - Description/comments count if they mention perf benchmarks, profiling, latency/throughput numbers,
    load/canary rollout, A/B tests, perf tools, or explicit "no perf validation".
    - If nothing indicates perf validation, set validation_present=false,
    validation_type="none", evidence_sources=[],
    validation_description="No validation evidence".
    """

    PROMPT_TEMPLATE = """
    You are given information about a GitHub Pull Request (PR).
    Using the provided PIPELINES, DESCRIPTION, and COMMENTS, determine if there is evidence of performance validation for the PR.
    Input (TOONS format):

    PIPELINES:
    {pipeline_names}

    DESCRIPTION:
    {description}

    COMMENTS:
    {comments}

    JSON:
    """

    if pipeline_names:
        pipelines_block = "\n  - " + "\n  - ".join(pipeline_names)
    else:
        pipelines_block = "  None"

    if description:
        description_block = "  " + truncate(description).replace("\n", "\n  ")
    else:
        description_block = "  None"

    if comments:
        # Join, truncate the combined text, then split back into bullet items.
        comments_block = "  - " + "\n  - ".join(
            truncate(" | ".join(comments)).split(" | ")
        )
    else:
        comments_block = "  None"

    prompt = PROMPT_TEMPLATE.format(
        pipeline_names=pipelines_block,
        description=description_block,
        comments=comments_block,
    )

    raw = run_llm(SYSTEM_PROMPT_TEMPLATE, prompt, type="gemini")
    parsed = extract_json(raw)

    # Always store a list so the column has one consistent type when saved.
    evidence_sources = parsed.get("evidence_sources") or []
    if isinstance(evidence_sources, str):
        evidence_sources = [evidence_sources]
    elif isinstance(evidence_sources, (tuple, list)):
        evidence_sources = list(evidence_sources)
    else:
        evidence_sources = []

    return {
        **record,
        "validation_present": parsed.get("validation_present"),
        "evidence_sources": evidence_sources,
        "validation_type": parsed.get("validation_type"),
        "validation_description": parsed.get("validation_description"),
        "pipeline_signal": parsed.get("pipeline_signal"),
        "description_signal": parsed.get("description_signal"),
        "comment_signal": parsed.get("comment_signal"),
    }

In [7]:
def pr_ids_from_commits(prefix: str, limit: Optional[int] = None) -> Iterable[int]:
    """Return the distinct PR ids of the ``{prefix}`` commits table in ascending order.

    When ``limit`` is given, only the first ``limit`` ids are returned.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commits = pd.read_parquet(commits_path)
    unique_ids = commits["pr_id"].dropna().astype(int).unique().tolist()
    unique_ids.sort()
    if limit is None:
        return unique_ids
    return unique_ids[:limit]


In [8]:
# Where the classified records are written.
out_dir = PROJECT_ROOT / "RQ3"
out_dir.mkdir(exist_ok=True, parents=True)
out_path = out_dir / "rq3_validation_evidence.parquet"

# Number of PRs analysed per cohort.
limit = 10


def _error_record(pr_id: int, author_type: str, exc: Exception) -> Dict:
    """Build the placeholder record stored when analysing a PR raised ``exc``."""
    return {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": "",
        "pr_number": None,
        "pr_title": "",
        "pipeline_names": [],
        "validation_present": None,
        "evidence_sources": [],
        "validation_type": "error",
        "validation_description": f"error: {exc}",
        "pipeline_signal": "",
        "description_signal": "",
        "comment_signal": "",
    }


def _analyze_cohort(
    prefix: str, author_type: str, label: str, pr_core: pd.DataFrame, pr_ids: List[int]
) -> List[Dict]:
    """Analyse every PR in ``pr_ids``, turning per-PR failures into error records."""
    cohort_records = []
    for idx, pr_id in enumerate(pr_ids, 1):
        print(f"Processing {label} PR {idx}/{len(pr_ids)}: {pr_id}")
        try:
            cohort_records.append(analyze_pr(prefix, pr_id, author_type, pr_core))
        except Exception as exc:
            # One failing PR must not abort the whole run; keep the error in the data.
            cohort_records.append(_error_record(pr_id, author_type, exc))
    return cohort_records


ai_core = load_pr_core("ai")
human_core = load_pr_core("human")

ai_ids = list(pr_ids_from_commits("ai", limit=limit))
human_ids = list(pr_ids_from_commits("human", limit=limit))
print(f"Processing {len(ai_ids)} AI PRs and {len(human_ids)} human PRs (first {limit} each).")

records = []
records.extend(_analyze_cohort("ai", "ai_agent", "AI", ai_core, ai_ids))
records.extend(_analyze_cohort("human", "human", "human", human_core, human_ids))

df = pd.DataFrame(records)
df.to_parquet(out_path, index=False)
print(f"Saved {len(df)} rows to {out_path}")




Processing 10 AI PRs and 10 human PRs (first 10 each).
Processing AI PR 1/10: 2766896431
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 2/10: 2843312341
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 3/10: 2843334531
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 4/10: 2855302194
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 5/10: 2859989652
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 6/10: 2887787232
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 7/10: 2920983723
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 8/10: 2926188053
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 9/10: 2927184629
Running GEMINI, model: gemini-3-pro-preview
Processing AI PR 10/10: 2965102818
Running GEMINI, model: gemini-3-pro-preview
Processing human PR 1/10: 2260441374
Running GEMINI, model: gemini-3-pro-preview
Processing human PR 2/10: 2260678480
Running GEMINI, model: gemini-3-pro-preview
Pr

In [10]:
# Lift pandas' display limits so every row, column and full cell text is shown.
for option_name in ("display.max_colwidth", "display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)
pd.set_option("display.expand_frame_repr", True)

# Reload the saved results from disk and preview the first 20 rows.
data_temp = pd.read_parquet(out_path)
data_temp.head(20)

Unnamed: 0,pr_id,author_type,repo,pr_number,pr_title,pipeline_names,validation_present,evidence_sources,validation_type,validation_description,pipeline_signal,description_signal,comment_signal
0,2766896431,ai_agent,onlook-dev/onlook,982,Replace motion library with Tailwind transitions in EditPanel,[],False,[],none,"The PR description claims performance improvements by removing a dependency, but the testing section only confirms compilation, linting, and functional correctness of transitions without providing performance metrics or profiling.",,Mentions performance improvement as a goal but testing is limited to build/lint and visual verification.,Bot comment only
1,2843312341,ai_agent,promptfoo/promptfoo,3046,perf: optimize cache and token handling,"[CI, Validate PR Title]",False,[],none,"The PR describes performance optimizations and high-load issues but only provides evidence of unit, integration, and manual functional testing. No benchmarks or load tests are reported.","CI, Validate PR Title (generic)","Mentions performance optimizations but testing is limited to unit, integration, and manual functional checks",Bot interactions and unit test generation status
2,2843334531,ai_agent,promptfoo/promptfoo,3047,perf: optimize cache and token handling,"[CI, Validate PR Title]",False,[],none,"The PR description claims performance optimizations (cache, token counting) but only lists unit, integration, and manual functional testing; no benchmarks or load tests are provided to verify the performance improvements.","CI, Validate PR Title (no perf workflows)",Mentions performance optimizations but lists only unit/integration/manual functional tests,Bot interactions only
3,2855302194,ai_agent,pdfme/pdfme,711,Optimize Font Loading Performance in Tests,[Unit Testing],True,[description],benchmark,"Description provides explicit 'Before' and 'After' timing comparisons for specific tests and the full suite (e.g., 10.7s to 4.2s).",Unit Testing,Performance Improvements section lists specific timings (Before/After) for tests.,Automated bot comments only
4,2859989652,ai_agent,wolfSSL/wolfssh,779,Update SFTP status callback to output once per second,"[Cppcheck Test, Kyber Tests, OS Check Test, Single-thread Check Test, Windows Build Test, Zephyr tests, wolfSSH SCP Test, wolfSSHd Test]",False,[],none,No validation evidence,"Static analysis and functional tests (Cppcheck, wolfSSH)",Verified using cppcheck; mentions reducing status update frequency,"User mentions fixing a bottleneck and requests performance difference, but no results are provided"
5,2887787232,ai_agent,stack-auth/stack-auth,495,[DEVIN: Ryan] Optimize create user query to not use interactive transaction,"[Dev Environment Test, Docker Build and Push, Docker Test, Ensure Prisma migrations are in sync with the schema, Lint & build, Preview Docs, Run setup tests, Runs E2E API Tests, TOC Generator, all-good: Did all the other checks pass?]",False,[],none,"The PR description claims the changes make the implementation more efficient by avoiding interactive transaction overhead, but no specific performance metrics, benchmarks, or profiling results are provided to validate this claim.","Standard CI/CD (tests, lint, build, E2E) only",Claims optimization and efficiency but lacks metrics,No performance discussions
6,2920983723,ai_agent,Cap-go/capgo,1066,perf: optimize MAU loading mechanism for better performance with large datasets,"[Run tests, autofix.ci]",True,[description],load/canary,Manual testing performed with a large number of apps to ensure performance remains acceptable with large datasets.,"Run tests, autofix.ci",Tested with a large number of apps to ensure performance is acceptable,
7,2926188053,ai_agent,onlook-dev/onlook,1630,Migrate from chokidar to @parcel/watcher,[],False,[],none,No validation evidence,,Claims improved performance via library change but provides no metrics or test results,
8,2927184629,ai_agent,onlook-dev/onlook,1634,Add 1-second delay between restore changes and refresh webviews,[],False,[],none,No validation evidence,,"Mentions adding a 1-second delay for functional correctness, but no performance validation.",Bot comment only
9,2965102818,ai_agent,Cap-go/capgo,1108,perf: Performance analysis for default channel migration,"[Run tests, autofix.ci]",True,[description],profiling,"The PR description provides a detailed performance analysis identifying specific scalability issues, inefficient subqueries, and sequential operations in migration scripts and APIs.",Run tests,Performance Analysis for Default Channel Migration,
