In [22]:
import json
from pathlib import Path
from typing import Dict, Iterable, List, Optional
import pandas as pd
from openai import OpenAI
from google import genai
from google.genai import types
import os
from dotenv import load_dotenv
load_dotenv() 


True

In [23]:
def run_llm(system_pront: str, prompt: str, type: str = "ollama", model: str = "gemma3:27b") -> str:
    """Send a system prompt and a user prompt to an LLM backend and return its text reply.

    Args:
        system_pront: System instruction text. (The misspelled parameter name is
            kept so existing keyword callers continue to work.)
        prompt: User prompt text.
        type: Backend selector: "ollama" for the local OpenAI-compatible server,
            or "gemini" for the Google GenAI API.
        model: Model identifier. With the "gemini" backend, a name that does not
            start with "gemini" (such as the Ollama default) is replaced by the
            default Gemini model, so callers that only pass ``type="gemini"``
            keep working while an explicit Gemini model name is now honoured.

    Returns:
        The reply text. The Ollama reply is whitespace-stripped; the Gemini reply
        is returned as received (JSON text constrained by the response schema).
        An empty string is returned when the backend yields no text.

    Raises:
        ValueError: If ``type`` is neither "ollama" nor "gemini".
    """
    # Validate the backend first so a typo fails loudly instead of returning None.
    if type not in ("ollama", "gemini"):
        raise ValueError(
            f"Unknown LLM backend type: {type!r} (expected 'ollama' or 'gemini')"
        )

    if type == "ollama":
        print("Running OLLAMA, model:", model)

        # The client is only needed for this backend, so build it here.
        LLM_BASE_URL = "http://localhost:11434/v1"
        LLM_API_KEY = ""
        llm_client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)

        messages = [
            {"role": "system", "content": system_pront},
            {"role": "user", "content": prompt},
        ]

        r = llm_client.chat.completions.create(
            model=model,
            messages=messages,
        )
        # `content` may be None (e.g. an empty completion); treat that as "".
        result = (r.choices[0].message.content or "").strip()
        print("OLLAMA response:", result)
        return result

    # ---- Gemini backend ----
    DEFAULT_GEMINI_MODEL = "gemini-3-pro-preview"
    if not model.startswith("gemini"):
        # The function-level default is an Ollama model; swap in a Gemini one.
        model = DEFAULT_GEMINI_MODEL
    print("Running GEMINI, model:", model)

    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
    client = genai.Client(api_key=GEMINI_API_KEY)

    # JSON schema the reply must follow; mirrors the keys requested in the prompt.
    schema = {
        "type": "object",
        "properties": {
            "validation_present": {"type": "boolean"},
            "evidence_sources": {
                "type": "array",
                "items": {
                    "type": "string",
                    "enum": ["pipeline", "description", "comments"],
                },
            },
            "validation_type": {
                "type": "string",
                "enum": [
                    "benchmark",
                    "profiling",
                    "load/canary",
                    "unit-only",
                    "unspecified",
                    "none",
                ],
            },
            "validation_description": {"type": "string"},
            "pipeline_signal": {"type": "string"},
            "description_signal": {"type": "string"},
            "comment_signal": {"type": "string"},
        },
        "required": [
            "validation_present",
            "evidence_sources",
            "validation_type",
            "validation_description",
            "pipeline_signal",
            "description_signal",
            "comment_signal",
        ],
    }

    config = types.GenerateContentConfig(
        # Pass the system prompt through the dedicated field rather than mixing
        # it into the user content.
        system_instruction=system_pront,
        temperature=0.0,
        response_mime_type="application/json",
        response_schema=schema,
        thinking_config=types.ThinkingConfig(
            thinking_level=types.ThinkingLevel.HIGH
        ),
    )

    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=config,
    )
    # `response.text` can be None when no text part is returned.
    return response.text or ""
    

In [24]:
def find_datasets_dir(start: Optional[Path] = None) -> Path:
    """Locate the nearest ``datasets`` directory at or above ``start``.

    Args:
        start: Directory to begin searching from; defaults to the current
            working directory.

    Returns:
        Path to the first ``datasets`` directory found while walking from
        ``start`` up through its parents.

    Raises:
        FileNotFoundError: If no ``datasets`` directory exists on that path.
    """
    start = start or Path.cwd()
    for path in (start, *start.parents):
        candidate = path / "datasets"
        # Require a directory: a stray regular file named "datasets" must not
        # be mistaken for the data folder.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'datasets' directory from {start}")


# Resolved once when this cell runs, searching upward from the current working
# directory; the loader functions in later cells read parquet files under it.
DATASETS_DIR = find_datasets_dir()
# Directory that contains `datasets`; result files are written beneath it.
PROJECT_ROOT = DATASETS_DIR.parent

In [25]:
def extract_json(text: str) -> Dict:
    """Best-effort extraction of a JSON object from model output.

    The span from the first ``{`` to the last ``}`` is tried first. If that does
    not parse (for example because of a stray brace in surrounding prose), every
    ``{`` position is tried in turn and the first one that decodes to a JSON
    object wins.

    Args:
        text: Raw model output. ``None`` or any non-string value is treated as
            empty.

    Returns:
        The decoded object as a dict, or ``{}`` when no JSON object is found.
    """
    if not isinstance(text, str) or not text:
        return {}

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        return {}

    # Fast path: the widest brace-delimited span is usually the whole object.
    try:
        parsed = json.loads(text[start : end + 1])
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fallback: decode from each opening brace; raw_decode ignores trailing text.
    decoder = json.JSONDecoder()
    pos = start
    while pos != -1:
        try:
            candidate, _ = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        pos = text.find("{", pos + 1)
    return {}

def truncate(text: str, limit: int = 10000) -> str:
    """Cap ``text`` at ``limit`` characters, appending a marker when it is cut."""
    if len(text) > limit:
        return text[:limit] + "...[truncated]"
    return text

In [26]:
def load_pr_core(prefix: str) -> pd.DataFrame:
    """Read the commits table for ``prefix`` and return one row per PR.

    The table is read from ``{prefix}_pr/{prefix}_pr_commits.parquet`` under
    ``DATASETS_DIR``, de-duplicated on ``pr_id``, and indexed by ``pr_id``.
    """
    commits_file = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commit_rows = pd.read_parquet(commits_file)
    one_row_per_pr = commit_rows.drop_duplicates("pr_id")
    return one_row_per_pr.set_index("pr_id")


# Parquet tables already read from disk, keyed by file path. The per-PR helpers
# below are called once per PR, so without this every call would re-read the
# same whole files.
_PR_TABLE_CACHE: Dict[str, pd.DataFrame] = {}


def _load_pr_table(prefix: str, table: str) -> pd.DataFrame:
    """Return ``{prefix}_pr/{prefix}_pr_{table}.parquet``, reading it at most once."""
    path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_{table}.parquet"
    key = str(path)
    if key not in _PR_TABLE_CACHE:
        _PR_TABLE_CACHE[key] = pd.read_parquet(path)
    return _PR_TABLE_CACHE[key]


def collect_comments(prefix: str, pr_id: int) -> List[str]:
    """Return the non-null comment bodies for one PR.

    Issue comments come first, followed by review comments, each in table order.

    Args:
        prefix: Dataset prefix used to build the parquet file names.
        pr_id: Value matched against the ``pr_id`` column.

    Returns:
        List of ``body`` values for the PR (empty when there are none).
    """
    texts: List[str] = []
    for table in ("issue_comments", "review_comments"):
        df = _load_pr_table(prefix, table)
        subset = df[df["pr_id"] == pr_id]
        texts.extend(subset["body"].dropna().tolist())
    return texts


def collect_pipeline_names(prefix: str, pr_id: int) -> List[str]:
    """Return the sorted, de-duplicated workflow names recorded for one PR.

    Args:
        prefix: Dataset prefix used to build the parquet file name.
        pr_id: Value matched against the ``pr_id`` column.

    Returns:
        Sorted list of distinct non-null ``workflow_name`` values.
    """
    workflows = _load_pr_table(prefix, "workflow_runs")
    subset = workflows[workflows["pr_id"] == pr_id]
    return sorted(subset["workflow_name"].dropna().unique().tolist())


In [27]:
def _pr_identity(row, pr_id: int, author_type: str, pipeline_names: List[str]) -> Dict:
    """Build the identifying fields shared by every result record."""
    return {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": f"{row.get('repo_owner')}/{row.get('repo_name')}",
        "pr_number": row.get("pr_number"),
        "pr_title": row.get("pr_title"),
        "pipeline_names": pipeline_names,
    }


def analyze_pr(
    prefix: str, pr_id: int, author_type: str, pr_core: pd.DataFrame
) -> Dict:
    """Classify the performance-validation evidence for a single PR.

    Gathers the PR's workflow names, description and comments, asks the LLM
    (Gemini backend) to classify them, and merges the parsed answer with the
    PR's identifying fields.

    Args:
        prefix: Dataset prefix passed to the collector functions.
        pr_id: PR identifier; must be present in ``pr_core``'s index.
        author_type: Label copied into the result record.
        pr_core: Frame indexed by ``pr_id`` (as produced by ``load_pr_core``).

    Returns:
        A flat dict with the PR identity fields plus the classification keys.
        When the PR has no pipelines, description or comments, a fixed
        "No validation evidence" record is returned without calling the LLM.
        When the model reply cannot be parsed, the classification values are
        ``None`` and ``evidence_sources`` is ``[]``.

    Raises:
        KeyError: If ``pr_id`` is not in ``pr_core``.
    """
    row = pr_core.loc[pr_id]
    pipeline_names = collect_pipeline_names(prefix, pr_id)
    comments = collect_comments(prefix, pr_id)

    # A missing description may arrive as None or as a float NaN; NaN is truthy,
    # so `value or ""` would not guard the .strip() call.
    raw_description = row.get("pr_description")
    description = raw_description.strip() if isinstance(raw_description, str) else ""

    identity = _pr_identity(row, pr_id, author_type, pipeline_names)

    # Nothing to classify: skip the LLM call entirely.
    if not pipeline_names and not description and not comments:
        return {
            **identity,
            "validation_present": False,
            "evidence_sources": [],
            "validation_type": "none",
            "validation_description": "No validation evidence",
            "pipeline_signal": "",
            "description_signal": "",
            "comment_signal": "",
        }

    SYSTEM_PROMPT_TEMPLATE = """You classify evidence of performance validation for a PR.
    Return compact JSON only with keys:
    validation_present (bool), evidence_sources (list of "pipeline","description","comments"),
    validation_type (benchmark,profiling,load/canary,unit-only,unspecified,none),
    validation_description (short text),
    pipeline_signal (short), description_signal (short), comment_signal (short).

    Rules:
    - Pipelines count only if workflow names imply perf/benchmark/load/canary; note when they are unit/lint-only.
    - Description/comments count if they mention perf benchmarks, profiling, latency/throughput numbers,
    load/canary rollout, A/B tests, perf tools, or explicit "no perf validation".
    - If nothing indicates perf validation, set validation_present=false,
    validation_type="none", evidence_sources=[],
    validation_description="No validation evidence".
    """

    PROMPT_TEMPLATE = """
    You are given information about a GitHub Pull Request (PR).
    Using the provided PIPELINES, DESCRIPTION, and COMMENTS, determine if there is evidence of performance validation for the PR.
    Input (TOONS format):

    PIPELINES:
    {pipeline_names}

    DESCRIPTION:
    {description}

    COMMENTS:
    {comments}

    JSON:
    """

    if pipeline_names:
        pipelines_block = "\n  - " + "\n  - ".join(pipeline_names)
    else:
        pipelines_block = "  None"

    if description:
        description_block = "  " + truncate(description).replace("\n", "\n  ")
    else:
        description_block = "  None"

    if comments:
        # Join with the bullet separator and truncate the joined text directly.
        # Joining on " | " and splitting again would break up any comment that
        # itself contains " | " (e.g. markdown tables).
        comments_block = "  - " + truncate("\n  - ".join(comments))
    else:
        comments_block = "  None"

    prompt = PROMPT_TEMPLATE.format(
        pipeline_names=pipelines_block,
        description=description_block,
        comments=comments_block,
    )

    raw = run_llm(SYSTEM_PROMPT_TEMPLATE, prompt, type="gemini")
    # Guard against a backend that returns no text at all.
    parsed = extract_json(raw or "")

    evidence_sources = parsed.get("evidence_sources") or []
    if isinstance(evidence_sources, (tuple, list)):
        evidence_sources = list(evidence_sources)

    return {
        **identity,
        "validation_present": parsed.get("validation_present"),
        "evidence_sources": evidence_sources,
        "validation_type": parsed.get("validation_type"),
        "validation_description": parsed.get("validation_description"),
        "pipeline_signal": parsed.get("pipeline_signal"),
        "description_signal": parsed.get("description_signal"),
        "comment_signal": parsed.get("comment_signal"),
    }

In [28]:
def pr_ids_from_commits(prefix: str, limit: Optional[int] = None) -> Iterable[int]:
    """List the distinct PR ids in the commits table, in ascending order.

    Args:
        prefix: Dataset prefix used to build the parquet file name.
        limit: If given, only the first ``limit`` ids are returned.

    Returns:
        Sorted list of integer PR ids.
    """
    commits_file = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commit_rows = pd.read_parquet(commits_file)
    unique_ids = commit_rows["pr_id"].dropna().astype(int).unique().tolist()
    unique_ids.sort()
    if limit is None:
        return unique_ids
    return unique_ids[:limit]


In [29]:

out_dir = PROJECT_ROOT / "RQ3"
out_dir.mkdir(exist_ok=True, parents=True)
out_path = out_dir / "rq3_validation_evidence.parquet"

# Number of accumulated records between checkpoint writes.
SAVE_EVERY = 20

records = []
ai_core = load_pr_core("ai")
human_core = load_pr_core("human")

# Maximum number of PRs to process per cohort; None processes all of them.
limit = None


def save_partial(records, out_path):
    """Checkpoint the records collected so far to ``out_path``.

    The frame is written to a sibling ``.tmp`` file and then moved into place,
    so an interruption during the write does not leave a truncated parquet file
    at ``out_path``.
    """
    out_path = Path(out_path)
    df_tmp = pd.DataFrame(records)
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    df_tmp.to_parquet(tmp_path, index=False)
    tmp_path.replace(out_path)
    print(f"[partial save] Saved {len(df_tmp)} rows to {out_path}")


def _error_record(pr_id, author_type, exc):
    """Build the placeholder row stored when analysing a PR fails."""
    return {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": "",
        "pr_number": None,
        "pr_title": "",
        "pipeline_names": [],
        "validation_present": None,
        "evidence_sources": [],
        "validation_type": "error",
        "validation_description": f"error: {exc}",
        "pipeline_signal": "",
        "description_signal": "",
        "comment_signal": "",
    }


def _process_cohort(prefix, author_type, label, pr_ids, pr_core, records, out_path):
    """Analyse every PR in ``pr_ids``, appending one row per PR to ``records``."""
    for idx, pr_id in enumerate(pr_ids, 1):
        print(f"Processing {label} PR {idx}/{len(pr_ids)}: {pr_id}")
        try:
            records.append(analyze_pr(prefix, pr_id, author_type, pr_core))
        except Exception as exc:
            # Deliberate best-effort: one failing PR must not abort the whole
            # run, but surface the failure so systematic errors are noticed.
            print(f"  failed: {exc}")
            records.append(_error_record(pr_id, author_type, exc))

        # ---- periodic checkpoint ----
        if len(records) % SAVE_EVERY == 0:
            save_partial(records, out_path)


ai_ids = list(pr_ids_from_commits("ai", limit=limit))
human_ids = list(pr_ids_from_commits("human", limit=limit))
scope = "all PRs" if limit is None else f"first {limit} each"
print(f"Processing {len(ai_ids)} AI PRs and {len(human_ids)} human PRs ({scope}).")

# ============================
# Process AI PRs, then human PRs
# ============================
_process_cohort("ai", "ai_agent", "AI", ai_ids, ai_core, records, out_path)
_process_cohort("human", "human", "human", human_ids, human_core, records, out_path)

# ============================
# Final save
# ============================
df = pd.DataFrame(records)
df.to_parquet(out_path, index=False)
print(f"Saved FINAL {len(df)} rows to {out_path}")

Processing 324 AI PRs and 83 human PRs (first None each).
Running GEMINI, model: gemini-3-pro-preview


ClientError: 403 PERMISSION_DENIED. {'error': {'code': 403, 'message': 'The caller does not have permission', 'status': 'PERMISSION_DENIED'}}

In [30]:
# Lift pandas' display limits so long text columns are shown in full.
for unlimited_option in (
    "display.max_colwidth",
    "display.max_rows",
    "display.max_columns",
):
    pd.set_option(unlimited_option, None)
pd.set_option("display.expand_frame_repr", True)

# Re-read the saved results from disk and show the last 20 rows.
data_temp = pd.read_parquet(out_path)
data_temp.tail(20)

Unnamed: 0,pr_id,author_type,repo,pr_number,pr_title,pipeline_names,validation_present,evidence_sources,validation_type,validation_description,pipeline_signal,description_signal,comment_signal
280,3240593081,ai_agent,Doriandarko/make-it-heavy,9.0,Codex/integrate tygent module for performance,[],False,[],none,No validation evidence,,,
281,3241057566,ai_agent,vllm-project/vllm,21146.0,[Core] Freeze gc during cuda graph capture to speed up init,"[Lint and Deploy Charts, pre-commit]",True,"[description, comments]",benchmark,"The PR description and comments provide detailed before/after benchmarks and comparisons of different configurations, showing a significant speedup in CUDA graph capture time (e.g., from 35s to 2s).","Pipelines are for linting and deployment, not performance.",Description provides before/after benchmark results showing a speedup from 35s to 2s for cudagraph capture.,"Comments contain extensive benchmark results comparing different approaches and configurations, showing significant performance improvements."
282,3241523087,ai_agent,doodlum/skyrim-community-shaders,1281.0,perf: cache GetRuntimeData usage for improved performance,[WIP],True,"[description, comments]",profiling,"The PR description claims performance improvements by caching function calls. However, a detailed analysis in the comments refutes the initial claims, showing the actual performance impact is minimal after profiling the underlying library calls.",No perf-related pipelines.,Claims performance improvement by eliminating 'expensive' function calls.,"Detailed analysis in comments refutes initial claims, showing minimal performance gain after profiling the underlying code."
283,3241690700,ai_agent,mochilang/mochi,9435.0,Update Clojure compiler,[Test],False,[],none,The PR description mentions an optimization but provides no performance data or benchmark results to validate it. The pipeline is a generic test run.,The 'Test' pipeline does not suggest performance validation.,Describes an optimization but provides no performance data to validate it.,No comments.
284,3241691177,ai_agent,mochilang/mochi,9436.0,Improve Dart aggregate inference,[Test],False,[],none,"No validation evidence. The description mentions compiler refinements which could impact performance, but no benchmarks or metrics are provided. The pipeline is a generic test run.",The 'Test' pipeline does not suggest performance validation.,"Description mentions compiler refinements but lacks any performance metrics, benchmarks, or profiling data.",No comments.
285,3241695471,ai_agent,mochilang/mochi,9440.0,Improve Lua compiler membership optimization,[Test],False,[],none,"No validation evidence found. The description mentions an optimization but provides no performance data, benchmarks, or profiling results to support the claim.",The 'Test' pipeline does not indicate performance validation.,Mentions 'optimize' but provides no performance data or benchmark results.,No comments.
286,3241758610,ai_agent,mochilang/mochi,9484.0,Improve Clojure join compilation,[Test],False,[],none,The PR description mentions 'optimize' but provides no performance validation data like benchmarks or profiling results. The testing described is functional.,The 'Test' pipeline does not suggest performance validation.,"The description mentions 'optimize' but the testing section only shows a standard 'go test' command, not a benchmark.",No comments.
287,3242128024,ai_agent,mochilang/mochi,9492.0,Improve Python compiler list set ops,[Test],False,[],none,"The PR description mentions an 'optimize' goal, but no performance validation evidence like benchmarks or profiling is provided. The pipeline is a standard test run.",Pipeline 'Test' does not imply performance validation.,"Mentions 'optimize' but provides no performance data, only a standard 'go test' command.",No comments.
288,3242396116,ai_agent,mochilang/mochi,9550.0,Improve Prolog compiler map indexing,[Test],False,[],none,"No validation evidence. The PR description mentions optimizations but the testing section only shows unit test commands, not performance benchmarks or results.",The 'Test' pipeline is generic and does not suggest performance testing.,"Mentions optimizations but only provides unit test commands, no benchmark results or performance metrics.",No comments.
289,3242666013,ai_agent,ohcnetwork/care_fe,12979.0,Optimize encounter page API calls,"[Auto Label Conflicts, Code scanning - action, Cypress Tests, Deploy Care Fe, Lint Code Base, OSSAR]",True,"[description, comments]",unspecified,The PR description and comments detail optimizations to reduce redundant API calls and improve data fetching strategies for performance.,"Pipelines are for linting, E2E tests, and deployment, with no indication of performance testing.",The description explicitly states the goal is to optimize queries and remove duplicate API calls to improve performance by reducing the number of initial API calls.,An auto-generated comment summarizes the refactoring of data fetching logic for performance improvement.
