In [24]:
import json
from pathlib import Path
from typing import Dict, Iterable, List, Optional
import pandas as pd
from openai import OpenAI
from google import genai
from google.genai import types
import os
from dotenv import load_dotenv
load_dotenv() 


True

In [25]:
from google import genai

# Connect to the Gemini API with the key read from the environment
# (populated from .env by load_dotenv() in the import cell).
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Enumerate the models visible to this key, requesting 50 entries per page,
# and print each model's resource name.
pager = client.models.list(config=dict(page_size=50))
for m in pager:
    print(m.name)

models/embedding-gecko-001
models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image-preview
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robo

In [26]:
# Default model names per backend. The Ollama name doubles as the default of
# run_llm's ``model`` parameter, so it is also used to detect "caller did not
# choose a model" on the Gemini path.
_OLLAMA_DEFAULT_MODEL = "gemma3:27b"
_GEMINI_DEFAULT_MODEL = "models/gemini-pro-latest"


def _call_ollama(system_pront: str, prompt: str, model: str) -> str:
    """Query the local Ollama server through its OpenAI-compatible endpoint."""
    print("Running OLLAMA, model:", model)

    # Ollama exposes an OpenAI-compatible API; the key is unused but required
    # by the client constructor, hence the empty string.
    llm_client = OpenAI(base_url="http://localhost:11434/v1", api_key="")
    messages = [
        {"role": "system", "content": system_pront},
        {"role": "user", "content": prompt},
    ]
    completion = llm_client.chat.completions.create(model=model, messages=messages)

    content = completion.choices[0].message.content
    if content is None:
        # Previously surfaced as an opaque AttributeError on ``None.strip()``.
        raise RuntimeError(f"Ollama model {model!r} returned no message content")

    result = content.strip()
    print("OLLAMA response:", result)
    return result


def _call_gemini(system_pront: str, prompt: str, model: str) -> str:
    """Query Gemini with a JSON response schema and return the raw JSON text."""
    print("Running GEMINI, model:", model)

    client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

    # Structured-output schema: constrains the reply to the validation-evidence
    # record that analyze_pr() consumes.
    schema = {
        "type": "object",
        "properties": {
            "validation_present": {"type": "boolean"},
            "evidence_sources": {
                "type": "array",
                "items": {
                    "type": "string",
                    "enum": ["pipeline", "description", "comments"],
                },
            },
            "validation_type": {
                "type": "string",
                "enum": [
                    "benchmark",
                    "profiling",
                    "load/canary",
                    "unit-only",
                    "unspecified",
                    "none",
                ],
            },
            "validation_description": {"type": "string"},
            "pipeline_signal": {"type": "string"},
            "description_signal": {"type": "string"},
            "comment_signal": {"type": "string"},
        },
        "required": [
            "validation_present",
            "evidence_sources",
            "validation_type",
            "validation_description",
            "pipeline_signal",
            "description_signal",
            "comment_signal",
        ],
    }

    config = types.GenerateContentConfig(
        temperature=0.0,
        response_mime_type="application/json",
        response_schema=schema,
        # thinking_config=types.ThinkingConfig(
        #     thinking_level=types.ThinkingLevel.HIGH
        # )
    )

    # NOTE(review): the system prompt is sent as the first content part rather
    # than via GenerateContentConfig(system_instruction=...). Left unchanged so
    # results stay comparable with rows that were already classified; consider
    # switching before starting a fresh run.
    response = client.models.generate_content(
        model=model,
        contents=[system_pront, prompt],
        config=config,
    )

    if response.text is None:
        # Fail here with a clear message instead of letting None propagate to
        # the JSON parser downstream.
        raise RuntimeError(f"Gemini model {model!r} returned no text")
    return response.text


def run_llm(system_pront: str, prompt: str, type: str = "ollama", model: str = _OLLAMA_DEFAULT_MODEL) -> str:
    """Send a system prompt and a user prompt to an LLM backend and return its reply.

    Args:
        system_pront: System-level instructions for the model.
        prompt: User message.
        type: Backend selector, either "ollama" (local OpenAI-compatible
            server at http://localhost:11434/v1) or "gemini".
        model: Model name. Passed through unchanged for "ollama". For "gemini",
            the signature default (an Ollama model name) is replaced by
            "models/gemini-pro-latest"; any other value is used as given.

    Returns:
        The stripped message text for "ollama"; the response text (requested as
        JSON matching the validation-evidence schema) for "gemini".

    Raises:
        ValueError: If ``type`` is not a supported backend.
        RuntimeError: If the backend returns no text.
    """
    if type == "ollama":
        return _call_ollama(system_pront, prompt, model)

    if type == "gemini":
        # The default ``model`` names an Ollama model, which Gemini cannot
        # serve; fall back to the Gemini default only in that case so an
        # explicitly requested model is no longer silently overwritten.
        gemini_model = _GEMINI_DEFAULT_MODEL if model == _OLLAMA_DEFAULT_MODEL else model
        return _call_gemini(system_pront, prompt, gemini_model)

    # Previously an unknown backend fell through and returned None.
    raise ValueError(f"Unsupported LLM backend type: {type!r} (expected 'ollama' or 'gemini')")
    

In [27]:
def find_datasets_dir(start: Optional[Path] = None) -> Path:
    """Locate the nearest ``datasets`` directory at or above ``start``.

    Args:
        start: Directory to begin the upward search from. Defaults to the
            current working directory. Relative paths are resolved first so
            the search can climb above them.

    Returns:
        Absolute path of the first ``datasets`` directory found while walking
        from ``start`` up to the filesystem root.

    Raises:
        FileNotFoundError: If no ancestor contains a ``datasets`` directory.
    """
    origin = Path(start if start is not None else Path.cwd()).resolve()
    for path in (origin, *origin.parents):
        candidate = path / "datasets"
        # is_dir() rather than exists(): a stray regular file called
        # "datasets" must not be mistaken for the data directory.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'datasets' directory from {origin}")


# Resolved once when this cell runs: the "datasets" directory found by
# searching upward from the current working directory.
DATASETS_DIR = find_datasets_dir()
# The project root is taken to be the directory that contains "datasets".
PROJECT_ROOT = DATASETS_DIR.parent

In [28]:
def extract_json(text: str) -> Dict:
    """Best-effort extraction of a JSON object from model output.

    Parsing starts at the first ``{`` and stops at the end of that object, so
    surrounding prose, markdown fences, and trailing text (even text that
    itself contains braces) are ignored.

    Args:
        text: Raw model output. ``None`` or non-string input is tolerated.

    Returns:
        The decoded object as a dict, or ``{}`` when no valid JSON object
        starts at the first ``{`` (or there is no ``{`` at all).
    """
    if not isinstance(text, str):
        return {}

    start = text.find("{")
    if start == -1:
        return {}

    try:
        # raw_decode parses exactly one JSON value beginning at ``start`` and
        # ignores whatever follows it, unlike json.loads on a sliced snippet.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return {}
    return parsed if isinstance(parsed, dict) else {}

def truncate(text: str, limit: int = 10000) -> str:
    """Cap ``text`` at ``limit`` characters, appending a marker when it was cut.

    Text whose length does not exceed ``limit`` is returned unchanged.
    """
    if len(text) > limit:
        return text[:limit] + "...[truncated]"
    return text

In [29]:
def load_pr_core(prefix: str) -> pd.DataFrame:
    """Load the ``{prefix}_pr_commits`` parquet as one row per PR, indexed by ``pr_id``.

    Only the first row seen for each ``pr_id`` is kept.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    commits = pd.read_parquet(commits_path)
    one_row_per_pr = commits.drop_duplicates("pr_id")
    return one_row_per_pr.set_index("pr_id")


def collect_comments(prefix: str, pr_id: int) -> List[str]:
    """Return the non-null comment bodies for one PR: issue comments first, then review comments.

    Both parquet files are read from disk on every call.
    """
    pr_dir = DATASETS_DIR / f"{prefix}_pr"
    issue = pd.read_parquet(pr_dir / f"{prefix}_pr_issue_comments.parquet")
    review = pd.read_parquet(pr_dir / f"{prefix}_pr_review_comments.parquet")

    bodies: List[str] = []
    for frame in (issue, review):
        matching_bodies = frame.loc[frame["pr_id"] == pr_id, "body"]
        bodies += matching_bodies.dropna().tolist()
    return bodies


def collect_pipeline_names(prefix: str, pr_id: int) -> List[str]:
    """Return the sorted, de-duplicated workflow names recorded for one PR.

    The workflow-runs parquet is read from disk on every call.
    """
    runs_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_workflow_runs.parquet"
    runs = pd.read_parquet(runs_path)
    names = runs.loc[runs["pr_id"] == pr_id, "workflow_name"].dropna().unique().tolist()
    names.sort()
    return names


In [30]:
def _format_comment_bullets(comments: List[str], limit: int = 10000) -> str:
    """Render PR comments as an indented bullet list within a shared character budget.

    Each comment becomes exactly one bullet. Once the combined length reaches
    ``limit`` the current comment is cut, "...[truncated]" is appended, and
    any remaining comments are dropped.
    """
    # Charge three characters between comments so the cut-off point matches
    # the earlier implementation, which truncated a " | "-joined string.
    separator_cost = len(" | ")
    bullets: List[str] = []
    remaining = limit
    for comment in comments:
        if remaining <= 0:
            # Budget used up with comments still pending: flag the omission.
            if bullets:
                bullets[-1] += "...[truncated]"
            else:
                bullets.append("...[truncated]")
            break
        bullets.append(truncate(comment, remaining))
        if len(comment) > remaining:
            break
        remaining -= len(comment) + separator_cost
    return "  - " + "\n  - ".join(bullets)


def analyze_pr(
    prefix: str, pr_id: int, author_type: str, pr_core: pd.DataFrame
) -> Dict:
    """Classify the performance-validation evidence attached to one PR.

    Gathers the PR's workflow names, description, and comments, asks the
    Gemini backend to classify them, and returns a flat record.

    Args:
        prefix: Dataset prefix used to locate the parquet files (the notebook
            uses "ai" and "human").
        pr_id: Identifier of the PR; must be present in ``pr_core``'s index.
        author_type: Label copied verbatim into the record.
        pr_core: Frame indexed by ``pr_id`` as produced by ``load_pr_core``.

    Returns:
        A dict with PR metadata (``pr_id``, ``author_type``, ``repo``,
        ``pr_number``, ``pr_title``, ``pipeline_names``) followed by the
        classification fields. When the PR has no pipelines, description, or
        comments, a "none" record is returned without calling the model. When
        the model reply cannot be parsed, the classification fields are None.

    Raises:
        KeyError: If ``pr_id`` is not in ``pr_core``.
    """
    row = pr_core.loc[pr_id]
    pipeline_names = collect_pipeline_names(prefix, pr_id)
    comments = collect_comments(prefix, pr_id)

    # A missing description may arrive as None or as NaN; NaN is truthy, so
    # the former ``(value or "").strip()`` raised AttributeError on it.
    raw_description = row.get("pr_description")
    description = raw_description.strip() if isinstance(raw_description, str) else ""

    SYSTEM_PROMPT_TEMPLATE = """You classify evidence of performance validation for a PR.
    Return compact JSON only with keys:
    validation_present (bool), evidence_sources (list of "pipeline","description","comments"),
    validation_type (benchmark,profiling,load/canary,unit-only,unspecified,none),
    validation_description (short text),
    pipeline_signal (short), description_signal (short), comment_signal (short).

    Rules:
    - Pipelines count only if workflow names imply perf/benchmark/load/canary; note when they are unit/lint-only.
    - Description/comments count if they mention perf benchmarks, profiling, latency/throughput numbers,
    load/canary rollout, A/B tests, perf tools, or explicit "no perf validation".
    - If nothing indicates perf validation, set validation_present=false,
    validation_type="none", evidence_sources=[],
    validation_description="No validation evidence".
    """

    PROMPT_TEMPLATE = """
    You are given information about a GitHub Pull Request (PR).
    Using the provided PIPELINES, DESCRIPTION, and COMMENTS, determine if there is evidence of performance validation for the PR.
    Input (TOONS format):

    PIPELINES:
    {pipeline_names}

    DESCRIPTION:
    {description}

    COMMENTS:
    {comments}

    JSON:
    """

    # Metadata shared by every record shape; key order fixes the column order
    # of the resulting DataFrame.
    base_record = {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": f"{row.get('repo_owner')}/{row.get('repo_name')}",
        "pr_number": row.get("pr_number"),
        "pr_title": row.get("pr_title"),
        "pipeline_names": pipeline_names,
    }

    # Nothing to classify: skip the model call entirely.
    if not pipeline_names and not description and not comments:
        return {
            **base_record,
            "validation_present": False,
            "evidence_sources": [],
            "validation_type": "none",
            "validation_description": "No validation evidence",
            "pipeline_signal": "",
            "description_signal": "",
            "comment_signal": "",
        }

    prompt = PROMPT_TEMPLATE.format(
        pipeline_names=(
            "\n  - " + "\n  - ".join(pipeline_names) if pipeline_names else "  None"
        ),
        description=(
            "  " + truncate(description).replace("\n", "\n  ") if description else "  None"
        ),
        # One bullet per comment. The previous join/split on " | " broke any
        # comment containing that sequence (e.g. markdown tables) into
        # several bogus bullets.
        comments=_format_comment_bullets(comments) if comments else "  None",
    )

    raw = run_llm(SYSTEM_PROMPT_TEMPLATE, prompt, type="gemini")
    parsed = extract_json(raw)

    evidence_sources = parsed.get("evidence_sources") or []
    if isinstance(evidence_sources, str):
        # A bare string would otherwise be stored as-is instead of as a list.
        evidence_sources = [evidence_sources]
    elif isinstance(evidence_sources, (tuple, list)):
        evidence_sources = list(evidence_sources)

    return {
        **base_record,
        "validation_present": parsed.get("validation_present"),
        "evidence_sources": evidence_sources,
        "validation_type": parsed.get("validation_type"),
        "validation_description": parsed.get("validation_description"),
        "pipeline_signal": parsed.get("pipeline_signal"),
        "description_signal": parsed.get("description_signal"),
        "comment_signal": parsed.get("comment_signal"),
    }

In [31]:
def pr_ids_from_commits(prefix: str, limit: Optional[int] = None) -> Iterable[int]:
    """Return the sorted unique PR ids found in the ``{prefix}_pr_commits`` parquet.

    When ``limit`` is given, only the first ``limit`` ids (in ascending order)
    are returned.
    """
    commits_path = DATASETS_DIR / f"{prefix}_pr" / f"{prefix}_pr_commits.parquet"
    id_column = pd.read_parquet(commits_path)["pr_id"]
    unique_ids = id_column.dropna().astype(int).unique().tolist()
    unique_ids.sort()
    if limit is None:
        return unique_ids
    return unique_ids[:limit]


In [32]:
out_dir = PROJECT_ROOT / "RQ3"
out_dir.mkdir(exist_ok=True, parents=True)
out_path = out_dir / "rq3_validation_evidence.parquet"
error_log_path = out_dir / "rq3_error_log.csv"

SAVE_EVERY = 20  # checkpoint whenever the total row count is a multiple of this
limit = None  # set to an int to cap how many PR ids are taken per group

# Load previous progress so an interrupted run can be resumed.
if out_path.exists():
    prev_df = pd.read_parquet(out_path)
    records = prev_df.to_dict(orient="records")
    print(f"Loaded {len(records)} existing rows from {out_path}")
else:
    records = []
    print("No existing parquet found; starting fresh records list.")

# Resume by content instead of a hard-coded start index: any (author_type,
# pr_id) pair that already has a row is skipped. A fresh run therefore
# processes everything, and a re-run never duplicates rows.
done_keys = {(rec["author_type"], int(rec["pr_id"])) for rec in records}

aio_core = load_pr_core("ai")
human_core = load_pr_core("human")

error_log = []


def _write_parquet_atomic(frame, path):
    """Write ``frame`` to ``path`` through a temporary sibling file.

    The target is replaced only after the write has finished, so an
    interruption mid-write cannot destroy the previous checkpoint.
    """
    target = Path(path)
    staging = target.with_name(target.name + ".tmp")
    frame.to_parquet(staging, index=False)
    staging.replace(target)


def save_partial(records, out_path):
    """Checkpoint all records collected so far to ``out_path``."""
    df_tmp = pd.DataFrame(records)
    _write_parquet_atomic(df_tmp, out_path)
    print(f"[partial save] Saved {len(df_tmp)} rows to {out_path}")


def _error_record(pr_id, author_type, err_msg):
    """Build the placeholder row stored when analysing a PR raised an exception."""
    return {
        "pr_id": pr_id,
        "author_type": author_type,
        "repo": "",
        "pr_number": None,
        "pr_title": "",
        "pipeline_names": [],
        "validation_present": None,
        "evidence_sources": [],
        "validation_type": "error",
        "validation_description": err_msg,
        "pipeline_signal": "",
        "description_signal": "",
        "comment_signal": "",
    }


def _process_prs(prefix, author_type, label, pr_ids, pr_core):
    """Analyse each PR in ``pr_ids``, appending a result or error row to ``records``."""
    for idx, pr_id in enumerate(pr_ids, 1):
        print(f"Processing {label} PR {idx}/{len(pr_ids)}: {pr_id}")
        try:
            records.append(analyze_pr(prefix, pr_id, author_type, pr_core))
        except Exception as exc:
            # Keep going: one failing PR must not abort the whole batch. The
            # failure is recorded both as a row and in the error log.
            err_msg = f"error: {exc}"
            print(f"[ERROR][{label.upper()}][{pr_id}] {err_msg}")
            error_log.append({"pr_id": pr_id, "author_type": author_type, "error": err_msg})
            records.append(_error_record(pr_id, author_type, err_msg))

        if len(records) % SAVE_EVERY == 0:
            save_partial(records, out_path)


ai_ids_all = list(pr_ids_from_commits("ai", limit=limit))
human_ids = list(pr_ids_from_commits("human", limit=limit))

aio_ids = [pr_id for pr_id in ai_ids_all if ("ai_agent", pr_id) not in done_keys]
human_pending = [pr_id for pr_id in human_ids if ("human", pr_id) not in done_keys]
print(
    f"Processing {len(aio_ids)} AI PRs and {len(human_pending)} human PRs "
    f"({len(ai_ids_all) - len(aio_ids)} AI and {len(human_ids) - len(human_pending)} human already recorded)."
)

try:
    _process_prs("ai", "ai_agent", "AI", aio_ids, aio_core)
    _process_prs("human", "human", "human", human_pending, human_core)
finally:
    # Runs on normal completion and on interruption alike, so rows gathered
    # since the last checkpoint are not lost.
    df = pd.DataFrame(records)
    _write_parquet_atomic(df, out_path)
    print(f"Saved FINAL {len(df)} rows to {out_path}")

    if error_log:
        pd.DataFrame(error_log).to_csv(error_log_path, index=False)
        print(f"Saved {len(error_log)} error IDs to {error_log_path}")
    else:
        print("No errors to log.")


Loaded 220 existing rows from /Users/antoniozhong/Documents/dev/purdue/MSR2026/github_perf_patch_study/RQ3/rq3_validation_evidence.parquet
Processing 104 AI PRs (starting from index 221) and 83 human PRs (total).
Processing AI PR 1/104: 3213528854
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 2/104: 3213723251
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 3/104: 3213724164
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 4/104: 3213728031
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 5/104: 3213730809
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 6/104: 3213747226
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 7/104: 3213750237
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 8/104: 3213850102
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 9/104: 3213857892
Running GEMINI, model: models/gemini-pro-latest
Processing AI PR 10/104: 3213876116
Runn

In [35]:
# Lift pandas' display limits so long text columns and every row/column of
# the preview below are shown in full.
for _option, _value in (
    ("display.max_colwidth", None),
    ("display.expand_frame_repr", True),
    ("display.max_rows", None),
    ("display.max_columns", None),
):
    pd.set_option(_option, _value)

# Re-read the saved results from disk and preview the last 20 rows.
data_temp = pd.read_parquet(out_path)
data_temp.tail(20)

Unnamed: 0,pr_id,author_type,repo,pr_number,pr_title,pipeline_names,validation_present,evidence_sources,validation_type,validation_description,pipeline_signal,description_signal,comment_signal
387,2555753483,human,dotnet/msbuild,11934.0,update to stop closures from lazy functions and linq,[],True,[description],profiling,"The author used ILSpy to verify that closures, which were causing allocations, were removed after the changes.",No pipelines.,Fixes an allocation issue and provides before/after ILSpy screenshots to verify closures were removed.,"Code review comments, no validation evidence."
388,2558083620,human,bionic-gpt/bionic-gpt,776.0,Cache busting,[],False,[],none,No validation evidence,No pipelines,No description,"A Cloudflare Pages deployment bot comment is present, but it contains no performance validation information."
389,2560305820,human,antiwork/gumroad,289.0,Added Typhoeus client for HTTP connection pooling + re-use,[autofix.ci],False,[],none,"No validation evidence. The PR description claims a performance improvement but provides no data, benchmarks, or profiling results to support it.",The 'autofix.ci' pipeline does not suggest performance testing.,Claims performance improvement by reusing HTTP connections but provides no data or benchmarks.,Auto-generated comments with no performance content.
390,2564432253,human,tokens-studio/figma-plugin,3402.0,Github Sync Optimization,"[ESLint, Node.js CI]",True,[description],unspecified,"The PR description states the goal is to reduce 'longer sync times' and suggests a manual, qualitative test to observe the performance improvement.","Pipelines 'ESLint' and 'Node.js CI' are for linting and general continuous integration, not performance validation.",The PR description identifies 'longer sync times' as a performance issue and suggests a manual test to verify the improvement.,"Comments are from bots (changeset, artifacts, code coverage) and code review feedback, with no mention of performance."
391,2573225924,human,microsoft/TypeScript,61822.0,"optimization, reduce memory usage","[CI, Code Scanning - Action]",True,"[description, comments]",benchmark,The PR description claims an 11% speedup in project initialization time. A comment asks for the methodology used to obtain these stats.,CI and Code Scanning pipelines are not performance-related.,Claims an 11% speedup in project initialization time for tsserver in large repositories.,A comment asks how the performance statistics were determined.
392,2577421996,human,antiwork/gumroad,307.0,Fixed duplicate context lookups across app,[autofix.ci],True,[description],benchmark,"The PR description states the change saves ~4 DB queries per page load, resulting in a ~2% performance win.",The 'autofix.ci' pipeline does not appear to be for performance validation.,"The description quantifies a performance improvement: 'saves ~4 DB queries per page load when logged in, resulting in a ~2% win'.",Comments are auto-generated and do not contain performance validation information.
393,2590261382,human,microsoft/vscode,251382.0,Optimized concat with reduce,[],False,[],none,No validation evidence. The description claims a performance improvement ('saves resources') but provides no data to support it.,No pipelines.,The description claims resource savings by using `push` instead of `concat`.,No comments.
394,2596620305,human,microsoft/qsharp,2530.0,Improve JupyterLab extension build time,"[Benchmark Reports, CI Build and Test, DevSkim]",True,"[pipeline, description]",benchmark,The PR description provides specific build time improvements (from ~70s to ~15s) and a 'Benchmark Reports' pipeline was executed.,A 'Benchmark Reports' pipeline was run.,The description provides specific build time improvements with before/after numbers (~70s to ~15s).,No comments provided.
395,2597070258,human,calcom/cal.com,21855.0,perf: use repository for me query & caching in /settings/my-account/general/ RSC,[],True,[description],unspecified,"The PR description claims performance improvements from caching and faster queries, supported by before and after screencasts demonstrating the change.",No pipelines found.,"Description includes a summary claiming improved performance through caching and faster queries, and provides before/after screencasts.",No performance validation mentioned.
396,2604024784,human,calcom/cal.com,21923.0,fix: Improve performance of settings/admin/organizations page,[],True,"[description, comments]",load/canary,"Manual validation was performed by testing UI responsiveness with a large number of organizations (5000). The author provided before/after videos, and a reviewer tested the branch locally, though the improvement was disputed.",No pipelines.,PR description claims performance improvement by memoizing and reducing re-renders for large organization lists.,"Reviewers requested before/after videos and manually tested the branch with 5000 organizations to check for performance improvements, concluding there was little difference."
