
# Combine OpenAI and Gemini validation labels

Align the OpenAI and Gemini RQ4 validation outputs, flag the rows where the two models disagree, and resolve those rows with the manual labels from the human/agent spreadsheets. The notebook saves a single resolved parquet file.


In [12]:

import json
import os
from pathlib import Path

import pandas as pd


def normalize_validation_type(series):
    """Return a canonical, comparable form of a validation-type column.

    Missing entries become the string 'none'; every value is then cast to
    text, stripped of surrounding whitespace and lower-cased. The spelling
    'static analysis' is mapped onto the hyphenated 'static-analysis'.

    Args:
        series: pandas Series of raw validation-type labels.

    Returns:
        A new pandas Series of normalized string labels.
    """
    as_text = series.fillna('none').astype(str)
    tidy = as_text.str.strip().str.lower()
    # Exact-value replacement: only cells equal to the key are rewritten.
    return tidy.replace({'static analysis': 'static-analysis'})

# Locate the study directory. Set the RQ4_BASE_DIR environment variable to run
# the notebook from another checkout or machine; when it is unset, the original
# author's absolute path is used so existing runs behave exactly as before.
# Absolute paths are used to avoid notebook CWD issues.
DEFAULT_BASE_DIR = '/Users/antoniozhong/Documents/dev/purdue/MSR2026/github_perf_patch_study/RQ4_test_and_validation'
BASE_DIR = Path(os.environ.get('RQ4_BASE_DIR', DEFAULT_BASE_DIR)).expanduser()

# Inputs: LLM outputs and manual spreadsheets. Output: the resolved table.
LLM_DIR = BASE_DIR / 'llm_data'
MANUAL_DIR = BASE_DIR / 'manual_label'
OUTPUT_DIR = BASE_DIR / 'final_data'
# Create the output folder up front so the final save cannot fail on a missing directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

openai_path = LLM_DIR / 'rq4_validation_evidence_openai.parquet'
gemini_path = LLM_DIR / 'rq4_validation_evidence_gemini.parquet'
manual_human_path = MANUAL_DIR / 'Final - Human PRs.csv'
manual_agent_path = MANUAL_DIR / 'Final-Agent PRs.csv'
print('Using BASE_DIR:', BASE_DIR)
print('OpenAI file:', openai_path)
print('Gemini file:', gemini_path)


Using BASE_DIR: /Users/antoniozhong/Documents/dev/purdue/MSR2026/github_perf_patch_study/RQ4_test_and_validation
OpenAI file: /Users/antoniozhong/Documents/dev/purdue/MSR2026/github_perf_patch_study/RQ4_test_and_validation/llm_data/rq4_validation_evidence_openai.parquet
Gemini file: /Users/antoniozhong/Documents/dev/purdue/MSR2026/github_perf_patch_study/RQ4_test_and_validation/llm_data/rq4_validation_evidence_gemini.parquet


In [13]:
# Read the two LLM labelling runs from parquet.
openai_df, gemini_df = (pd.read_parquet(source) for source in (openai_path, gemini_path))

print(f'OpenAI rows: {len(openai_df)}')
print(f'Gemini rows: {len(gemini_df)}')
print(f'Columns: {list(openai_df.columns)}')

# Canonicalize the validation-type labels on both sides so later comparisons
# are not affected by case, whitespace or the 'static analysis' spelling.
openai_df = openai_df.assign(
    validation_type=normalize_validation_type(openai_df['validation_type'])
)
gemini_df = gemini_df.assign(
    validation_type=normalize_validation_type(gemini_df['validation_type'])
)

# pr_id is used as the row key in the following cells, so it must not repeat.
assert all(
    frame['pr_id'].is_unique for frame in (openai_df, gemini_df)
), 'pr_id must be unique'


OpenAI rows: 407
Gemini rows: 407
Columns: ['pr_id', 'author_type', 'repo', 'pr_number', 'pr_title', 'pipeline_names', 'validation_present', 'evidence_sources', 'validation_type', 'validation_description', 'pipeline_signal', 'description_signal', 'comment_signal']


In [14]:

# Columns on which the two models are compared, each paired with the
# normalizer applied to both sides before testing for equality.
comparison_columns = {
    'validation_present': lambda s: s.fillna(False).astype(bool),
    'validation_type': normalize_validation_type,
}

# Pair each OpenAI row with its Gemini counterpart; non-key columns shared by
# both frames receive the _openai / _gemini suffixes.
merged = pd.merge(
    openai_df,
    gemini_df,
    how='inner',
    on=['pr_id', 'repo', 'pr_number'],
    suffixes=('_openai', '_gemini'),
)
# Cheap guard: an inner join must neither lose nor gain rows here.
assert len(merged) == len(openai_df) == len(gemini_df)

# One boolean *_mismatch column per compared field.
for col, normalizer in comparison_columns.items():
    openai_side = normalizer(merged[f"{col}_openai"])
    gemini_side = normalizer(merged[f"{col}_gemini"])
    merged[f"{col}_mismatch"] = openai_side.ne(gemini_side)

# A row is a mismatch when at least one compared field disagrees.
mismatch_mask = merged[[f"{col}_mismatch" for col in comparison_columns]].any(axis=1)
merged['has_mismatch'] = mismatch_mask

mismatch_ids = set(merged.loc[mismatch_mask, 'pr_id'])
agreement_ids = set(merged.loc[~mismatch_mask, 'pr_id'])

print(f'Rows compared: {len(merged)}')
print(f'Agreements: {len(agreement_ids)} | Mismatches: {len(mismatch_ids)}')
print('Mismatches by author_type (OpenAI side):')
print(merged.loc[mismatch_mask, 'author_type_openai'].value_counts())


Rows compared: 407
Agreements: 344 | Mismatches: 63
Mismatches by author_type (OpenAI side):
author_type_openai
ai_agent    53
human       10
Name: count, dtype: int64


In [15]:

# Load the hand-assigned labels from both spreadsheets and stack them into a
# single table.
manual_human_df = pd.read_csv(manual_human_path)
manual_agent_df = pd.read_csv(manual_agent_path)
manual_df = pd.concat([manual_human_df, manual_agent_df], ignore_index=True)

# Align spreadsheet column names and label spelling with the LLM outputs.
manual_df = manual_df.rename(columns={'ID': 'pr_id', 'URL': 'html_url'})
manual_df['pr_id'] = manual_df['pr_id'].astype(int)
manual_df['validation_type'] = normalize_validation_type(manual_df['validation_type'])
manual_df['evidence_sources'] = manual_df['evidence_sources'].fillna('')

# Parse validation_present explicitly. A plain astype(bool) would turn blank
# cells (NaN) and any non-empty text such as "False" or "no" into True, which
# would silently corrupt the manual labels. Unrecognized cells raise instead.
MANUAL_BOOL_TOKENS = {
    'true': True, 't': True, 'yes': True, 'y': True, '1': True, '1.0': True,
    'false': False, 'f': False, 'no': False, 'n': False, '0': False, '0.0': False,
}
present_tokens = manual_df['validation_present'].astype(str).str.strip().str.lower()
present_parsed = present_tokens.map(MANUAL_BOOL_TOKENS)  # unmapped tokens become NaN
unparsed_ids = manual_df.loc[present_parsed.isna(), 'pr_id']
if not unparsed_ids.empty:
    raise ValueError(
        f'Unrecognized validation_present values for pr_ids: {sorted(unparsed_ids.tolist())}'
    )
manual_df['validation_present'] = present_parsed.astype(bool)

# Index by pr_id for direct lookup; each PR may carry only one manual label.
manual_lookup = manual_df.set_index('pr_id')
assert manual_lookup.index.is_unique
# Every disagreement between the two models must have a manual resolution.
missing_manual = mismatch_ids - set(manual_lookup.index)
if missing_manual:
    raise ValueError(f'Missing manual labels for mismatch ids: {sorted(missing_manual)}')

print(f'Manual labels loaded: {len(manual_lookup)} rows')


Manual labels loaded: 63 rows


In [16]:

# Start from the OpenAI labels; rows where both models agree keep them as-is.
final_df = openai_df.copy()
final_df['label_source'] = 'openai_gemini_agree'

# Overwrite the disputed rows with the manual labels in one vectorized step.
# manual_lookup is indexed by pr_id, so selecting with the disputed ids (in
# final_df row order) yields label rows aligned with override_mask; a missing
# id raises KeyError.
override_mask = final_df['pr_id'].isin(mismatch_ids)
override_labels = manual_lookup.loc[final_df.loc[override_mask, 'pr_id'].tolist()]
final_df.loc[override_mask, 'validation_present'] = (
    override_labels['validation_present'].astype(bool).to_numpy()
)
final_df.loc[override_mask, 'validation_type'] = override_labels['validation_type'].to_numpy()
final_df.loc[override_mask, 'evidence_sources'] = override_labels['evidence_sources'].to_numpy()
final_df.loc[override_mask, 'label_source'] = 'manual_override'

# The resolved table must keep one row per input PR and override each
# disputed PR exactly once.
assert len(final_df) == len(openai_df), 'row count changed while applying overrides'
assert int(override_mask.sum()) == len(mismatch_ids), 'not every mismatch id was overridden'

def _normalize_evidence_sources(val):
    if val is None:
        return ''
    if isinstance(val, float) and pd.isna(val):
        return ''
    if isinstance(val, str):
        return val
    if isinstance(val, (list, tuple, set)):
        return '; '.join(map(str, val))
    if isinstance(val, dict):
        try:
            return json.dumps(val, ensure_ascii=True)
        except Exception:
            return str(val)
    return str(val)

# Store every evidence_sources cell as plain text.
evidence_as_text = final_df['evidence_sources'].apply(_normalize_evidence_sources)
final_df['evidence_sources'] = evidence_as_text

# Show how many rows came from model agreement versus manual override.
label_source_counts = final_df['label_source'].value_counts()
print(label_source_counts)

# Remove helper columns; errors='ignore' tolerates columns that are absent.
helper_columns = ['html_url', 'label_source']
final_df = final_df.drop(columns=helper_columns, errors='ignore')


label_source
openai_gemini_agree    344
manual_override         63
Name: count, dtype: int64


In [17]:

# Destination of the resolved label table.
resolved_parquet = OUTPUT_DIR / 'rq4_validation_evidence_final.parquet'

# Drop helper columns if any are still present, then write without the index.
final_df_to_save = final_df.drop(
    columns=['html_url', 'label_source'],
    errors='ignore',
)
final_df_to_save.to_parquet(resolved_parquet, index=False)

print(f'Saved resolved data to: {resolved_parquet}')


Saved resolved data to: /Users/antoniozhong/Documents/dev/purdue/MSR2026/github_perf_patch_study/RQ4_test_and_validation/final_data/rq4_validation_evidence_final.parquet


In [18]:
# Display the number of rows in the resolved table.
len(final_df)

407

In [19]:
# Display the columns of the resolved table.
final_df.columns

Index(['pr_id', 'author_type', 'repo', 'pr_number', 'pr_title',
       'pipeline_names', 'validation_present', 'evidence_sources',
       'validation_type', 'validation_description', 'pipeline_signal',
       'description_signal', 'comment_signal'],
      dtype='object')