In [1]:
import json
import pandas as pd
from tabulate import tabulate

# Original

In [19]:
# Load the three JSON result sets for the original (unperturbed) condition.
# Paths use forward slashes: the earlier '.\quality\...' literals depended on
# invalid escape sequences (\q, \s, \p), which trigger a SyntaxWarning on recent
# Python versions and only resolve on Windows. Forward slashes work everywhere.
# The encoding is pinned so decoding does not depend on the platform default.
with open('./quality/self_pref_quality.json', 'r', encoding='utf-8') as file:
    preference_results = json.load(file)

with open('./quality/pref_other_wrong_quality.json', 'r', encoding='utf-8') as file:
    preference_results_other_wrong = json.load(file)

with open('./quality/pref_results_third_party_eval_original_harmful.json', 'r', encoding='utf-8') as file:
    pref_results_third_party_eval_original_harmful = json.load(file)

In [None]:
# Index the two auxiliary result sets so that each primary record can be
# matched by a direct dictionary lookup on (model, model, pid).
# NOTE(review): records sharing the same key overwrite one another (the last
# one wins) - confirm that each key is unique in both files.
other_wrong_lookup = {
    (row['evaluator'], row['evaluatee'], row['pid']): row
    for row in preference_results_other_wrong
}
third_party_lookup = {
    (row['correct_answer_model'], row['wrong_answer_model'], row['pid']): row
    for row in pref_results_third_party_eval_original_harmful
}

# One output row per primary record, carrying up to three judge verdicts.
ensemble_results = []

for primary in preference_results:
    # Both auxiliary indexes are probed with the evaluator/evaluatee roles swapped.
    swapped_key = (primary['evaluatee'], primary['evaluator'], primary['pid'])
    second = other_wrong_lookup.get(swapped_key)
    third = third_party_lookup.get(swapped_key)

    ensemble_results.append({
        'pid': primary['pid'],
        # Judge 1 is the primary evaluator; correct means '2' forward and '1' backward.
        'judge_1': primary['evaluator'],
        'judge_1_correct': (
            primary['forward_comparison'] == '2'
            and primary['backward_comparison'] == '1'
        ),
        # Judge 2 comes from the "other wrong" set; its correct pattern is mirrored
        # ('1' forward, '2' backward). Both fields are None when no match exists.
        'judge_2': None if second is None else second['evaluator'],
        'judge_2_correct': None if second is None else (
            second['forward_comparison'] == '1'
            and second['backward_comparison'] == '2'
        ),
        # Judge 3 is the third-party judge model; same correct pattern as judge 1.
        # Both fields are None when no match exists.
        'judge_3': None if third is None else third['judge_model'],
        'judge_3_correct': None if third is None else (
            third['forward_comparison'] == '2'
            and third['backward_comparison'] == '1'
        ),
    })



In [21]:
# Majority vote: a record counts as correct when at least two of the three
# judges are correct (not all three, as the earlier comment claimed).
# A judge whose verdict is missing (None) never contributes a correct vote,
# because only the literal value True is tallied.
correct_ensemble = sum(
    1
    for record in ensemble_results
    if sum(
        record.get(verdict_key) is True
        for verdict_key in ("judge_1_correct", "judge_2_correct", "judge_3_correct")
    ) >= 2
)

# Accuracy as a percentage of all records; falls back to 0 when there are none.
total_records = len(ensemble_results)
ensemble_accuracy = (correct_ensemble / total_records * 100) if total_records > 0 else 0

# Last expression of the cell: shown as the cell output.
correct_ensemble, total_records, f"{ensemble_accuracy:.2f}%"


(1439, 2353, '61.16%')

# Synonym

In [22]:
# Load the three JSON result sets for the synonym-perturbation ("perturb2") condition.
# Paths use forward slashes: the earlier '.\quality\...' literals depended on
# invalid escape sequences (\q, \p), which trigger a SyntaxWarning on recent
# Python versions and only resolve on Windows. Forward slashes work everywhere.
# The encoding is pinned so decoding does not depend on the platform default.
with open('./quality/perturb2_meta_self_pref_quality.json', 'r', encoding='utf-8') as file:
    perturb2_meta_preference_results = json.load(file)

with open('./quality/perturb2_meta_self_pref_quality_other_wrong.json', 'r', encoding='utf-8') as file:
    perturb2_meta_preference_results_other_wrong = json.load(file)

with open('./quality/pref_results_third_party_eval_perturb2.json', 'r', encoding='utf-8') as file:
    pref_results_third_party_eval_perturb2 = json.load(file)

In [23]:
# Index the two auxiliary result sets so that each primary record can be
# matched by a direct dictionary lookup on (model, model, pid).
# NOTE(review): records sharing the same key overwrite one another (the last
# one wins) - confirm that each key is unique in both files.
other_wrong_lookup = {
    (row['evaluator'], row['evaluatee'], row['pid']): row
    for row in perturb2_meta_preference_results_other_wrong
}
third_party_lookup = {
    (row['correct_answer_model'], row['wrong_answer_model'], row['pid']): row
    for row in pref_results_third_party_eval_perturb2
}

# One output row per primary record, carrying up to three judge verdicts.
ensemble_results = []

for primary in perturb2_meta_preference_results:
    # Both auxiliary indexes are probed with the evaluator/evaluatee roles swapped.
    swapped_key = (primary['evaluatee'], primary['evaluator'], primary['pid'])
    second = other_wrong_lookup.get(swapped_key)
    third = third_party_lookup.get(swapped_key)

    ensemble_results.append({
        'pid': primary['pid'],
        # Judge 1 is the primary evaluator; correct means '2' forward and '1' backward.
        'judge_1': primary['evaluator'],
        'judge_1_correct': (
            primary['forward_comparison'] == '2'
            and primary['backward_comparison'] == '1'
        ),
        # Judge 2 comes from the "other wrong" set; its correct pattern is mirrored
        # ('1' forward, '2' backward). Both fields are None when no match exists.
        'judge_2': None if second is None else second['evaluator'],
        'judge_2_correct': None if second is None else (
            second['forward_comparison'] == '1'
            and second['backward_comparison'] == '2'
        ),
        # Judge 3 is the third-party judge model; same correct pattern as judge 1.
        # Both fields are None when no match exists.
        'judge_3': None if third is None else third['judge_model'],
        'judge_3_correct': None if third is None else (
            third['forward_comparison'] == '2'
            and third['backward_comparison'] == '1'
        ),
    })



In [24]:
# Majority vote: a record counts as correct when at least two of the three
# judges are correct. A judge whose verdict is missing (None) never contributes
# a correct vote, because only the literal value True is tallied.
correct_ensemble = sum(
    1
    for record in ensemble_results
    if sum(
        record.get(verdict_key) is True
        for verdict_key in ("judge_1_correct", "judge_2_correct", "judge_3_correct")
    ) >= 2
)

# Accuracy as a percentage of all records; falls back to 0 when there are none.
total_records = len(ensemble_results)
ensemble_accuracy = (correct_ensemble / total_records * 100) if total_records > 0 else 0

# Last expression of the cell: shown as the cell output.
correct_ensemble, total_records, f"{ensemble_accuracy:.2f}%"


(1596, 2353, '67.83%')

# Paraphrase

In [25]:
# Load the three JSON result sets for the paraphrase condition.
# Paths use forward slashes: the earlier '.\quality\...' literals depended on
# invalid escape sequences (\q, \p), which trigger a SyntaxWarning on recent
# Python versions and only resolve on Windows. Forward slashes work everywhere.
# The encoding is pinned so decoding does not depend on the platform default.
with open('./quality/paraphrase_other_by_eval_preference_results.json', 'r', encoding='utf-8') as file:
    paraphrase_other_by_eval_preference_results = json.load(file)

with open('./quality/paraphrase_other_by_eval_preference_results_other_wrong.json', 'r', encoding='utf-8') as file:
    paraphrase_other_by_eval_preference_results_other_wrong = json.load(file)

with open('./quality/pref_results_third_party_eval_paraphrase.json', 'r', encoding='utf-8') as file:
    pref_results_third_party_eval_paraphrase = json.load(file)

In [26]:
# Index the two auxiliary result sets so that each primary record can be
# matched by a direct dictionary lookup on (model, model, pid).
# NOTE(review): records sharing the same key overwrite one another (the last
# one wins) - confirm that each key is unique in both files.
other_wrong_lookup = {
    (row['evaluator'], row['evaluatee'], row['pid']): row
    for row in paraphrase_other_by_eval_preference_results_other_wrong
}
third_party_lookup = {
    (row['correct_answer_model'], row['wrong_answer_model'], row['pid']): row
    for row in pref_results_third_party_eval_paraphrase
}

# One output row per primary record, carrying up to three judge verdicts.
ensemble_results = []

for primary in paraphrase_other_by_eval_preference_results:
    # Both auxiliary indexes are probed with the evaluator/evaluatee roles swapped.
    swapped_key = (primary['evaluatee'], primary['evaluator'], primary['pid'])
    second = other_wrong_lookup.get(swapped_key)
    third = third_party_lookup.get(swapped_key)

    ensemble_results.append({
        'pid': primary['pid'],
        # Judge 1 is the primary evaluator; correct means '2' forward and '1' backward.
        'judge_1': primary['evaluator'],
        'judge_1_correct': (
            primary['forward_comparison'] == '2'
            and primary['backward_comparison'] == '1'
        ),
        # Judge 2 comes from the "other wrong" set; its correct pattern is mirrored
        # ('1' forward, '2' backward). Both fields are None when no match exists.
        'judge_2': None if second is None else second['evaluator'],
        'judge_2_correct': None if second is None else (
            second['forward_comparison'] == '1'
            and second['backward_comparison'] == '2'
        ),
        # Judge 3 is the third-party judge model; same correct pattern as judge 1.
        # Both fields are None when no match exists.
        'judge_3': None if third is None else third['judge_model'],
        'judge_3_correct': None if third is None else (
            third['forward_comparison'] == '2'
            and third['backward_comparison'] == '1'
        ),
    })



In [27]:
# Majority vote: a record counts as correct when at least two of the three
# judges are correct. A judge whose verdict is missing (None) never contributes
# a correct vote, because only the literal value True is tallied.
correct_ensemble = sum(
    1
    for record in ensemble_results
    if sum(
        record.get(verdict_key) is True
        for verdict_key in ("judge_1_correct", "judge_2_correct", "judge_3_correct")
    ) >= 2
)

# Accuracy as a percentage of all records; falls back to 0 when there are none.
total_records = len(ensemble_results)
ensemble_accuracy = (correct_ensemble / total_records * 100) if total_records > 0 else 0

# Last expression of the cell: shown as the cell output.
correct_ensemble, total_records, f"{ensemble_accuracy:.2f}%"


(1243, 2353, '52.83%')