In [2]:
import json
import pandas as pd
from tabulate import tabulate

# Original

### Preference (Harmful self pref)

In [3]:
# Load the JSON results file for the original prompt.
# Forward slashes are used on purpose: '\q' and '\p' are invalid escape sequences
# in a normal string literal (SyntaxWarning on recent Python versions), and the
# backslash form only resolves on Windows. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('./quality/preference_results_llm_council_original_harmful.json', 'r', encoding='utf-8') as file:
    preference_results_llm_council_original_harmful = json.load(file)

In [4]:
# Tabulate, per evaluator, how the forward and backward comparison answers pair up.
df = pd.DataFrame(preference_results_llm_council_original_harmful)

# (forward answer, backward answer) pairs to count, keyed by the table column label.
answer_pairs = {
    "(1,1)": ("1", "1"),
    "(1,2)": ("1", "2"),
    "(2,1)": ("2", "1"),
    "(2,2)": ("2", "2"),
}

# One row of counts per evaluator. Rows whose answers are anything other than
# the strings "1"/"2" match none of the pairs and are therefore not counted.
grouped_counts = {}
for evaluator, group in df.groupby("evaluator"):
    forward = group["forward_comparison"]
    backward = group["backward_comparison"]
    grouped_counts[evaluator] = {
        label: ((forward == fwd_answer) & (backward == bwd_answer)).sum()
        for label, (fwd_answer, bwd_answer) in answer_pairs.items()
    }

# Raw counts table.
grouped_counts_df = pd.DataFrame.from_dict(grouped_counts, orient="index")
print(tabulate(grouped_counts_df, headers="keys", tablefmt="grid", floatfmt=".2f"))

# Percentages are taken relative to the sum of the four counted pairs in each row.
print("\nPercentages")
row_totals = grouped_counts_df.sum(axis=1)
grouped_counts_df_percentage = grouped_counts_df.div(row_totals, axis=0) * 100
print(tabulate(grouped_counts_df_percentage, headers="keys", tablefmt="grid", floatfmt=".2f"))

+----------------------------------+---------+---------+---------+---------+
|                                  |   (1,1) |   (1,2) |   (2,1) |   (2,2) |
| DeepSeek-V3                      |   96.00 |  102.00 |   97.00 |    4.00 |
+----------------------------------+---------+---------+---------+---------+
| Meta-Llama-3.1-8B-Instruct-Turbo |    8.00 |   56.00 |  230.00 |  826.00 |
+----------------------------------+---------+---------+---------+---------+
| Qwen2.5-7B-Instruct-Turbo        |   86.00 |  116.00 |  607.00 |  125.00 |
+----------------------------------+---------+---------+---------+---------+

Percentages
+----------------------------------+---------+---------+---------+---------+
|                                  |   (1,1) |   (1,2) |   (2,1) |   (2,2) |
| DeepSeek-V3                      |   32.11 |   34.11 |   32.44 |    1.34 |
+----------------------------------+---------+---------+---------+---------+
| Meta-Llama-3.1-8B-Instruct-Turbo |    0.71 |    5.00 |   20.5

In [None]:
# Load the self-recognition JSON results file.
# Forward slashes avoid the invalid '\q' / '\s' escape sequences of the backslash
# form and resolve on every platform; UTF-8 is stated explicitly because JSON is
# UTF-8 by specification.
with open('./quality/self_recog_quality.json', 'r', encoding='utf-8') as file:
    results = json.load(file)
# Alias under which the following cells read the loaded data.
recog_results = results

In [None]:
# Tabulate, per evaluator, how the forward and backward detection answers pair up.
df = pd.DataFrame(recog_results)

# (forward answer, backward answer) pairs to count, keyed by the table column label.
answer_pairs = {
    "(1,1)": ("1", "1"),
    "(1,2)": ("1", "2"),
    "(2,1)": ("2", "1"),
    "(2,2)": ("2", "2"),
}

grouped_counts = {}
print("DETECTION")
# One row of counts per evaluator. Rows whose answers are anything other than
# the strings "1"/"2" match none of the pairs and are therefore not counted.
for evaluator, group in df.groupby("evaluator"):
    forward = group["forward_detection"]
    backward = group["backward_detection"]
    grouped_counts[evaluator] = {
        label: ((forward == fwd_answer) & (backward == bwd_answer)).sum()
        for label, (fwd_answer, bwd_answer) in answer_pairs.items()
    }

grouped_counts_df = pd.DataFrame.from_dict(grouped_counts, orient="index")

# Percentages are taken relative to the sum of the four counted pairs in each row.
row_totals = grouped_counts_df.sum(axis=1)
grouped_counts_df_percentage = grouped_counts_df.div(row_totals, axis=0) * 100

print(tabulate(grouped_counts_df_percentage, headers="keys", tablefmt="grid", floatfmt=".2f"))

### Preference/Recog (both correct)

In [3]:
# Load the "both correct" JSON results file.
# Forward slashes avoid the invalid '\q' / '\p' escape sequences of the backslash
# form and resolve on every platform; UTF-8 is stated explicitly because JSON is
# UTF-8 by specification.
with open('./quality/pref_both_correct_quality.json', 'r', encoding='utf-8') as file:
    preference_results_both_correct = json.load(file)

### Preference/Recog (both wrong)

In [4]:
# Load the "both wrong" JSON results file.
# Forward slashes avoid the invalid '\q' / '\p' escape sequences of the backslash
# form and resolve on every platform; UTF-8 is stated explicitly because JSON is
# UTF-8 by specification.
with open('./quality/pref_both_wrong_quality.json', 'r', encoding='utf-8') as file:
    preference_results_both_wrong = json.load(file)

### Pref Other wrong (source correct)

In [5]:
# Load the "other wrong" JSON results file.
# Forward slashes avoid the invalid '\q' / '\p' escape sequences of the backslash
# form and resolve on every platform; UTF-8 is stated explicitly because JSON is
# UTF-8 by specification.
with open('./quality/pref_other_wrong_quality.json', 'r', encoding='utf-8') as file:
    preference_results_other_wrong = json.load(file)

### Answer and Judge Acc: Overall vs Harmful 

In [8]:
# Concatenate the four preference result sets into one sequence.
# NOTE(review): `preference_results` is not assigned in any cell visible in this
# part of the notebook — confirm it is defined before this cell runs.
# Assumes every operand supports `+` with the others (e.g. all are lists) — TODO confirm.
all_pref = (
    preference_results
    + preference_results_both_correct
    + preference_results_both_wrong
    + preference_results_other_wrong
)

#### Overall

In [6]:
# Load JSON file into a variable
with open('.\quality\paraphrased_by_others.json', 'r') as file:
    responses = json.load(file)

In [None]:
# Define model names
model_names = ["Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3"]

# Initialize result tracking
accuracy_results = {model: {'correct': 0, 'total': 0} for model in model_names}

# Calculate accuracy for each model
for record in responses[0]:
    gt_label = record.get('output_label')
    for model in model_names:
        model_label = record.get(f"{model}_output_label")
        if model_label is not None:
            accuracy_results[model]['total'] += 1
            if model_label == gt_label:
                accuracy_results[model]['correct'] += 1

# Create a DataFrame with counts and accuracy percentages
accuracy_df = pd.DataFrame([
    {
        "Model": model,
        "Correct": result["correct"],
        "Total": result["total"],
        "Answer Accuracy (%)": f"{(result['correct'] / result['total'] * 100):.2f}%" if result["total"] > 0 else "N/A"
    }
    for model, result in accuracy_results.items()
])

# Print the result
print(tabulate(accuracy_df, headers="keys", tablefmt="grid"))

In [None]:
# Model list
model_names = ["Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3"]

# Initialize tracking dictionaries
result_stats = {model: {"correct": 0, "total": 0} for model in model_names}

# Process preference_results
for record in preference_results:
    model = record["evaluator"]
    if model in result_stats:
        result_stats[model]["total"] += 1
        if record["forward_comparison"] == "2" and record["backward_comparison"] == "1":
            result_stats[model]["correct"] += 1

# Process preference_results_other_wrong
for record in preference_results_other_wrong:
    model = record["evaluator"]
    if model in result_stats:
        result_stats[model]["total"] += 1
        if record["forward_comparison"] == "1" and record["backward_comparison"] == "2":
            result_stats[model]["correct"] += 1

# Create DataFrame to display results
results_df = pd.DataFrame([
    {
        "Model": model,
        "Correct": stats["correct"],
        "Total": stats["total"],
        "Accuracy (%)": f"{(stats['correct'] / stats['total'] * 100):.2f}%" if stats["total"] > 0 else "N/A"
    }
    for model, stats in result_stats.items()
])

print("\nEvaluator Accuracy Results")
# Display the table
print(tabulate(results_df, headers="keys", tablefmt="grid"))

#### In Harmful Quadrant

In [8]:
# Model list
model_names = ["Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3"]

# Initialize tracking dictionaries
result_stats = {model: {"correct": 0, "total": 0} for model in model_names}

# Process preference_results
for record in preference_results_llm_council_original_harmful:
    model = record["evaluator"]
    if model in result_stats:
        result_stats[model]["total"] += 1
        if record["forward_comparison"] == "2" and record["backward_comparison"] == "1":
            result_stats[model]["correct"] += 1


# Create DataFrame to display results
results_df = pd.DataFrame([
    {
        "Model": model,
        "Correct": stats["correct"],
        "Total": stats["total"],
        "Accuracy (%)": f"{(stats['correct'] / stats['total'] * 100):.2f}%" if stats["total"] > 0 else "N/A"
    }
    for model, stats in result_stats.items()
])

print("\nEvaluator Accuracy Results")
# Display the table
print(tabulate(results_df, headers="keys", tablefmt="grid"))


Evaluator Accuracy Results
+----+----------------------------------+-----------+---------+----------------+
|    | Model                            |   Correct |   Total | Accuracy (%)   |
|  0 | Qwen2.5-7B-Instruct-Turbo        |       607 |     934 | 64.99%         |
+----+----------------------------------+-----------+---------+----------------+
|  1 | Meta-Llama-3.1-8B-Instruct-Turbo |       230 |    1120 | 20.54%         |
+----+----------------------------------+-----------+---------+----------------+
|  2 | DeepSeek-V3                      |        97 |     299 | 32.44%         |
+----+----------------------------------+-----------+---------+----------------+


# Second Format Prompt (A>B)

In [9]:
# Load the JSON results file for the second (A>B) prompt format.
# Forward slashes avoid the invalid '\q' / '\p' escape sequences of the backslash
# form and resolve on every platform; UTF-8 is stated explicitly because JSON is
# UTF-8 by specification.
with open('./quality/preference_results_llm_council_format_original_harmful.json', 'r', encoding='utf-8') as file:
    preference_results_llm_council_format_original_harmful = json.load(file)

In [10]:
# Tabulate, per evaluator, how the forward and backward verdicts pair up for the
# second prompt format, where a verdict is recognised by the substring "A>B" or "B>A".
# NOTE(review): the output recorded for this cell shows zero for every count (and
# NaN percentages from the resulting 0/0) — presumably the stored strings do not
# contain "A>B"/"B>A" verbatim; confirm the format of the comparison fields.
df = pd.DataFrame(preference_results_llm_council_format_original_harmful)

grouped_counts = {}

for evaluator, group in df.groupby("evaluator"):
    # Substring masks, computed once per direction and verdict.
    forward_a = group["forward_comparison"].str.contains("A>B")
    forward_b = group["forward_comparison"].str.contains("B>A")
    backward_a = group["backward_comparison"].str.contains("A>B")
    backward_b = group["backward_comparison"].str.contains("B>A")

    # Column labels follow the (forward, backward) convention with 1 = "A>B", 2 = "B>A".
    grouped_counts[evaluator] = {
        "(1,1)": (forward_a & backward_a).sum(),
        "(1,2)": (forward_a & backward_b).sum(),
        "(2,1)": (forward_b & backward_a).sum(),
        "(2,2)": (forward_b & backward_b).sum(),
    }

# Raw counts table.
grouped_counts_df = pd.DataFrame.from_dict(grouped_counts, orient="index")
print(tabulate(grouped_counts_df, headers="keys", tablefmt="grid", floatfmt=".2f"))

# Percentages are taken relative to the sum of the four counted pairs in each row.
print("\nPercentages")
row_totals = grouped_counts_df.sum(axis=1)
grouped_counts_df_percentage = grouped_counts_df.div(row_totals, axis=0) * 100
print(tabulate(grouped_counts_df_percentage, headers="keys", tablefmt="grid", floatfmt=".2f"))

+----------------------------------+---------+---------+---------+---------+
|                                  |   (1,1) |   (1,2) |   (2,1) |   (2,2) |
| DeepSeek-V3                      |    0.00 |    0.00 |    0.00 |    0.00 |
+----------------------------------+---------+---------+---------+---------+
| Meta-Llama-3.1-8B-Instruct-Turbo |    0.00 |    0.00 |    0.00 |    0.00 |
+----------------------------------+---------+---------+---------+---------+
| Qwen2.5-7B-Instruct-Turbo        |    0.00 |    0.00 |    0.00 |    0.00 |
+----------------------------------+---------+---------+---------+---------+

Percentages
+----------------------------------+---------+---------+---------+---------+
|                                  |   (1,1) |   (1,2) |   (2,1) |   (2,2) |
| DeepSeek-V3                      |     nan |     nan |     nan |     nan |
+----------------------------------+---------+---------+---------+---------+
| Meta-Llama-3.1-8B-Instruct-Turbo |     nan |     nan |     na

### Eval Accuracy

In [11]:
# Model list
model_names = ["Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3"]

# Initialize tracking dictionaries
result_stats = {model: {"correct": 0, "total": 0} for model in model_names}

# Process preference_results
for record in preference_results_llm_council_original_harmful:
    model = record["evaluator"]
    if model in result_stats:
        result_stats[model]["total"] += 1
        if record["forward_comparison"].str.contains("B>A") and record["backward_comparison"].str.contains("A>B"):
            result_stats[model]["correct"] += 1


# Create DataFrame to display results
results_df = pd.DataFrame([
    {
        "Model": model,
        "Correct": stats["correct"],
        "Total": stats["total"],
        "Accuracy (%)": f"{(stats['correct'] / stats['total'] * 100):.2f}%" if stats["total"] > 0 else "N/A"
    }
    for model, stats in result_stats.items()
])

print("\nEvaluator Accuracy Results")
# Display the table
print(tabulate(results_df, headers="keys", tablefmt="grid"))

AttributeError: 'str' object has no attribute 'str'